## Round 2

In [None]:
# Importing basic libraries
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

In [None]:
# Creating the initial dataframe
df = pd.read_csv('./marketing_customer_analysis.csv')
df

1. Show the dataframe shape

In [None]:
df.shape

2. Standardize the header names

In [None]:
# inspect the header names
df.columns

In [None]:
# drop the index column
df=df.drop(['Unnamed: 0'], axis=1) 

In [None]:
# First step towards snake_case headers: lower-case every column name
cols = [header.lower() for header in df.columns]
df.columns = cols

In [None]:
# check if it worked
df.columns

In [None]:
# Second step towards snake_case headers: spaces become underscores
cols = [header.replace(' ','_') for header in df.columns]
df.columns = cols

In [None]:
# check if it worked
df.columns

3. Which columns are numerical?

In [None]:
# 'number' matches every numeric dtype, so integer columns are listed
# regardless of whether they were read as int32 or int64
df.select_dtypes('number')

4. Which columns are categorical?

In [None]:
df.select_dtypes(['object'])
# df.select_dtypes(object)

5. Check and deal with NaN values

In [None]:
# Checking for NaN values
df.isna().sum()

In [None]:
# We drop the rows where we have NaN for responses, because otherwise we can't translate it to
# numericals for our later analysis
# Inserting the mode would tip the balance heavily into one direction
# .copy() gives an independent frame, so the column assignments in the following cells
# do not write into a slice of the unfiltered data
df = df.dropna(subset=['response']).copy()

In [None]:
df.isna().sum()

In [None]:
# For vehicle_type we have a value A, we assume it stands for "automatic transmission" and fill the NAN with M for "manual transmission"
df['vehicle_type'] = df['vehicle_type'].fillna('M')

In [None]:
df.isna().sum()

In [None]:
# Vehicle class and vehicle size are missing in the same rows, so we can't extrapolate anything and add the new categories "unknown_class" and "unknown_size"
df['vehicle_class'] = df['vehicle_class'].fillna('unknown_class')
df['vehicle_size'] = df['vehicle_size'].fillna('unknown_size')
df.isna().sum()

In [None]:
# For 'number_of_open_complaints' we fill with the mode because nearly 80% have 0.0 open complaints
df['number_of_open_complaints'].value_counts()

In [None]:
df['number_of_open_complaints'] = df['number_of_open_complaints'].fillna(0.0)

In [None]:
# For 'months_since_last_claim' we fill with the mean
# (the value_counts() call that stood here was dropped: its result was never displayed or used)
mslc = df['months_since_last_claim'].mean()
df['months_since_last_claim'] = df['months_since_last_claim'].fillna(mslc)

In [None]:
# All NULL values are dealt with
df.isna().sum()

6. Datetime format - Extract the months from the dataset and store in a separate column. Then filter the data to show only the information for the first quarter , ie. January, February and March. Hint: If data from March does not exist, consider only January and February.

In [None]:
# Extract the month into a new column
import time
from datetime import date

In [None]:
df['month'] = pd.to_datetime(df['effective_to_date'], errors='coerce')

In [None]:
# Derive the month straight from the raw date column, so this cell gives the same
# result no matter how often it is run (re-running the previous version parsed the
# already extracted month numbers as timestamps)
df['month'] = pd.to_datetime(df['effective_to_date'], errors='coerce').dt.month

In [None]:
df['month'].unique()

In [None]:
# There is already just data for january and february in the source file

## Bonus put the data cleaning process into a function

In [None]:
def ca_clean(df):
    """Clean a raw marketing-customer-analysis frame and return a new DataFrame.

    The input frame is not modified. The steps are:

    * drop the exported index column 'Unnamed: 0' (skipped if it is absent),
    * convert the headers to snake_case (lower case, spaces -> underscores),
    * drop the rows without a 'response',
    * fill missing 'vehicle_type' with 'M', missing 'vehicle_class' with
      'unknown_class' and missing 'vehicle_size' with 'unknown_size',
    * fill missing 'months_since_last_claim' with the rounded column mean,
    * fill missing 'number_of_open_complaints' with 0,
    * store the month of 'effective_to_date' in a new 'month' column and
      drop 'effective_to_date'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with the original (non snake_case) headers.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy of the data.
    """
    import pandas as pd

    # drop the index column; errors='ignore' lets the function also accept
    # data that was saved without the index column
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    # snakecase
    df.columns = [column.lower().replace(' ', '_') for column in df.columns]
    # dropping NAN in response; the copy makes sure the assignments below
    # work on an independent frame and not on a slice of the input
    df = df.dropna(subset=['response']).copy()
    # converting NAN to 'M' for vehicle type
    df['vehicle_type'] = df['vehicle_type'].fillna('M')
    # filling NAN for vehicle class and vehicle size
    df['vehicle_class'] = df['vehicle_class'].fillna('unknown_class')
    df['vehicle_size'] = df['vehicle_size'].fillna('unknown_size')
    # For 'months_since_last_claim' we fill with the rounded mean
    mslc = df['months_since_last_claim'].mean()
    # the mean is NaN when no value is left; int(round(NaN)) would raise,
    # and there is nothing meaningful to fill with in that case
    if pd.notna(mslc):
        df['months_since_last_claim'] = df['months_since_last_claim'].fillna(int(round(mslc)))
    # Filling number of complaints with 0 as the most common value
    df['number_of_open_complaints'] = df['number_of_open_complaints'].fillna(0)
    # Getting the months from the dates (unparseable dates become NaN)
    df['month'] = pd.to_datetime(df['effective_to_date'], errors='coerce').dt.month
    # Dropping the 'effective_to_date' column
    df = df.drop(columns=['effective_to_date'])

    return df

In [None]:
# Trying out the function:
df2 = pd.read_csv('./marketing_customer_analysis.csv')
df3 = ca_clean(df2)
print(df2.shape,df3.shape)
print(df3.columns)
print(df3.isna().sum())

## Round 3

In [None]:
# 1. Show DataFrame info

In [None]:
df.info()

In [None]:
# 2. Describe the DataFrame

# For categorical data
# (the builtin `object` is used because the `np.object` alias was removed in NumPy 1.24)
df.describe(include=[object]).T

In [None]:
# For numerical data
df.describe()

In [None]:
# For the following analysis we convert the responses to a numerical value, 'Yes' = 1, 'No' = 0
df['response_rate'] = df['response'].apply(lambda x : 1 if x == 'Yes' else 0)
#df['response'].map({'Yes':1,'No':0}) also possible

In [None]:
# Importing needed libraries
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# 3. Show a plot of the total number of responses
sns.countplot(x='response_rate', data=df)
plt.ylabel('Total number of responses')
plt.show()

In [None]:
# 4. Show a plot of the response rate by the sales channel
sns.barplot(data=df, x='sales_channel', y='response_rate', ci = None)
plt.show()

In [None]:
# # 5.Show a plot of the response rate by the total claim amount
# We get too many Bars if we directly create a plot, so we first have to
# get a new structure into the Data, we can sort it by quantiles.
df['quantile_claim_amount'] = pd.qcut(df['total_claim_amount'], 8, labels=False, duplicates = 'drop')
sns.barplot(data=df, x='quantile_claim_amount', y='response_rate', estimator= np.mean, ci=None)
plt.show()

In [None]:
# 6. Show a plot of the response rate by income
df['quantile_income'] = pd.qcut(df['income'], 12, labels=False, duplicates = 'drop')
sns.barplot(data=df, x='quantile_income', y='response_rate', estimator= np.mean, ci=None)
plt.show()
# a lower number of bins shows a quite uniform distribution

## Round 4

1. Check the data types of the columns. Get the numeric data into dataframe called numerical and categorical columns in a dataframe called categoricals. (You can use np.number and np.object to select the numerical data types and categorical data types respectively)

In [None]:
df.dtypes

In [None]:
numerical = df.select_dtypes('number')
# for better visibility we drop some columns we don't need, like 'month', 'quantile_claim_amount', 'quantile_income', 'response_rate'
# we created them for analytical purposes earlier
numerical = numerical.drop(['month', 'quantile_claim_amount','quantile_income','response_rate'], axis=1)
numerical.columns

In [None]:
categoricals = df.select_dtypes('object')
categoricals.columns

Now we will try to check the normality of the numerical variables visually

Use seaborn library to construct distribution plots for the numerical variables

In [None]:
import warnings
warnings.filterwarnings('ignore')

for i in numerical.columns:
    sns.displot(numerical[i], bins = 100)
    plt.show()

In [None]:
for i in numerical.columns:
    plt.hist(numerical[i])
    plt.title(i)
    plt.show()

Do the distributions for different numerical variables look like a normal distribution?

In [None]:
# Few charts look like normal distributions, those that do are:
# Income (apart from the 0 value outliers)
# months_since_policy_inception (though very flat)

Drop one of the two features that show a high correlation between them (greater than 0.9). Write code for both the correlation matrix and for seaborn heatmap. If there is no pair of features that have a high correlation, then do not drop any features

In [None]:
# Correlation matrix
correlations_matrix = numerical.corr()
correlations_matrix

In [None]:
# Heat map
plt.figure(figsize = (16,5))
sns.heatmap(correlations_matrix, annot=True)
plt.show()
# There is nothing to drop according to the task, no correlation is anywhere near 0.9

## Round 5

X-y split

In [None]:
# Setting y to our target
y = df['total_claim_amount']
# Putting everything else to x
X = df.drop(['total_claim_amount'], axis=1)

In [None]:
# Splitting X into numericals and categoricals
X_num = X.select_dtypes('number')
X_cat = X.select_dtypes('object')
print(X_num.columns)
print(X_cat.columns)

Normalize (numerical)

First we do the MinMaxScaling

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Fitting the data in our transformer
transformer = MinMaxScaler().fit(X_num)
X_normalized = transformer.transform(X_num)

In [None]:
# To visualize as a dataframe
pd.DataFrame(X_normalized, columns=X_num.columns)

Now the standard Scaler

In [None]:
# We again fit our data
transformer = StandardScaler().fit(X_num)
x_standardized2 = transformer.transform(X_num)

In [None]:
# Check if it worked
pd.DataFrame(x_standardized2, columns=X_num.columns)

# Round 6

In [None]:
# We drop some columns of the categoricals because they don't deliver any value
# 'customer' are all unique values that dont generate a pattern
# 'response' because we have response rate in the numericals
# 'effective_to_date' because we use the months
X_cat = X_cat.drop(['customer','response','effective_to_date'], axis = 1)

In [None]:
# For X_num we drop the quantiles we calculated, because that information is already represented in the original data
X_num = X_num.drop(['quantile_income','quantile_claim_amount'], axis = 1)

In [None]:
# Normalizing again with the dropped columns
transformer = MinMaxScaler().fit(X_num)
X_normalized = transformer.transform(X_num)

## OneHot Encoding

In [None]:
# One Hot/Label Encoding categoricals
from sklearn.preprocessing import OneHotEncoder

In [None]:
# OneHotEncoding of the categorical values
encoder = OneHotEncoder(drop='first').fit(X_cat)
print(encoder.categories_)

In [None]:
# OneHotEncoding of the categorical values
encoded = encoder.transform(X_cat).toarray()
encoded.shape

In [None]:
# Concatenating the normalized and encoded data
# Putting the encoded data into a dataframe to make it possible to concatenate
onehot_encoded = pd.DataFrame(encoded)
X_normalized = pd.DataFrame(X_normalized)

In [None]:
X = pd.concat([X_normalized, onehot_encoded], axis=1)
# See if it worked
X

## Linear Regression

In [None]:
# Train-test split

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Checking the results
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Training the Linear Regression model with our data
from sklearn import linear_model
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

## Model Validation

First we validate the performance on the training data

In [None]:
# Checking for the R2-Score
from sklearn.metrics import r2_score, mean_absolute_error
predictions = lm.predict(X_train)
r2_score(y_train, predictions)
# We achieve an R2-Score of 0.86 for our training data

In [None]:
# Checking for Mean Square Error
from sklearn.metrics import mean_squared_error
mse=mean_squared_error(y_train,predictions)
mse

In [None]:
# The Root Mean Square Error is just the root of above
rmse = np.sqrt(mse)
rmse

In [None]:
# Lastly the Mean Absolute Error
mae = mean_absolute_error(y_train, predictions)
print(mae)

Now we test for performance on the testing data

In [None]:
# Checking for the R2-Score
predictions_test = lm.predict(X_test)
r2_score(y_test, predictions_test)
# R2S went from 0.86 to 0.82

In [None]:
# Checking for Mean Square Error
from sklearn.metrics import mean_squared_error
mse_test=mean_squared_error(y_test, predictions_test)
mse_test
# MSE went from about 12.000 to about 15.500

In [None]:
# The Root Mean Square Error is just the root of above
rmse_test = np.sqrt(mse_test)
rmse_test
# RMSE went from about 110 to about 125

In [None]:
# Lastly the Mean Absolute Error
mae_test = mean_absolute_error(y_test, predictions_test)
print(mae_test)
# MAE went from about 65 to about 68

# Round 7

To make things easier looking forward I summarize the modeling and validation process into a function

In [None]:
# The function takes (a dataframe, a target column name, a float for the test_size between 0 and 1)
# It does:
# X-y split
# Num-Cat split for X
# Normalization using MinMax
# OneHotEncoding for Categoricals
# Concatenation
# Creating and training a linear regression model
# Model Validation
# It outputs: (linear_model, a dataframe containing the validation metrics for the training and test data)

def linear_automodel_MM(df, target, ts):
    """Train and validate a linear regression on MinMax-scaled data.

    Steps: X-y split, numerical/categorical split, MinMax scaling of the
    numerical columns, one-hot encoding (first level dropped) of the object
    columns, concatenation, train-test split with random_state=42, fitting a
    LinearRegression and computing R2, MSE, RMSE and MAE for both partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the target column and all features.
    target : str
        Name of the column to predict.
    ts : float
        Passed to train_test_split as test_size.

    Returns
    -------
    lm : sklearn.linear_model.LinearRegression
        The fitted model.
    df_val : pandas.DataFrame
        One row for 'Train' and one for 'Test' with the columns
        'ValType', 'R2-Score', 'MSE', 'RMSE' and 'MAE'.
    """
    from sklearn import linear_model
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

    # X-y split
    y = df[target]
    X = df.drop([target], axis=1)
    # Num-Cat split
    X_num = X.select_dtypes('number')
    X_cat = X.select_dtypes('object')
    # NOTE(review): scaler and encoder are fitted on all rows, i.e. before the
    # train-test split below.
    # MinMaxScaling
    X_normalized = pd.DataFrame(MinMaxScaler().fit(X_num).transform(X_num))
    # OneHotEncoding
    onehot_encoded = pd.DataFrame(OneHotEncoder(drop='first').fit(X_cat).transform(X_cat).toarray())
    # Both frames carry a fresh 0..n-1 index, so they line up row by row
    X = pd.concat([X_normalized, onehot_encoded], axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ts, random_state=42)
    # Training the Linear Regression model with our data
    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)

    def _scores(y_true, y_pred):
        # All four metrics for one partition; RMSE is derived from the MSE of
        # the same partition (the test RMSE used to be taken from the train MSE)
        mse = mean_squared_error(y_true, y_pred)
        return {
            'R2-Score': r2_score(y_true, y_pred),
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'MAE': mean_absolute_error(y_true, y_pred),
        }

    # Validating Model
    train_scores = _scores(y_train, lm.predict(X_train))
    test_scores = _scores(y_test, lm.predict(X_test))

    # Creating the output dataframe
    df_val = pd.DataFrame({
        'ValType': ['Train', 'Test'],
        **{metric: [train_scores[metric], test_scores[metric]] for metric in train_scores},
    })

    # returning the model and the validation
    return lm, df_val

In [None]:
# Same as above but with StandardScaling
def linear_automodel_SS(df, target, ts):
    """Train and validate a linear regression on standard-scaled data.

    Steps: X-y split, numerical/categorical split, StandardScaler on the
    numerical columns, one-hot encoding (first level dropped) of the object
    columns, concatenation, train-test split with random_state=42, fitting a
    LinearRegression and computing R2, MSE, RMSE and MAE for both partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the target column and all features.
    target : str
        Name of the column to predict.
    ts : float
        Passed to train_test_split as test_size.

    Returns
    -------
    lm : sklearn.linear_model.LinearRegression
        The fitted model.
    df_val : pandas.DataFrame
        One row for 'Train' and one for 'Test' with the columns
        'ValType', 'R2-Score', 'MSE', 'RMSE' and 'MAE'.
    """
    from sklearn import linear_model
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split
    # StandardScaler is imported here so the function does not depend on an
    # import made in some other notebook cell
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    # X-y split
    y = df[target]
    X = df.drop([target], axis=1)
    # Num-Cat split
    X_num = X.select_dtypes('number')
    X_cat = X.select_dtypes('object')
    # NOTE(review): scaler and encoder are fitted on all rows, i.e. before the
    # train-test split below.
    # StandardScaling
    X_normalized = pd.DataFrame(StandardScaler().fit(X_num).transform(X_num))
    # OneHotEncoding
    onehot_encoded = pd.DataFrame(OneHotEncoder(drop='first').fit(X_cat).transform(X_cat).toarray())
    # Both frames carry a fresh 0..n-1 index, so they line up row by row
    X = pd.concat([X_normalized, onehot_encoded], axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ts, random_state=42)
    # Training the Linear Regression model with our data
    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)

    def _scores(y_true, y_pred):
        # All four metrics for one partition; RMSE is derived from the MSE of
        # the same partition (the test RMSE used to be taken from the train MSE)
        mse = mean_squared_error(y_true, y_pred)
        return {
            'R2-Score': r2_score(y_true, y_pred),
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'MAE': mean_absolute_error(y_true, y_pred),
        }

    # Validating Model
    train_scores = _scores(y_train, lm.predict(X_train))
    test_scores = _scores(y_test, lm.predict(X_test))

    # Creating the output dataframe
    df_val = pd.DataFrame({
        'ValType': ['Train', 'Test'],
        **{metric: [train_scores[metric], test_scores[metric]] for metric in train_scores},
    })

    # returning the model and the validation
    return lm, df_val

# Actual start of round 7

In [None]:
# recreating our baseline from round 6
df2 = df.drop(['customer','response','effective_to_date'], axis = 1)
df2 = df2.drop(['quantile_income','quantile_claim_amount'], axis = 1)
lm, validation = linear_automodel_MM(df2,'total_claim_amount',0.2)
validation

Trying out the same but StandardScaling instead of MinMax

In [None]:
lm, validation = linear_automodel_SS(df2,'total_claim_amount',0.2)
validation
# Our results are a little worse, but only insignificantly so

### We try our model with different ratios of train test split

In [None]:
lm, validation = linear_automodel_MM(df2,'total_claim_amount',0.1)
validation

In [None]:
lm, validation = linear_automodel_MM(df2,'total_claim_amount',0.3)
validation

In [None]:
lm, validation = linear_automodel_MM(df2,'total_claim_amount',0.5)
validation

In [None]:
lm, validation = linear_automodel_MM(df2,'total_claim_amount',0.8)
validation

For lower training sizes we see improving performance on the training data.
For the test data the improvement stops roughly at a 50/50 split, for our sample size this seems to be the best we can get for now.

### We remove some outliers in the data for income and total_claim_amount

In [None]:
# We see a large number of 0 for income, which doesn't make sense, so we will remove them
# Since income has a noticeable correlation "-0.35" with our target, I decided against inserting a value into this column and
# opted for the nuclear option

In [None]:
# The mask is built from df2 itself instead of relying on df sharing its index
dfo = df2[df2['income'] > 0].copy()
dfo.shape
# Check if it worked

Testing our model with the new dataframe

In [None]:
lm, validation = linear_automodel_SS(dfo,'total_claim_amount',0.3)
validation
# Conclusion: Our validation metrics significantly improved, we will continue with the dfo dataframe

In [None]:
# Now we remove the outliers for total_claim_amount which start at above 2500
# The mask is built from dfo itself: a mask taken from df still contains the rows
# that were already filtered out of dfo and would have to be reindexed by pandas
dfo = dfo[dfo['total_claim_amount'] < 2500].copy()
dfo.shape

In [None]:
lm, validation = linear_automodel_SS(dfo,'total_claim_amount',0.3)
validation
# We only lost 3 rows but noticeably improved the model

### Use the transformation on numerical columns which align it more towards a normal distribution

Log10 transformation

In [None]:
# We will convert the numerical data with a log10 transformation and see the impact
# We use this function
def log_transfom_clean(x):
    """Return log10(x), or 0 when that result is not finite (inf, -inf or NaN)."""
    logged = np.log10(x)
    return logged if np.isfinite(logged) else 0

In [None]:
dfol = dfo.copy()

In [None]:
# Every numerical column of dfol gets the log10 transformation, value by value
for numeric_col in dfol.select_dtypes(['number']).columns:
    dfol[numeric_col] = dfol[numeric_col].apply(log_transfom_clean)

In [None]:
# Inspecting the result
lm, validation = linear_automodel_SS(dfol,'total_claim_amount',0.2)
validation
# Our R2 deteriorated massively.
# All other metrics decreased by several magnitudes. This is obviously partially due to the smaller numbers after the transformation.
# We will consider the dfo and dfol dataframes

 ## Removing insignificant variables

We try out what effect the removal of different columns has on our model:

In [None]:
# We drop each column once and check the results
dfod = dfo.copy()
for c in dfod.columns:
    if c != 'total_claim_amount':
        dtest = dfod.copy()
        dtest = dtest.drop([c], axis=1)
        lm, validation = linear_automodel_SS(dtest,'total_claim_amount',0.3)
        print('Results for dropping ' + c)
        display(validation)    

In [None]:
# The labels with the highest impact are:
# 'location code'
# 'monthly_premium_auto'

# We try our model just with these columns:
dfreduced = dfo[['location_code','monthly_premium_auto','total_claim_amount']].copy()
lm, validationf = linear_automodel_SS(dfreduced,'total_claim_amount',0.3)
validationf

In [None]:
# And with our log10 adjusted data:
dfreduced = dfol[['location_code','monthly_premium_auto','total_claim_amount']].copy()
lm, validationf2 = linear_automodel_SS(dfreduced,'total_claim_amount',0.3)
validationf2

In [None]:
# With just two labels left there is no need to search for further multicollinearity
# Trying out the 50/50 train test split from earlier doesn't improve our results anymore.

In [None]:
# The best I can achieve are models that:
# Take only the labels 'location_code' and 'monthly_premium_auto'
# They require very little information and return passable results!
# When using new data we can see if the lowered MSE RMSE and MAE from the log10 transformation outweigh the R2-Score

## Bonus: Build a function for round 2 and round 7 processing the data

In [None]:
# We use our function from round 2

In [None]:
def dataproc(df):
    """Run the full preprocessing: cleaning, outlier removal, column selection, log10.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data in the form expected by ca_clean.

    Returns
    -------
    pandas.DataFrame
        Only the columns 'location_code', 'monthly_premium_auto' and
        'total_claim_amount', restricted to rows with income > 0 and
        total_claim_amount < 2500. Every numeric column of that selection is
        log10-transformed, with non-finite results replaced by 0.
    """
    # Cleaning the data
    df = ca_clean(df)
    # Removing outliers
    df = df[(df['income'] > 0) & (df['total_claim_amount'] < 2500)]
    # Removing unnecessary columns
    ## Keeping for eventual later use: df = df.drop(['customer','response','effective_to_date'], axis = 1)
    # .copy() so the assignments below work on an independent frame and not
    # on a slice of the cleaned data
    df = df[['location_code','monthly_premium_auto','total_claim_amount']].copy()

    # log10 transformation
    def log_transfom_clean_intern(x):
        x = np.log10(x)
        if np.isfinite(x):
            return x
        else:
            return 0

    # The numeric columns are determined once, before the loop
    for c in df.select_dtypes(['number']).columns:
        df[c] = df[c].apply(log_transfom_clean_intern)
    return df

In [None]:
# Trying out our functions:
testframe = pd.read_csv('./marketing_customer_analysis.csv')
lmtest, validationtest = linear_automodel_SS(dataproc(testframe),'total_claim_amount',0.3)
validationtest

# Looks identical to our original results

# Final round

### Further data exploration

In [None]:
sns.pairplot(numerical)
# By visual inspection we see the same correlations as when building the model

In [None]:
# Checking for effects in the categorical data
for i in ['state', 'coverage', 'education', 'employmentstatus', 'gender', 'location_code',
       'marital_status', 'policy_type', 'policy', 'renew_offer_type',
       'sales_channel', 'vehicle_class', 'vehicle_size', 'vehicle_type']:
    sns.barplot(data=df, x= i, y='total_claim_amount', estimator= np.mean, ci=None)
    plt.show()

# Results

### Data quality

There seems to be a problem with the data collection for the transmission type. All data for manual transmission contained NULL values and had to be replaced.
Also a lot of incomes are registered as 0, which does not make any sense and implies wrong entries.

### Very few correlations with other numerical data

We found that in our dataset our target had very few correlations with other variables in the numerical Data, the only relevant factors seem to be the location code and the monthly premium.

When building our model, dropping any of the categorical columns did not result in diminishing returns.

### Model

With only two categories of our dataset we were able to produce a satisfactory model with the following metrics:

In [None]:
display('With normalized data:', validationf, 'With log10 transformed and normalized data:',validationf2)

### Other factors

We see that the total claim amount is higher for higher coverages, which is not surprising.

In [None]:
sns.barplot(data=df, x= 'coverage', y='total_claim_amount', estimator= np.mean, ci=None)

We see diminishing claim amounts for higher education individuals.

In [None]:
sns.barplot(data=df, x= 'education', y='total_claim_amount', estimator= np.mean, ci=None)

Employed customers have lower claim amounts than unemployed ones.

In [None]:
sns.barplot(data=df, x= 'employmentstatus', y='total_claim_amount', estimator= np.mean, ci=None)

Claim amounts for male customers are slightly higher than for females.

In [None]:
sns.barplot(data=df, x= 'gender', y='total_claim_amount', estimator= np.mean, ci=None)

We see the highest average claim for suburban customers. Rural customers have the smallest claims.
The differences here are huge, with Suburban averages being more than 4 times the Rural averages.

In [None]:
sns.barplot(data=df, x= 'location_code', y='total_claim_amount', estimator= np.mean, ci=None)
print(df['location_code'].value_counts())

We see higher claim amounts for single customers in relation to married or divorced ones.

In [None]:
sns.barplot(data=df, x= 'marital_status', y='total_claim_amount', estimator= np.mean, ci=None)

We see spikes in the claim amounts for luxury cars and luxury SUVs

In [None]:
ax = sns.barplot(data=df, x= 'vehicle_class', y='total_claim_amount', estimator= np.mean, ci=None)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()

## Conclusion

More than 50% of the customers reside in suburban areas, which tend to have vastly higher claim amounts.

Actions in marketing to target rural population would result in a customer base with significantly lower claims.
This could considerably improve financial performance.