In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [2]:
df= pd.read_csv('marketing_customer_analysis.csv')

In [3]:
df

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.431650,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,LA72316,California,23405.987980,No,Basic,Bachelor,2/10/11,Employed,M,71941,...,89,0,2,Personal Auto,Personal L1,Offer2,Web,198.234764,Four-Door Car,Medsize
9130,PK87824,California,3096.511217,Yes,Extended,College,2/12/11,Employed,F,21604,...,28,0,1,Corporate Auto,Corporate L3,Offer1,Branch,379.200000,Four-Door Car,Medsize
9131,TD14365,California,8163.890428,No,Extended,Bachelor,2/6/11,Unemployed,M,0,...,37,3,2,Corporate Auto,Corporate L2,Offer1,Branch,790.784983,Four-Door Car,Medsize
9132,UP19263,California,7524.442436,No,Extended,College,2/3/11,Employed,M,21941,...,3,0,3,Personal Auto,Personal L2,Offer3,Branch,691.200000,Four-Door Car,Large


In [4]:
df.columns= df.columns.str.lower()
df.columns= df.columns.str.replace(' ', '_', regex= False)

In [5]:
#- Select the columns which are correlated with `total_claim_amount` and don't suffer from multicollinearity (see the previous lab)
#- Remove outliers
#- X-y split. (define which column you want to predict, and which ones you will use to make the prediction)
#- Use the [Train-test split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html#sklearn.model_selection.train_test_split) to create the Train, and Test sets (make sure to set the `random_state` option to any integer number of your choice).
#- Use the [pd.DataFrame()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) function to create new Pandas DataFrames from the X_train, and X_test Numpy arrays obtained in the previous step (make sure to use the `columns=` option to set the columns names to `X.columns`).
#- Split the `X_train` Pandas DataFrame into two: `numerical`, and `categorical` using `df.select_dtypes()`.

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
selected_columns= ['customer_lifetime_value', 'income', 'monthly_premium_auto', 'total_claim_amount']
df_selected= df[selected_columns]

In [8]:
Q1= df_selected.quantile(0.25)
Q3= df_selected.quantile(0.75)
IQR= Q3 - Q1

df_no_outliers = df_selected[~((df_selected < (Q1 - 1.5 * IQR)) | (df_selected > (Q3 + 1.5 * IQR))).any(axis=1)]
print("Before removing outliers:", df_selected.shape)
print("After removing outliers:", df_no_outliers.shape)

Before removing outliers: (9134, 4)
After removing outliers: (7847, 4)


In [9]:
X = df_no_outliers.drop('total_claim_amount', axis=1)
y = df_no_outliers['total_claim_amount']

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.2, random_state= 42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (6277, 3)
X_test shape: (1570, 3)
y_train shape: (6277,)
y_test shape: (1570,)


In [10]:
#- If you need to transform any column, Train your transformers and/or scalers all the `numerical` columns using the `.fit()` **only in the Train** set (only one transformer/scaler for all the columns, check [here](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html#sklearn.preprocessing.PowerTransformer), and [here](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler) using the `.transform()` 
#- Save all your transformers/scalers right after the `.fit()` using `pickle` using the code shown below:
 # ```Python
  #import os
  
  #path = "transformers/"
  # Check whether the specified path exists or not
  #isExist = os.path.exists(path)
  #if not isExist:
   #   # Create a new directory because it does not exist
    #  os.makedirs(path)
    # print("The new directory is created!")
 
 # filename = "filename.pkl" # Use a descriptive name for your scaler/transformer but keep the ".pkl" file extension
  #with open(path+filename, "wb") as file:
   # pickle.dump(variable, file) # Replace "variable" with the name of the variable that contains your transformer
  #```
#- If you used a transformer/scaler in the previous step, create new Pandas DataFrames from the Numpy arrays generated by the `.transform()` using the `pd.DataFrame()` function as you did earlier with the Numpy arrays generated by the `train_test_split()` function.
#- Transform the `categorical` columns into numbers using a:
 # - [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder) for categorical **nominal** columns. (again **only use the `.fit()` in the Train set**, but the .`transform()` in the Train and the Test sets)
  #- Remember to save all your transformers/scalers right after the `.fit()` using `pickle` using the code shown below:
   # ```Python
    #path = "encoders/"
    ## Check whether the specified path exists or not
    #isExist = os.path.exists(path)
    #if not isExist:
      # Create a new directory because it does not exist
     # os.makedirs(path)
      #print("The new directory is created!")
 
    #filename = "filename.pkl" # use a descriptive name for your encoder but keep the ".pkl" file extension
    #with open(path+filename, "wb") as file:
     #  pickle.dump(variable, file) # Replace "variable" with the name of the variable that contains your transformer
    #```
  #- Use `.replace()` to cast into numbers any categorical **ordinal** column replacing each label with a number that: respects the order of the labels and the relative "distance"
#- Concat `numerical_transformer` and `categorical_transfomed` DataFrames using `pd.concat()`.
#- Apply another MinMaxScaler to the concatenated DataFrame.
#- Remember to save all your MinMaxScaler right after the `.fit()` using `pickle` using the code shown below:
 #   ```Python
  #  path = "scalers/"
   # # Check whether the specified path exists or not
   # isExist = os.path.exists(path)
   # if not isExist:
      # Create a new directory because it does not exist
    #  os.makedirs(path)
     # print("The new directory is created!")
 
   # filename = "filename.pkl" # use a descriptive name for your encoder but keep the ".pkl" file extension
   # with open(path+filename, "wb") as file:
    #   pickle.dump(variable, file) # Replace "variable" with the name of the variable that contains your transformer
    #```
#- Apply linear regression to the Pandas DataFrame obtained in the previous step using [sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression)
#- Remember to save your linear model right after the `.fit()` using `pickle` using the code shown below:
 # ```Python
  #    path = "models/"
   #   # Check whether the specified path exists or not
    #  isExist = os.path.exists(path)
     # if not isExist:
      #  # Create a new directory because it does not exist
       # os.makedirs(path)
       # print("The new directory is created!")
 
       #filename = "filename.pkl" # use a descriptive name for your encoder but keep the ".pkl" file extension
       #with open(path+filename, "wb") as file:
        #  pickle.dump(variable, file) # Replace "variable" with the name of the variable that contains your transformer
   # ```

In [11]:
from sklearn.preprocessing import PowerTransformer, MinMaxScaler
import pickle
import os

X_train_df = pd.DataFrame(X_train, columns=X.columns)
X_test_df = pd.DataFrame(X_test, columns=X.columns)

numerical_cols = X_train_df.select_dtypes(include=['int', 'float']).columns
numerical = X_train_df[numerical_cols]

power_transformer = PowerTransformer()
numerical_transformed = power_transformer.fit_transform(numerical)

path = "transformers/"
os.makedirs(path, exist_ok=True)
with open(os.path.join(path, "power_transformer.pkl"), "wb") as file:
    pickle.dump(power_transformer, file)

minmax_scaler = MinMaxScaler()
numerical_transformed = minmax_scaler.fit_transform(numerical_transformed)

with open(os.path.join(path, "minmax_scaler.pkl"), "wb") as file:
    pickle.dump(minmax_scaler, file)

In [12]:
from sklearn.preprocessing import OneHotEncoder

categorical_cols = X_train_df.select_dtypes(include=['object']).columns
categorical = X_train_df[categorical_cols]

onehot_encoder = OneHotEncoder()
categorical_transformed = onehot_encoder.fit_transform(categorical).toarray()

path = "encoders/"
os.makedirs(path, exist_ok=True)
with open(os.path.join(path, "onehot_encoder.pkl"), "wb") as file:
    pickle.dump(onehot_encoder, file)

In [13]:
numerical_transformed_df = pd.DataFrame(numerical_transformed, columns=numerical.columns)
categorical_transformed_df = pd.DataFrame(categorical_transformed, columns=onehot_encoder.get_feature_names_out())

X_transformed = pd.concat([numerical_transformed_df, categorical_transformed_df], axis=1)

In [14]:
model = LinearRegression()
model.fit(X_transformed, y_train)

models_dir = "models"
os.makedirs(models_dir, exist_ok=True)

model_path = os.path.join(models_dir, "linear_model.pkl")
with open(model_path, "wb") as file:
    pickle.dump(model, file)

In [15]:
### Model Validation

#- Compute the following metrics for your Train and Test sets:
 # - [R2](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html#sklearn.metrics.r2_score).
 # - [MSE](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn-metrics-mean-squared-error).
 #- [RMSE](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn-metrics-mean-squared-error)
 # - [MAE](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html#sklearn-metrics-mean-absolute-error).

#- Create a Pandas DataFrame to summarize the error metrics for the Train and Test sets.

In [16]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# Compute metrics
train_predictions = model.predict(X_transformed)
test_predictions = model.predict(X_test)  # Apply similar transformations to X_test

train_r2 = r2_score(y_train, train_predictions)
train_mse = mean_squared_error(y_train, train_predictions)
train_rmse = np.sqrt(train_mse)
train_mae = mean_absolute_error(y_train, train_predictions)

test_r2 = r2_score(y_test, test_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, test_predictions)

# Summarizing the metrics
metrics_summary = pd.DataFrame({
    'Metric': ['R2', 'MSE', 'RMSE', 'MAE'],
    'Train': [train_r2, train_mse, train_rmse, train_mae],
    'Test': [test_r2, test_mse, test_rmse, test_mae]
})

print(metrics_summary)


  Metric         Train          Test
0     R2      0.344393 -2.922812e+09
1    MSE  27514.329417  1.214950e+14
2   RMSE    165.874439  1.102248e+07
3    MAE    129.617465  8.636716e+06
