# Loading Libraries

In [3]:
import pandas as pd
import numpy as np
import warnings
# Suppress all Python warnings for the remainder of the kernel session.
warnings.filterwarnings("ignore")

# Load Data

In [4]:
# Read the customer-analysis CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="Data_Marketing_Customer_Analysis_Round3.csv")

# Defining X, y

In [5]:
# Remove the date column; `columns=` already implies the column axis.
df = df.drop(columns="effective_to_date")

In [6]:
# Keep only the numeric columns; every non-numeric column is discarded here.
df = df.select_dtypes(include="number")

In [7]:
# Feature matrix: every remaining column except the target.
X = df.drop(columns="total_claim_amount")

In [8]:
# Target vector.
y = df["total_claim_amount"]

In [9]:
# Show the feature matrix as the cell's rich output.
X

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,4809,48029,61,7,52,0,9
1,2228,92260,64,3,26,0,1
2,14947,22139,100,34,31,0,2
3,22332,49078,97,10,3,0,2
4,9025,23675,117,33,31,0,7
...,...,...,...,...,...,...,...
10684,15563,61541,253,12,40,0,7
10685,5259,61146,65,7,68,0,6
10686,23893,39837,201,11,63,0,2
10687,11971,64195,158,0,27,4,6


# Data Splitting

#### Train-test split of X and y (y is the target variable, the total claim amount)

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Re-wrap both feature partitions as DataFrames carrying X's column labels.
X_train = pd.DataFrame(data=X_train, columns=X.columns)
X_test = pd.DataFrame(data=X_test, columns=X.columns)

In [11]:
# Summary statistics of the training features (count, mean, std, min, quartiles, max).
X_train.describe()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
count,8551.0,8551.0,8551.0,8551.0,8551.0,8551.0,8551.0
mean,7994.902701,51817.509063,93.295287,15.13554,48.19296,0.375395,2.983511
std,6848.846659,24717.379264,34.575537,10.13316,27.849503,0.899706,2.398456
min,1898.0,10074.0,61.0,0.0,0.0,0.0,1.0
25%,4020.5,29435.0,68.0,6.0,25.0,0.0,1.0
50%,5764.0,50446.0,83.0,14.0,48.0,0.0,2.0
75%,8964.0,72194.5,109.0,23.0,71.0,0.0,4.0
max,74228.0,99981.0,298.0,35.0,99.0,5.0,9.0


# Variance threshold method

#### --> is a univariate method

In [12]:
from sklearn.feature_selection import VarianceThreshold  # operates on numerical features only

# Restrict both partitions to their numeric columns before measuring variance.
X_train = X_train.select_dtypes(include=np.number)
X_test = X_test.select_dtypes(include=np.number)

print("Initial number of numerical columns: ", X_train.shape)
print()

# Features whose training-set variance is lower than the threshold are removed
# (the default threshold would be 0).
selector = VarianceThreshold(threshold=100).fit(X_train)

# Integer positions of the surviving columns, then their labels.
kept_features_indexes = selector.get_support(indices=True)
kept_features = X_train.columns[kept_features_indexes].tolist()

# Re-wrap the transformed data as DataFrames labelled with the kept column names.
X_train = pd.DataFrame(selector.transform(X_train), columns=kept_features)
X_test = pd.DataFrame(selector.transform(X_test), columns=kept_features)

print("Final number of numerical columns: ", X_train.shape)
print()
X_train

Initial number of numerical columns:  (8551, 7)

Final number of numerical columns:  (8551, 5)



Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception
0,21423,22379,65,9,31
1,8391,40211,106,5,98
2,3969,49544,101,3,29
3,14914,45963,63,3,73
4,18060,57882,115,1,61
...,...,...,...,...,...
8546,7610,98701,94,22,66
8547,35186,86134,98,17,78
8548,4241,19834,64,26,8
8549,12941,77060,106,23,90


In [13]:
# number_of_open_complaints and number_of_policies were dropped: their training-set variance is below the threshold of 100

# Correlation matrix

#### --> is a univariate method

In [14]:
import seaborn as sns
import matplotlib.pyplot as plt

# Absolute pairwise correlations between the columns of df.
c = df.corr().abs()

# Optional visual check: sns.heatmap(c, annot=True) on a (14, 14) figure.

# Absolute correlation of every column with the target, strongest first.
c_last = c["total_claim_amount"].sort_values(ascending=False)

# Keep only the columns whose absolute correlation with the target exceeds this cutoff.
c_thr = .3
above_thr = c_last[c_last > c_thr].index.tolist()

# The first entry (the target itself, which sorts to the top) is moved to the end.
cols_to_keep = above_thr[1:] + [above_thr[0]]
print(cols_to_keep)

df[cols_to_keep]

['monthly_premium_auto', 'total_claim_amount']


Unnamed: 0,monthly_premium_auto,total_claim_amount
0,61,292
1,64,744
2,100,480
3,97,484
4,117,707
...,...,...
10684,253,1214
10685,65,273
10686,201,381
10687,158,618


# Recursive feature elimination

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE  # recursive feature elimination technique

# Start again from the full feature set: earlier cells overwrote X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train = X_train.select_dtypes(include=np.number)
X_test = X_test.select_dtypes(include=np.number)

# Per-column count of missing values in the training split.
nulls = pd.DataFrame(X_train.isna().sum()).reset_index()
nulls.columns = ['Column', 'nas']

# Columns with at least one missing training value. Dropping them outright is
# drastic and only meant for quick filtering (don't do this in production).
# This definition was previously commented out, so the drops below raised
# NameError on a fresh kernel.
cols_to_drop = nulls.loc[nulls['nas'] > 0, 'Column'].tolist()

# The choice is made on the training split and applied identically to both splits.
X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)

lm = LinearRegression()

# step is how many features are eliminated per iteration; stop when 5 remain.
# NOTE(review): the features are not scaled before RFE; if the ranking relies on
# coefficient magnitudes it is scale-dependent — confirm whether scaling is wanted.
selector = RFE(lm, n_features_to_select=5, step=1, verbose=1)
selector.fit(X_train, y_train)

# get_support(indices=True) gives the integer positions of the non-removed features.
kept_features = X_train.columns[selector.get_support(indices=True)].tolist()

# Re-wrap the transformed data as DataFrames labelled with the selected column names.
X_train = pd.DataFrame(selector.transform(X_train), columns=kept_features)
X_test = pd.DataFrame(selector.transform(X_test), columns=kept_features)

print("Final selected features: ")
display(X_train)

Fitting estimator with 7 features.
Fitting estimator with 6 features.
Final selected features: 


Unnamed: 0,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,65,9,31,0,2
1,106,5,98,2,6
2,101,3,29,0,1
3,63,3,73,2,2
4,115,1,61,0,2
...,...,...,...,...,...
8546,94,22,66,0,3
8547,98,17,78,0,2
8548,64,26,8,4,8
8549,106,23,90,0,2


# Embedded methods

In [26]:
# Fresh 70/30 split of the full feature set, using the same seed as the earlier splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Numeric columns only.
X_train = X_train.select_dtypes(include="number")
X_test = X_test.select_dtypes(include="number")

In [27]:
import numpy as np
from sklearn.impute import SimpleImputer

# Mean-impute missing values. The imputer is fitted on the training split only,
# and the test split is filled with those same training means; re-fitting on
# X_test would leak test-set statistics into the evaluation.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train = imp_mean.fit_transform(X_train)
X_test = imp_mean.transform(X_test)

### OLS

In [28]:
# Ordinary least squares baseline: fit on the training split, score both splits.
model = LinearRegression().fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"{type(model).__name__}: Train -> {train_score}, Test -> {test_score}")

LinearRegression: Train -> 0.4086926440650056, Test -> 0.41140062170554115


In [29]:
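# Test R² (~0.411) is slightly higher than train R² (~0.409): no sign of overfitting, although the model explains only about 41% of the variance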

### Lasso

In [30]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression

# Lasso regression with alpha=0.05: fit on the training split, score both splits.
model = Lasso(alpha=0.05).fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"{type(model).__name__}: Train -> {train_score}, Test -> {test_score}")

Lasso: Train -> 0.4086926032425867, Test -> 0.41141620918570554


### Ridge

In [31]:
# Ridge regression with alpha=10000: fit on the training split, score both splits.
model = Ridge(alpha=10000).fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"{type(model).__name__}: Train -> {train_score}, Test -> {test_score}")

Ridge: Train -> 0.40867692596088767, Test -> 0.41168523050262207


### ElasticNet

In [32]:
# ElasticNet with alpha=0.1: fit on the training split, score both splits.
model = ElasticNet(alpha=0.1).fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"{type(model).__name__}: Train -> {train_score}, Test -> {test_score}")

ElasticNet: Train -> 0.40869238511331907, Test -> 0.4114386909519817
