# Lab | Customer Analysis Round 5

For this lab, we still keep using the `marketing_customer_analysis.csv` file that you can find in the `files_for_lab` folder.


### 1. Get the data

We are using the `marketing_customer_analysis.csv` file.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import norm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.formula.api import ols
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the raw customer data; the bare name on the last line makes the
# notebook display the resulting DataFrame.
csv_path = 'files_for_lab/csv_files/marketing_customer_analysis.csv'
data = pd.read_csv(csv_path)
data

### 2. Dealing with the data

Already done in round 2.

In [None]:
# Show the frame's dimensions, then the per-column dtypes and non-null counts.
# Only the last expression of a notebook cell is echoed, so the shape has to
# be printed explicitly or it is silently discarded.
print(data.shape)
data.info()

In [None]:
def clean_dfheaders(df):
    """Standardise the column headers of *df* in place and return *df*.

    'Customer' becomes 'id' and 'EmploymentStatus' becomes
    'employment_status'; afterwards every header is lower-cased, stripped of
    surrounding whitespace and has its spaces replaced by underscores.
    The passed-in DataFrame itself is modified.
    """
    special_names = {'Customer': 'id', 'EmploymentStatus': 'employment_status'}
    df.rename(columns=special_names, inplace=True)

    headers = df.columns.str.lower()
    headers = headers.str.strip()
    df.columns = headers.str.replace(' ', '_')
    return df

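# Using the two operations together only works when removing the 'df =' in front of the first statement. Why?
# The first operation doesn't work at all inside a function without the inplace parameter; outside of a function it does work. Why?
# Answer: rename() returns a new DataFrame by default and returns None when inplace=True, so 'df = df.rename(..., inplace=True)'
# sets df to None. Without inplace, 'df = df.rename(...)' inside a function only rebinds the local name, so the caller's
# DataFrame stays unchanged unless the returned frame is assigned back (e.g. 'data = clean_dfheaders(data)').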

# Normalise the headers of `data` in place; the function returns the same
# object, which the notebook echoes as the cell output.
clean_dfheaders(data)

### 3. Explore the data

Some datasets have values that are missing, invalid, or otherwise difficult for an algorithm to process. If data is missing, the algorithm can’t use it. If data is invalid, the algorithm produces less accurate or even misleading outcomes. Some datasets are relatively clean but need to be shaped (e.g., aggregated or pivoted) and many datasets are just lacking useful business context (e.g., poorly defined ID values), hence the need for feature enrichment. Good data preparation produces clean and well-curated data which leads to more practical, accurate model outcomes.


   **3.1.- Determine the problems**
    
   **3.2.- Data cleaning** Cleaning the data is very important because the model learns only from that data: if we feed inconsistent or inappropriate data to the model, it will return garbage. It is therefore necessary to make sure that the data does not contain any hidden problems.
     
       - Feature and Target Variables
       - Data Types
       - Missing data, check null values
       - Outliers
       - Duplicate data
        
   **3.3.- Feature selection** (Which variables are important to answer our questions?)
    
   **3.4.- Data transformation**

In [None]:
# Parse the effective dates into datetimes; because of errors='coerce',
# values that cannot be parsed become NaT instead of raising.
data['effective_to_date'] = data['effective_to_date'].pipe(
    pd.to_datetime, errors='coerce'
)

In [None]:
# No NaN values are left to take care of, so check whether the numerical
# columns contain values <= 0 and replace them where it makes sense.
number_col = list(data.select_dtypes(include=[np.number]).columns.values)

for col in number_col:
    # Count on the boolean masks directly instead of building filtered
    # copies of the whole frame just to take their length.
    neg_val = int((data[col] < 0).sum())
    zero_val = int((data[col] == 0).sum())

    # Report both findings independently: a column can hold negative values
    # and zeros at the same time, and neither should hide the other.
    if neg_val > 0:
        print('Negative values in', col, ':', neg_val)
    if zero_val > 0:
        print('Zeros in', col, ':', zero_val)
    if neg_val == 0 and zero_val == 0:
        print('Column', col, 'is ok.')

In [None]:
# Replacing zeros only makes sense for 'income': work on a copy of the data
# and substitute the column median (computed over all rows, zeros included)
# for every zero entry.
data1 = data.copy()
income = data['income']
income_median = income.median()
data1['income'] = np.where(income == 0, income_median, income)

In [None]:
# Distinct values of the "is this id a repeat of an earlier row?" flag;
# the output contains True only if at least one id occurs more than once.
data1['id'].duplicated().unique()

In [None]:
# Draw one boxplot per numerical column to inspect the spread and outliers.
num_col = data1.select_dtypes(include=[np.number]).columns.tolist()

for numeric_name in num_col:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=data1[numeric_name], ax=ax)
    plt.show()

In [None]:
# Cap outliers where it makes sense: values beyond the 1.5 * IQR whiskers are
# set to the whisker value. No rows are removed.
outl_col = ['customer_lifetime_value', 'monthly_premium_auto', 'number_of_policies', 'total_claim_amount']

for col in outl_col:
    # Compute both quartiles once per column instead of re-evaluating the
    # percentiles for every limit.
    q1, q3 = np.percentile(data1[col], [25, 75])
    iqr = q3 - q1
    upper_limit = q3 + 1.5 * iqr
    lower_limit = q1 - 1.5 * iqr
    # Re-assign the whole clipped column rather than writing the (float)
    # limits into selected rows of a possibly integer-typed column, which
    # newer pandas versions warn about.
    data1[col] = data1[col].clip(lower=lower_limit, upper=upper_limit)

In [None]:
# Summary statistics with one row per numerical column.
data1.describe().transpose()

In [None]:
# Pairwise correlations of the numerical columns. The frame still holds
# string and datetime columns, and DataFrame.corr() no longer drops those
# silently in pandas >= 2.0, so restrict it to numeric columns explicitly.
data1_corr = data1.select_dtypes(include=[np.number]).corr()
data1_corr

In [None]:
# Plot the distribution of every numerical column of the cleaned data.
for numeric_name in num_col:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.distplot(data1[numeric_name], ax=ax)
    plt.show()

In [None]:
# No column is really promising judging by the correlations;
# monthly_premium_auto is the best among them.
# 'number_of_open_complaints' and 'number_of_policies' are left untransformed
# because they seem to trend towards certain values, which should not get
# lost by transforming and normalizing.

data_t = data1.copy()
trans_col = ['customer_lifetime_value', 'income', 'monthly_premium_auto', 'total_claim_amount']
#data_t['months_since_policy_inception'] = np.where(data_t['months_since_policy_inception']<=0, 0.1, data_t['months_since_policy_inception']) doesn't work. Why?

for feature in trans_col:
    # Box-Cox transform of the cleaned column; the second return value (the
    # fitted lambda) is not used afterwards.
    data_t[feature], _ci = stats.boxcox(data1[feature])
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.distplot(data_t[feature], ax=ax)
    plt.show()

In [None]:
# Pairwise correlations after the Box-Cox transformation. Only numeric
# columns are passed on, because DataFrame.corr() raises on string columns
# in pandas >= 2.0 instead of silently skipping them.
data_t_corr = data_t.select_dtypes(include=[np.number]).corr()
data_t_corr

In [None]:
# The correlation for monthly_premium_auto decreased after transforming the
# column, so the untransformed values are used instead; total_claim_amount
# is put back on its original scale as well.
for original_scale_col in ('monthly_premium_auto', 'total_claim_amount'):
    data_t[original_scale_col] = data1[original_scale_col]

### 4. Processing Data

(_Further processing..._)

- X-y split.
- Normalize (numerical).

In [None]:
# Model before normalizing and standardizing
t_num = list(data_t.select_dtypes(include=[np.number]).columns.values)
# The `np.object` alias was removed in NumPy 1.24; the string 'object'
# selects the same columns on every version.
t_object = list(data_t.select_dtypes(include=['object']).columns.values)

# Features = every numerical column except the target. The target is dropped
# by name rather than by its position in t_num, so the split does not depend
# on the column order.
t_drop = t_object + ['total_claim_amount', 'effective_to_date']
t_x = data_t.drop(t_drop, axis=1)
t_y = data_t['total_claim_amount']

lm = LinearRegression()
model2 = lm.fit(t_x, t_y)
t_predictions = lm.predict(t_x)
# Take the root of the MSE explicitly: the `squared=False` argument is
# deprecated and removed in newer scikit-learn releases.
t_rmse = np.sqrt(mean_squared_error(t_y, t_predictions))

print("R2_score:", round(lm.score(t_x, t_y), 2))
print("RMSE:", t_rmse)

# Fit the same regression with statsmodels to get the full OLS summary;
# add_constant supplies the intercept column.
t_x = sm.add_constant(t_x)
model = sm.OLS(t_y, t_x).fit()

print(model.summary())

In [None]:
# X-y split, then normalise and standardise the numerical features.
y = data_t['total_claim_amount']
x = data_t.drop(['total_claim_amount'], axis=1)
numeric_features = x.select_dtypes(include=np.number)

# Take the labels from the frame that is actually transformed instead of
# re-attaching a hard-coded list by position, so names and values cannot
# drift apart if the set or order of numerical columns changes.
sn_col = list(numeric_features.columns)

# NOTE(review): Normalizer rescales every row (sample) to unit norm, not
# every column - confirm that this is the intended kind of normalisation.
transformer = Normalizer()
transformer.fit(numeric_features)
x_normalized = transformer.transform(numeric_features)
# Keep the original row index so the result stays aligned with the target.
data_sn = pd.DataFrame(x_normalized, columns=sn_col, index=numeric_features.index)

transformer = StandardScaler()
transformer.fit(data_sn)
x_standardized = transformer.transform(data_sn)
data_sn = pd.DataFrame(x_standardized, columns=sn_col, index=numeric_features.index)

In [None]:
# Plot the distribution of every normalised and standardised feature.
for scaled_name in sn_col:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.distplot(data_sn[scaled_name], ax=ax)
    plt.show()

In [None]:
# Model after normalizing and standardizing
sn_x = data_sn
sn_y = data_t['total_claim_amount']

lm_sn = LinearRegression()
model_sn2 = lm_sn.fit(sn_x, sn_y)
sn_predictions = lm_sn.predict(sn_x)
# Take the root of the MSE explicitly: the `squared=False` argument is
# deprecated and removed in newer scikit-learn releases.
sn_rmse = np.sqrt(mean_squared_error(sn_y, sn_predictions))

print("R2_score:", round(lm_sn.score(sn_x, sn_y), 2))
print("RMSE:", sn_rmse)

# Fit the same regression with statsmodels to get the full OLS summary;
# add_constant supplies the intercept column.
sn_x = sm.add_constant(sn_x)
model_sn = sm.OLS(sn_y, sn_x).fit()

print(model_sn.summary())