# Lab | Customer Analysis Round 6

For this lab, we keep using the marketing_customer_analysis.csv file, which you can find in the files_for_lab folder.

## Get the data

We are using the marketing_customer_analysis.csv file.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import norm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.formula.api import ols
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Display all columns of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

# Load the raw customer data; the bare name on the last line displays the frame.
csv_path = 'files_for_lab/csv_files/marketing_customer_analysis.csv'
data = pd.read_csv(csv_path)
data

## Dealing with the data

Already done in round 2.

In [None]:
def clean_dfheaders(df):
    """Standardise the column headers of *df* in place and return *df*.

    'Customer' becomes 'id' and 'EmploymentStatus' becomes
    'employment_status'; every header is then lower-cased, stripped of
    surrounding whitespace and has its spaces replaced by underscores.
    """
    special_names = {'Customer': 'id', 'EmploymentStatus': 'employment_status'}

    def _tidy(header):
        # The explicit renames must come first: lower-casing alone would
        # turn 'EmploymentStatus' into 'employmentstatus'.
        header = special_names.get(header, header)
        return header.lower().strip().replace(' ', '_')

    df.columns = df.columns.map(_tidy)
    return df

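# Using the two operations together only works after removing the 'df =' in front of the first statement. Why?
#   -> With inplace=True, rename() modifies df and returns None, so 'df = df.rename(..., inplace=True)' rebinds df to None.
# Without the inplace parameter the first operation does not work at all inside a function, although it does work outside of one. Why?
#   -> Without inplace, rename() returns a new DataFrame; binding it to the local name df leaves the caller's object unchanged.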

# Normalise the headers of `data`; the function modifies the frame in place and returns it.
clean_dfheaders(data)

In [None]:
# Parse the date column into datetimes; errors='coerce' stores NaT for values that cannot be parsed instead of raising.
data['effective_to_date'] = pd.to_datetime(data['effective_to_date'], errors='coerce')

## Explore the data

Done in round 3.

## Processing Data

(Further processing...)

- X-y split. (done)
- Normalize (numerical). (done)
- One Hot/Label Encoding (categorical).
- Concat DataFrames

**X-Y Split** If you have not done it yet, take into account that the target is `total_claim_amount`.

In [None]:
# Work on a copy so the processing steps on `data1` leave `data` untouched.
data1 = data.copy()

In [None]:
# Names of all numeric columns.
num_col = list(data1.select_dtypes(include=[np.number]).columns.values)

# removing outliers where it makes sense
outl_col1 = ['customer_lifetime_value', 'monthly_premium_auto', 'number_of_policies', 'total_claim_amount']

# A value counts as an outlier when it lies more than 1.5 * IQR outside the
# quartiles. The quartiles are taken once, before the column is modified.
for col in outl_col1:
    q1, q3 = np.percentile(data1[col], [25, 75])
    iqr = q3 - q1
    upper_limit = q3 + 1.5*iqr
    lower_limit = q1 - 1.5*iqr
    if col == 'number_of_policies':
        # Cap this column at the limits.
        data1.loc[data1[col] > upper_limit, col] = upper_limit
        data1.loc[data1[col] < lower_limit, col] = lower_limit
    else:
        # Replace outliers in the other columns by the column median.
        data1.loc[data1[col] > upper_limit, col] = data1[col].median()
        data1.loc[data1[col] < lower_limit, col] = data1[col].median()

In [None]:
# checking for values <= 0 and reporting them before transforming
for col in num_col:
    neg_val = int((data1[col] < 0).sum())
    zero_val = int((data1[col] == 0).sum())

    # Report negatives and zeros independently so that a column containing
    # both does not hide its zeros.
    if neg_val > 0:
        print('Negative values in', col, ':', neg_val)
    if zero_val > 0:
        print('Zeros in', col, ':', zero_val)
    if neg_val == 0 and zero_val == 0:
        print('Column', col, 'is ok.')

**Normalize (numerical)** If you have not done it yet, you can define a function using `StandardScaler` from the sklearn library.

In [None]:
# Separate copy for the transformation steps, so `data1` keeps the untransformed values.
data_t = data1.copy()

In [None]:
# Transform
# Box-Cox only accepts strictly positive input, so zeros in these columns are
# replaced by the column median first.
repl_col = ['income', 'months_since_last_claim', 'months_since_policy_inception']

for col in repl_col:
    data_t[col] = np.where(data_t[col] == 0, data_t[col].median(), data_t[col])

trans_col = ['customer_lifetime_value', 'income', 'monthly_premium_auto', 'months_since_last_claim', 'months_since_policy_inception']

for col in trans_col:
    # Called without a lambda, boxcox returns the transformed values together
    # with the fitted lambda; the lambda is not used further here.
    transformed_col, _fitted_lambda = stats.boxcox(data_t[col])
    data_t[col] = transformed_col
    plt.figure(figsize=(6,4))
    # distplot is deprecated in seaborn; a density histogram with a KDE
    # overlay is the documented replacement.
    sns.histplot(data_t[col], kde=True, stat='density')
    plt.show()

In [None]:
# x-y split
t_num = list(data_t.select_dtypes(include=[np.number]).columns.values)
# Guarded so that re-running the cell does not fail once 'id' is already the index.
if 'id' in data_t.columns:
    data_t.set_index('id', inplace=True)
# The builtin `object` replaces the removed `np.object` alias.
t_object = list(data_t.select_dtypes(include=[object]).columns.values)

# The numeric feature matrix excludes the categorical columns, the target
# (named explicitly rather than by position) and the date column.
t_drop = t_object + ['total_claim_amount', 'effective_to_date']
x_t = data_t.drop(t_drop, axis=1)
y = data_t['total_claim_amount']

In [None]:
#Normalize and Standardize
# Normalizer rescales every row (sample) to unit norm; StandardScaler then
# centres and scales every column.
transformer = Normalizer().fit(x_t)
x_normalized = transformer.transform(x_t)

transformer = StandardScaler().fit(x_normalized)
x_standardized = transformer.transform(x_normalized)

# Take the column names from x_t instead of a hard-coded list, so the labels
# always match the columns that were actually scaled.
sn_col = list(x_t.columns)
data_sn = pd.DataFrame(x_standardized, columns=sn_col)

**One Hot/Label Encoding (categorical)** Try one of the two options learned in class

In [None]:
# Categorical (object-dtype) columns; the builtin `object` replaces the
# removed `np.object` alias.
x_cat = data_t.select_dtypes(include=[object])

# Show the distinct values of each categorical column and their frequencies.
for col in x_cat.columns:
    print(x_cat[col].value_counts())

In [None]:
# List the categorical column names (builtin `object` instead of the removed `np.object`).
list(data_t.select_dtypes(include=[object]).columns.values)

In [None]:
# 'coverage', 'education' and 'vehicle_size' are label encoded; every other
# categorical column is one-hot encoded.
x_1h = x_cat.drop(['coverage', 'education', 'vehicle_size'], axis=1)
x_label1 = x_cat['coverage']
x_label2 = x_cat['education']
x_label3 = x_cat['vehicle_size']

# drop='first' removes one dummy per feature to avoid perfectly collinear columns.
encoder = OneHotEncoder(handle_unknown='error', drop='first')
encoder.fit(x_1h)

encoded = encoder.transform(x_1h).toarray()
# Name the dummy columns after their source column and category.
try:
    onehot_names = encoder.get_feature_names_out(list(x_1h.columns))
except AttributeError:  # scikit-learn < 1.0
    onehot_names = encoder.get_feature_names(list(x_1h.columns))
data_1h = pd.DataFrame(encoded, columns=onehot_names)

# NOTE: LabelEncoder numbers the classes in sorted order, which need not match
# any natural ranking of the categories.
le1 = LabelEncoder().fit_transform(x_label1)
le2 = LabelEncoder().fit_transform(x_label2)
le3 = LabelEncoder().fit_transform(x_label3)

data_le1 = pd.DataFrame(le1, columns=['coverage'])
data_le2 = pd.DataFrame(le2, columns=['education'])
# Previously data_le2 was renamed a second time here, leaving this frame unnamed.
data_le3 = pd.DataFrame(le3, columns=['vehicle_size'])

**Concat DataFrames**

In [None]:
# Join the scaled numeric features with the encoded categorical ones.
# pd.concat keeps the column names that np.concatenate would discard; all
# frames carry the same default row index, so the rows line up.
data_x = pd.concat([data_sn, data_1h, data_le1, data_le2, data_le3], axis=1)
# Frames without explicit names contribute integer labels; cast every label
# to str so the feature names have a single type.
data_x.columns = data_x.columns.astype(str)

x = data_x.to_numpy()
data_x

## Linear Regression

- Train-test split.
- Apply linear regression.

**Train-test split** Divide your data in a train part and a test part

In [None]:
# Feature matrix and target used for modelling.
x, y = data_x, data_t['total_claim_amount']

# Hold out 20 % of the rows for testing; the fixed random_state makes the split repeatable.
split = train_test_split(x, y, test_size=0.2, random_state=7)
x_train, x_test, y_train, y_test = split

**Apply linear regression** For this question you can use `statsmodels` or `sklearn` libraries

In [None]:
# Fit a linear regression on the training split (fit returns the estimator itself).
model = LinearRegression().fit(x_train, y_train)

# Predictions for the held-out rows, used by the validation metrics.
predictions = model.predict(x_test)

## Model Validation

- Description:
R2.
MSE.
RMSE.
MAE.

**Get R2 from the model**

In [None]:
# Coefficient of determination of the predictions on the test split.
r2 = r2_score(y_true=y_test, y_pred=predictions)
print("R2:", r2)

**Get MSE from the model**

In [None]:
# mean_squared_error returns the MSE by default; the `squared` flag is
# deprecated in newer scikit-learn releases, so it is not passed.
mse = mean_squared_error(y_test, predictions)
print("MSE:", mse)

**Get RMSE from the model**

In [None]:
# RMSE as the square root of the MSE; this avoids the `squared=False` flag,
# which is deprecated in newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE:", rmse)

**Get MAE from the model**

In [None]:
# Mean absolute error of the predictions on the test split.
mae = mean_absolute_error(y_test, predictions)
# Print the MAE itself (the MSE was printed here before).
print("MAE:", mae)