In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import OneHotEncoder  # alternative one-hot encoder; pandas.get_dummies can be used instead
from sklearn.preprocessing import PowerTransformer
from scipy.stats import boxcox  # Box-Cox power transform (not a z-transformation; StandardScaler does the z-scaling)

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from scipy.stats import boxcox  # NOTE(review): duplicate of the boxcox import a few lines up; harmless
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
pd.options.display.max_rows = 50  # upper bound on the number of rows pandas displays
# Optional: reading Excel files with pandas needs an extra engine package, e.g.
#   conda install openpyxl
#   conda install xlrd

# Load data 

In [None]:
# Read the raw customer-analysis data; the path is relative to the working directory.
ca_df = pd.read_csv(
    filepath_or_buffer="data/Data_Marketing_Customer_Analysis_Round3.csv",
)

In [None]:
# Show the loaded DataFrame as the cell output.
ca_df

In [None]:
# Summary of the columns: names, non-null counts and dtypes.
ca_df.info()

# Cleaning data 

## All of the following steps were already done beforehand
- 1 Standardize column names
- 2 Deleting and rearranging columns
- 3 Working with data types (set the correct type)
- 4 Filtering data
- 5 Removing duplicates
- 6 Correcting typos
- 7 Conditional formatting
- 8 Replace missing values


# Split the DataFrame into numerical and categorical features

In [None]:
# Numeric columns (ints, floats, bools) go to `ca_numerical`; object (string)
# columns go to `ca_categorical`.
# `select_dtypes` is the public API for what the private `_get_numeric_data`
# did, and `.copy()` makes both frames independent of `ca_df`, so the later
# drop/insert steps never touch the raw data or raise SettingWithCopyWarning.
ca_numerical = ca_df.select_dtypes(include=["number", "bool"]).copy()
ca_categorical = ca_df.select_dtypes(include=["object"]).copy()

ca_numerical

In [None]:
# Show the object-dtype (categorical) columns.
ca_categorical

In [None]:
# number_of_open_complaints is taken out of the numerical frame; it is added to
# the categorical frame in the next cell.
# Rebinding (instead of inplace=True) plus errors="ignore" keeps this cell
# re-runnable: a second execution no longer raises KeyError.
ca_numerical = ca_numerical.drop(
    columns=["number_of_open_complaints"], errors="ignore"
)
ca_numerical

In [None]:
# Put number_of_open_complaints at the front of the categorical frame.
# DataFrame.insert raises ValueError when the column already exists, so the
# membership check keeps the cell safe to re-run.
if "number_of_open_complaints" not in ca_categorical.columns:
    ca_categorical.insert(
        0, "number_of_open_complaints", ca_df["number_of_open_complaints"]
    )

In [None]:
# Remove effective_to_date from the categorical features
# (presumably because the separate `month` column carries the date information — TODO confirm).
# Rebinding with errors="ignore" instead of inplace=True keeps the cell re-runnable.
ca_categorical = ca_categorical.drop(columns=["effective_to_date"], errors="ignore")

In [None]:
# Show the categorical frame after moving/removing columns.
ca_categorical

# One-hot / label encoding

## Label encoding

- All ordinal features are collected in a new DataFrame called `ca_categorical_ordinal`.

In [None]:
# Features treated as ordinal and label-encoded in the following cells:
#   coverage, education, month, policy
# The selection is copied so the encoded values are written to an independent frame.
ca_categorical_ordinal = ca_categorical[
    ["coverage", "education", "month", "policy"]
].copy()
ca_categorical_ordinal

In [None]:
# Frequency of each coverage level (used to define the mapping below).
ca_categorical_ordinal["coverage"].value_counts()

In [None]:
# Ordinal codes for the coverage levels, lowest to highest.
coverage = {
    "basic": 1,
    "extended": 2,
    "premium": 3,
}


In [None]:
# Encode coverage with the `coverage` mapping; values without an entry are left
# unchanged. Series.map with dict.get replaces DataFrame.replace, whose implicit
# object -> int downcast is deprecated since pandas 2.2, and it stays a no-op
# when the cell is executed again on already-encoded values.
ca_categorical_ordinal["coverage"] = ca_categorical_ordinal["coverage"].map(
    lambda value: coverage.get(value, value)
)
ca_categorical_ordinal

In [None]:
# Frequency of each education level (used to define the mapping below).
ca_categorical_ordinal["education"].value_counts()

In [None]:
# Ordinal codes for the education levels, lowest to highest.
education = {
    "high school or below": 1,
    "college": 2,
    "bachelor": 3,
    "master": 4,
    "doctor": 5,
}

In [None]:
# Encode education with the `education` mapping; values without an entry are
# left unchanged. Series.map with dict.get replaces DataFrame.replace, whose
# implicit object -> int downcast is deprecated since pandas 2.2, and it stays a
# no-op when the cell is executed again on already-encoded values.
ca_categorical_ordinal["education"] = ca_categorical_ordinal["education"].map(
    lambda value: education.get(value, value)
)
ca_categorical_ordinal

In [None]:
# Frequency of each month value (used to define the mapping below).
ca_categorical_ordinal["month"].value_counts()

In [None]:
# Calendar-order codes for the month abbreviations covered by this mapping.
month = {
    "jan": 1,
    "feb": 2,
}

In [None]:
# Encode month with the `month` mapping; values without an entry are left
# unchanged. Series.map with dict.get replaces DataFrame.replace, whose implicit
# object -> int downcast is deprecated since pandas 2.2, and it stays a no-op
# when the cell is executed again on already-encoded values.
ca_categorical_ordinal["month"] = ca_categorical_ordinal["month"].map(
    lambda value: month.get(value, value)
)
ca_categorical_ordinal

In [None]:
# Frequency of each policy value (used to define the mapping below).
ca_categorical_ordinal["policy"].value_counts()

In [None]:
# Map each policy to its family: personal -> 1, corporate -> 2, special -> 3.
# The original keys were spelled "speical", which would leave any "special ..."
# rows unencoded. The correctly spelled keys are added; the old spelling is kept
# in case the data really contains it.
policy = {
    "personal l3": 1, "personal l1": 1, "personal l2": 1,
    "corporate l3": 2, "corporate l2": 2, "corporate l1": 2,
    "special l3": 3, "special l2": 3, "special l1": 3,
    "speical l3": 3, "speical l2": 3, "speical l1": 3,
}

In [None]:
# Encode policy with the `policy` mapping; values without an entry are left
# unchanged. Series.map with dict.get replaces DataFrame.replace, whose implicit
# object -> int downcast is deprecated since pandas 2.2, and it stays a no-op
# when the cell is executed again on already-encoded values.
ca_categorical_ordinal["policy"] = ca_categorical_ordinal["policy"].map(
    lambda value: policy.get(value, value)
)
ca_categorical_ordinal

## One-hot encoding
- All nominal features are one-hot encoded into a new DataFrame called `ca_categorical_hot`.

In [None]:
# Column overview of the categorical frame.
ca_categorical.info() 
# Nominal feature candidates for one-hot encoding:
#   region
#   response
#   employment_status
#   gender
#   location_code
#   marital_status
#   policy_type
#   policy  (NOTE(review): label-encoded together with the ordinal features, not one-hot encoded)
#   sales_channel
#   vehicle_class
#   vehicle_size

In [None]:
# Category frequencies of the nominal features, printed in one pass instead of
# one copy-pasted cell per column.
nominal_columns_to_inspect = [
    "gender",
    "response",
    "sales_channel",
    "location_code",
    "marital_status",
    "vehicle_size",
    "vehicle_class",
    "policy_type",
]
for column_name in nominal_columns_to_inspect:
    # A blank line separates the individual frequency tables.
    print(ca_categorical[column_name].value_counts(), end="\n\n")

In [None]:
# Nominal feature candidates for one-hot encoding:
#   region
#   response
#   employment_status
#   gender
#   location_code
#   marital_status
#   policy_type
#   policy  (NOTE(review): label-encoded together with the ordinal features, not one-hot encoded)
#   sales_channel
#   vehicle_class
#   vehicle_size

In [None]:
# One-hot encode all nominal features. drop_first=True drops one dummy per
# feature, so the remaining dummies are not perfectly collinear.
# sales_channel was listed among the nominal features in this notebook but was
# missing from the encoded columns; it is included here.
ca_categorical_hot = pd.get_dummies(
    ca_categorical[
        [
            "region",
            "response",
            "gender",
            "policy_type",
            "employment_status",
            "vehicle_class",
            "vehicle_size",
            "marital_status",
            "location_code",
            "sales_channel",
        ]
    ],
    drop_first=True,
)

In [None]:
# Show the one-hot encoded nominal features.
ca_categorical_hot

- Now we can concatenate `ca_categorical_hot` and `ca_categorical_ordinal`.

- Then insert `number_of_open_complaints`
- and `renew_offer_type`.
    - Note: the string prefix "offer" has to be removed first, and the remaining number converted from str to int.

In [None]:
# Place the one-hot encoded and the label-encoded features side by side.
categorical_features = pd.concat(
    objs=[ca_categorical_hot, ca_categorical_ordinal],
    axis=1,
)
categorical_features

In [None]:
# Add number_of_open_complaints as the first column of the encoded features.
# DataFrame.insert raises ValueError when the column already exists, so the
# membership check keeps the cell safe to re-run.
if "number_of_open_complaints" not in categorical_features.columns:
    categorical_features.insert(
        0, "number_of_open_complaints", ca_categorical["number_of_open_complaints"]
    )
categorical_features

In [None]:
# Frequency of each renew offer type before it is converted to a number.
ca_categorical["renew_offer_type"].value_counts()

In [None]:
# Turn "offerN" into the integer N.
# str.replace removes the literal prefix (lstrip("offer") strips a *set of
# characters*, not a prefix), and astype(str) makes the cell re-runnable once
# the column already holds integers.
ca_categorical["renew_offer_type"] = (
    ca_categorical["renew_offer_type"]
    .astype(str)
    .str.replace("offer", "", regex=False)
    .astype(int)
)

In [None]:
# Add renew_offer_type as the first column of the encoded features.
# DataFrame.insert raises ValueError when the column already exists, so the
# membership check keeps the cell safe to re-run.
if "renew_offer_type" not in categorical_features.columns:
    categorical_features.insert(
        0, "renew_offer_type", ca_categorical["renew_offer_type"]
    )
categorical_features

# Check correlated features in numerical

In [None]:
# Show the numerical features that are checked for correlation below.
ca_numerical

In [None]:
# Pairwise scatter plots and per-column distributions of the numerical features.
sns.pairplot(data=ca_numerical)

In [None]:
# Heatmap of the pairwise correlations. The upper triangle (diagonal included)
# is masked so every pair of features is shown only once.
corr = ca_numerical.corr()
mask = np.triu(np.ones(corr.shape))
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(9, 7))
    ax = sns.heatmap(
        corr,
        mask=mask,
        cmap='coolwarm',
        vmin=-1,
        vmax=1,
        annot=True,
        square=True,
    )

In [None]:
# List the columns whose absolute correlation with any earlier column exceeds 0.80.
# Only the strict upper triangle is kept, so each pair is examined once and the
# diagonal (self-correlation) is ignored.
corr_matrix = ca_numerical.corr().abs()
upper_triangle = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
)
upper_triangle.columns[(upper_triangle > 0.80).any()].tolist()

- No pair of features has an absolute correlation above 0.8, so we keep all of them.


# X-y split
- (y is the target variable, which is the total claim amount)


In [None]:
# Column overview of the numerical frame before separating features and target.
ca_numerical.info()

In [None]:
# Target: total_claim_amount. Features: every other numerical column.
y = ca_numerical["total_claim_amount"]
X = ca_numerical.drop(columns="total_claim_amount")


In [None]:
# Keep only numeric (int/float/bool) feature columns, using the public
# select_dtypes API instead of the private DataFrame._get_numeric_data.
X = X.select_dtypes(include=["number", "bool"])

In [None]:
# Show the feature matrix.
X

In [None]:
# Histogram of every feature column to inspect the distributions before scaling.
X.hist(figsize=(14,14))

# Train-test split
- Standardize the data (after the data split).

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.30,
    random_state=123,
)

In [None]:
# Learn the scaling parameters (mean and variance) from the training set only,
# then apply them to the training features.
std_scaler = StandardScaler()
std_scaler.fit(X_train)

X_train_scaled = std_scaler.transform(X_train)

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

In [None]:
# Scale the test features with the scaler that was fitted on the training set.
X_test_scaled=std_scaler.transform(X_test)

In [None]:
# Print the scaled training and test arrays, separated by a dashed line.
print(X_train_scaled, "--------", X_test_scaled, sep="\n")