# Import libraries

In [None]:
import pandas as pd #data frames (for storing data)
import numpy as np #scientific computing
import itertools

#matplotlib for plotting
import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib.ticker as mtick #for percentage ticks
import scikitplot as skplt
import seaborn as sns

# Rebalancing data 
from imblearn.over_sampling import SMOTE # Upsampling the minority class
from imblearn.pipeline import Pipeline, make_pipeline 

# Classification import
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Model selection 
from sklearn.model_selection import train_test_split #Data split function
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Model performance evaluation
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import log_loss

# Plotting outlier
import plotly.express as px

# 1. Load data 

In [None]:
# Load the raw marketing campaign data.
data = pd.read_csv('marketing_data.csv')
# Rows with missing values are intentionally kept (no dropna here): a bare
# `data.dropna()` returns a new frame and would be a silent no-op, and the
# missing values are inspected and imputed in section 2.1 below.
data.head(10).style #add .style to show all columns (otherwise some columns will be hidden with "...")

In [None]:
# A bare `data.shape` is only echoed when it is the last expression of a cell,
# so print it explicitly before the column summary.
print(data.shape)
data.info()

# 2. Exploratory Data Analysis 

## 2.1 Check NULL (missing) values and abnormal values

In [None]:
# Number of missing values per column, shown as a single wide row.
null_counts = data.isnull().sum()
null_counts.to_frame(name='#Null values').T

We observe:
- The column ' Income ' shows some missing values (all other variables show 1100 entries).
- The column name ' Income ' has extra spaces, so we need to strip it.

#### Income

In [None]:
# Trim leading/trailing whitespace from every column label.
data = data.rename(columns=str.strip)

In [None]:
# Remove the '$' sign from the Income and convert the dtype from 'object' to 'float'
def income_convert(x):
    """Parse a currency string such as ``'$84,835.00 '`` into a float.

    Thousands separators are removed and the cents are truncated (everything
    from the first '.' onwards is dropped), so the result is a whole-dollar
    amount.

    Parameters
    ----------
    x : str or any
        Raw cell value. Anything that is not a string (for example the float
        NaN pandas uses for a missing cell) is treated as missing.

    Returns
    -------
    float
        The parsed amount, or NaN when ``x`` is not a string.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    if not isinstance(x, str):
        # Missing cells arrive as float NaN rather than as strings.
        return np.nan
    parts = x.split('$')
    # Use the text after the first '$' when there is one; otherwise fall back
    # to the whole string so values without a currency sign still parse.
    amount = parts[1] if len(parts) > 1 else parts[0]
    return float(amount.split('.')[0].replace(',', ''))
    
# Parse every raw income value, then fill the entries that could not be
# parsed (NaN) with the median of the parsed values.
parsed_income = data['Income'].map(income_convert)
data['Income'] = parsed_income.fillna(parsed_income.median())

#### Marital Status and Education 

In [None]:
# List the distinct labels of both categorical columns to spot odd values.
for column in ('Marital_Status', 'Education'):
    print(data[column].unique())

In [None]:
# In Marital_Status, the labels 'Alone', 'YOLO' and 'Absurd' are treated as 'Single'.
# Assign the result back instead of calling replace(..., inplace=True) on the
# attribute-accessed column: that chained in-place form is deprecated and does
# not modify `data` when pandas Copy-on-Write is enabled.
data['Marital_Status'] = data['Marital_Status'].replace(
    {'Alone': 'Single', 'YOLO': 'Single', 'Absurd': 'Single'}
)
# In Education, the value '2n Cycle' is treated as equivalent to 'Master'.
data['Education'] = data['Education'].replace({'2n Cycle': 'Master'})
print(data.Marital_Status.unique())
print(data.Education.unique())

#### Transform Year_Birth to Age and Dt_Customer to Enrollment duration

In [None]:
# Replace Year_Birth with Age and Dt_Customer with Enrollment_duration.
# NOTE: both are measured against the calendar year at run time, so the
# values (and anything thresholded on them later) change from year to year.
from datetime import date

# Age in years, inserted as the second column; ID and Year_Birth are dropped.
Age = date.today().year - data['Year_Birth']
data.insert(loc=1, column='Age', value=Age)
data = data.drop(columns=['ID', 'Year_Birth'])

# Whole years since the enrollment year, inserted at position 7 (counted after
# the drop above); the raw date column is then removed.
enrollment_year = pd.to_datetime(data['Dt_Customer']).dt.year
Enroll_duration = date.today().year - enrollment_year
data.insert(loc=7, column='Enrollment_duration', value=Enroll_duration)
data = data.drop(columns=['Dt_Customer'])

## 2.2 Check outliers  

### Age

In [None]:
# Interactive box plot of Age to reveal implausible values.
fig_age = px.box(data_frame=data, y='Age')
fig_age.show()

We notice that there are 3 people aged 122, 123, and 129, which is implausible. We therefore conclude that these values are data-collection mistakes, and we decide to remove the observations for those people.


In [None]:
# Keep only customers aged 85 or younger. Take an explicit copy so that the
# later in-place edits (e.g. the `.loc` assignment on Income) act on an
# independent frame instead of a slice of the original, which would raise
# SettingWithCopyWarning.
data = data[data['Age'] <= 85].copy()
data['Age'].unique()

In [None]:
# Interactive box plot of Income, rendered at a fixed 600x400 size.
fig_income = px.box(data_frame=data, y='Income')
fig_income.update_layout(height=400, width=600)
fig_income.show()

According to the box plot, Income has three outliers, so we replace them with the median Income.

In [None]:
# Overwrite every income above 162397 with the (integer-truncated) median
# income, then report the resulting maximum and mean.
median_income = int(data['Income'].median())
above_threshold = data['Income'] > 162397
data.loc[above_threshold, 'Income'] = median_income
for statistic in (data['Income'].max(), data['Income'].mean()):
    print(statistic)

## 2.3 Some descriptive statistics

In [None]:
# Summary statistics, one row per column.
summary = data.describe()
summary.transpose()

# 3. Data Visualisation

## 3.1 Response vs non-response situation 

In [None]:
# Plot the share of non-responders vs. responders as one stacked bar.
# `c` holds the two bar colours and is reused by the later plots.
c = ['#0E4C92', '#77C6FC']
# np.unique returns the sorted distinct Response values with their counts;
# index 0 is labelled 'no response' and index 1 'response' below
# (assumes Response is coded 0/1 -- confirm against the data).
keys, counts = np.unique(data.Response, return_counts=True)
counts_norm = counts / counts.sum()

fig = plt.figure(figsize=(3, 5)) #specify figure size
plt.bar(['Data'], [counts_norm[0]], label='no response', color=c[0])
plt.bar(['Data'], [counts_norm[1]], bottom=counts_norm[0], label='response', color=c[1])
plt.legend(bbox_to_anchor=(1, 1))
plt.ylabel('frequency')

# plt.text expects a scalar x position; the single category 'Data' is drawn at
# x = 0 on the categorical axis, so place both labels there (passing the list
# ['Data'] relied on implicit array-to-scalar conversion).
plt.text(0, counts_norm[0] / 2, '{}%'.format((counts_norm[0]*100).round(1)), color='white', fontweight='semibold')
plt.text(0, counts_norm[1] / 2 + counts_norm[0], '{}%'.format((counts_norm[1]*100).round(1)))

plt.show()

We observe that the non-response cases account for about 85.1% of all observations. This imbalanced distribution of the response variable (response) occurs in many real-life Data Science problems and requires careful consideration when designing a classification model.

## 3.2 Response by some important features 

In [None]:
# Response values of the rows whose Response equals 1 (original index kept).
y_response = data.loc[data['Response'] == 1, 'Response']

#### Response by education level

In [None]:
%matplotlib inline
ax = pd.crosstab(data.Education, y_response).plot(kind='bar', color=c)
plt.title('Response by Education')
plt.xlabel('Education')

#### Response by Marital_Status

In [None]:
%matplotlib inline
ax = pd.crosstab(data.Marital_Status, y_response).plot(kind='bar', color=c)
plt.title('Response by Marital Status')
plt.xlabel('Marital Status')

#### Response by country

In [None]:
%matplotlib inline
ax = pd.crosstab(data.Country, y_response).plot(kind='bar', color=c)
plt.title('Response by Country')
plt.xlabel('Country')

In [None]:
import seaborn as sns
# Correlate the numeric columns only: at this point `data` still contains the
# string columns Education, Marital_Status and Country, and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0.
corrmat = data.select_dtypes(include='number').corr()
top_corr_features = corrmat.index
plt.figure(figsize=(25,25))
# Reuse the matrix computed above rather than recomputing the correlation.
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

## 4. Data cleaning & Pre-processing 

#### 4.1 Remove variables that have no explanatory power or have been transformed

In [None]:
# Columns excluded from modelling.
excluded_columns = ['Age', 'Kidhome', 'Teenhome', 'Recency', 'NumWebVisitsMonth', 'Complain']
data = data.drop(columns=excluded_columns)

#### 4.2 Encode categorical variables

In [None]:
# One-hot encode the categorical columns; drop_first=True omits one dummy per
# column, and each dummy is prefixed with its source column name (the
# get_dummies default) for easier identification.
categorical_columns = ["Education", 'Marital_Status', 'Country']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
data.head().style

In [None]:
# Show the (rows, columns) dimensions of `data`.
data.shape

## 5. Data split 

To estimate how the model will perform on unseen data, we split our dataset into two subsets: training and testing. We use the training partition to build the model and the testing partition to evaluate the model performance.

We split the data 70:30 into a training and a testing partition. Furthermore, we split the dataset into a feature matrix X (all columns, except the target Response column) and a label vector y (only the Response column).

In [None]:
# Label vector y is the Response column; feature matrix X is everything else.
y = data['Response']
X = data.drop(columns='Response')
# 70:30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=12345
)

We visualize the split to ensure that the distribution of response to non-response cases matches the distribution in the full dataset.

In [None]:
# Report the dimensions of every partition.
partition_shapes = [
    ('Train_x:', X_train.shape),
    ('Train_y:', y_train.shape),
    ('Test_x:', X_test.shape),
    ('Test_y:', y_test.shape),
]
for label, shape in partition_shapes:
    print(label, shape)

##  6. Rebalancing using SMOTE

In [None]:
# Oversample the minority class with SMOTE. Only the training partition is
# resampled; the test partition is left untouched.
smote = SMOTE(random_state=0, sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X_train, y_train)

In [None]:
# Class proportions of each partition (absolute counts divided by size).
train_dist = y_train.value_counts() / len(y_train)
test_dist = y_test.value_counts() / len(y_test)
data_dist = y.value_counts() / len(y)
smote_dist = pd.Series(y_sm).value_counts() / len(pd.Series(y_sm))

# Bars are listed bottom-to-top in this order.
partition_names = ['X_train (SMOTE)', 'Test', 'Train', 'Data']
distributions = [smote_dist, test_dist, train_dist, data_dist]
share_no = [dist[0] for dist in distributions]
share_yes = [dist[1] for dist in distributions]

fig, ax = plt.subplots()

# Stacked horizontal bars: class 0 first, class 1 appended to its right.
ax.barh(partition_names, share_no, color=c[0], label='0 (no)')
ax.barh(partition_names, share_yes, left=share_no, color=c[1], label='1 (yes)')
ax.set_title('Split visualization', size = 15)
ax.legend(loc='upper left')
ax.set_xlabel('Proportion')
ax.set_ylabel('Partition')

# Write each proportion at the centre of its bar segment.
for part, a, b in zip(partition_names, share_no, share_yes):
    ax.text(a/2, part, str(np.round(a, 2)), color = 'white', fontweight = 'semibold')
    ax.text(b/2+a, part, str(np.round(b, 2)), fontweight = 'medium');

# 7. Modeling


## 7.1 Logistic Regression

### 7.1.1 Build model with balanced data (SMOTE)

In [None]:
# Grid-search the logistic regression hyperparameters. SMOTE sits inside the
# pipeline, so resampling happens within each cross-validation training fold.
model_LR = LogisticRegression()
pipeline_LR = make_pipeline(SMOTE(random_state = 0), model_LR)

cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
parameters = {'C':[0.8,0.9,1,1.1,1.2],'random_state':[42], 'solver':['liblinear']}
# make_pipeline names the step after the lowercased class, so grid keys need
# the 'logisticregression__' prefix.
newLR_params = {
    'logisticregression__' + name: values for name, values in parameters.items()
}
model_LR_leGrid = GridSearchCV(
    pipeline_LR,
    param_grid=newLR_params,
    cv=cross_validation,
    scoring='accuracy',
    error_score=0,
)
model_LR_leGrid.fit(X_train, y_train)  # run the search on the training partition
print("Best score: ", (model_LR_leGrid.best_score_*100).round(2))
print("The best parameters:", model_LR_leGrid.best_params_)

In [None]:
# LR model trained on the SMOTE-balanced training data
lr_SMOTE = LogisticRegression(solver='liblinear', random_state= 42, C = 1.1)
lr_SMOTE.fit(X_sm,y_sm)
lrSMOTE_pred = lr_SMOTE.predict(X_test)
print('SMOTE Training set accuracy: {:.2%}'.format(lr_SMOTE.score(X_train, y_train)))
print('SMOTE Test set accuracy: {:.2%}'.format(lr_SMOTE.score(X_test, y_test)))
# log_loss is defined on predicted probabilities; feeding it the hard 0/1
# labels from predict() only yields a clipped, rescaled error rate. The
# built-in round() works whether log_loss returns a Python or a NumPy float.
print('Test error:', round(log_loss(y_test, lr_SMOTE.predict_proba(X_test)), 3))
print()

# LR model trained on the original (imbalanced) training data
lr_UB = LogisticRegression(solver='liblinear', random_state= 42, C = 1)
lr_UB.fit(X_train, y_train)
lrUB_pred = lr_UB.predict(X_test)
print('Unbalanced Training set accuracy: {:.2%}'.format(lr_UB.score(X_train, y_train)))
print('Unbalanced Test set accuracy: {:.2%}'.format(lr_UB.score(X_test, y_test)))
print('Test error:', round(log_loss(y_test, lr_UB.predict_proba(X_test)), 3))

## 7.2 Decision Tree 

### 7.1.1 Gridsearch Cross Validation

In [None]:
# Grid-search the decision tree hyperparameters. SMOTE sits inside the
# pipeline, so resampling happens within each cross-validation training fold.
model_tree = DecisionTreeClassifier()
parameters = {'criterion' : ['gini', 'entropy'],'max_depth':[3,4,5,6], 'random_state': [13,42]}
pipeline_rfc= make_pipeline(SMOTE(random_state = 0), model_tree)
cross_validation = StratifiedKFold(n_splits = 5, random_state = 0 , shuffle = True)
# make_pipeline names the step after the lowercased class, so grid keys need
# the 'decisiontreeclassifier__' prefix. (Variable name kept from the logistic
# regression cell; it holds the tree grid here.)
newLR_params = {'decisiontreeclassifier__' + key: parameters[key] for key in parameters}
model_tree_legrid = GridSearchCV(pipeline_rfc, param_grid= newLR_params, cv=cross_validation, scoring = 'accuracy', error_score=0)
# Search on the training partition only, as the logistic regression search
# does: fitting on the full X, y would let the test rows influence the chosen
# hyperparameters and bias the later test-set evaluation.
model_tree_legrid.fit(X_train, y_train)
print('Decision Tree Classifier Cross validation score:', (model_tree_legrid.best_score_*100).round(2))
print(model_tree_legrid.best_params_)

### 7.2.1 Build the model and train it 

In [None]:
# Decision tree model trained on the SMOTE-balanced training data
tree_SMOTE = DecisionTreeClassifier( criterion = 'gini', max_depth = 6, random_state = 42)
tree_SMOTE.fit(X_sm,y_sm)
treeSMOTE_pred = tree_SMOTE.predict(X_test)
print('SMOTE Training set accuracy: {:.2%}'.format(tree_SMOTE.score(X_train, y_train)))
print('SMOTE Test set accuracy: {:.2%}'.format(tree_SMOTE.score(X_test, y_test)))
# log_loss is defined on predicted probabilities; feeding it the hard 0/1
# labels from predict() only yields a clipped, rescaled error rate. The
# built-in round() works whether log_loss returns a Python or a NumPy float.
print('Test error:', round(log_loss(y_test, tree_SMOTE.predict_proba(X_test)), 3))
print()

# Decision tree model trained on the original (imbalanced) training data
tree_UB = DecisionTreeClassifier( criterion = 'gini', max_depth = 6, random_state = 42)
tree_UB.fit(X_train, y_train)
treeUB_pred = tree_UB.predict(X_test)
print('Unbalanced Training set accuracy: {:.2%}'.format(tree_UB.score(X_train, y_train)))
print('Unbalanced Test set accuracy: {:.2%}'.format(tree_UB.score(X_test, y_test)))
# Reported for parity with the SMOTE model above and the logistic regression cell.
print('Test error:', round(log_loss(y_test, tree_UB.predict_proba(X_test)), 3))

## 7. Model Evaluation 

In this project, we will use the following evaluation metrics:
  - Confusion matrix
  - Precision, recall, F-measure and support
 

### 7.1 Confusion Matrix 

In [None]:
# Confusion matrices of all four models on the test partition.
lr_SMOTE_cf = confusion_matrix(y_test, lrSMOTE_pred)
lr_UB_cf = confusion_matrix(y_test, lrUB_pred)
tree_SMOTE_cf = confusion_matrix(y_test, treeSMOTE_pred)
tree_UB_cf = confusion_matrix(y_test, treeUB_pred)

# Visualize the confusion matrices in a 2x2 grid (row-major order below).
# Creating the grid in one call avoids the stray full-figure axes that a bare
# plt.subplot() adds underneath the four panels.
cm_fig, cm_axes = plt.subplots(2, 2, figsize=(12, 10))
cm_panels = [
    (lr_UB_cf, 'Logistic Regression Unbalanced Model'),
    (lr_SMOTE_cf, 'Logistic Regression SMOTE Model'),
    (tree_UB_cf, 'Decision Tree Unbalanced Model'),
    (tree_SMOTE_cf, 'Decision Tree SMOTE Model'),
]
for panel_ax, (matrix, title) in zip(cm_axes.flat, cm_panels):
    sns.heatmap(matrix, annot=True, fmt='g', ax=panel_ax, cmap="Blues")
    panel_ax.set_xlabel('Predicted labels')
    panel_ax.set_ylabel('True labels')
    panel_ax.set_title(title, fontsize=15)

plt.tight_layout()
plt.show()

## 7.2 Precision, Recall, F-measure and support

In [None]:
# Precision, recall, F-measure and support of every model on the test partition.
from sklearn.metrics import classification_report

report_inputs = [
    ('Logistic Regression SMOTE data\n\n', lrSMOTE_pred),
    ('Logistic Regression unbalanced data\n\n', lrUB_pred),
    ('Decision Tree SMOTE data\n\n', treeSMOTE_pred),
    ('Decision Tree unbalanced data\n\n', treeUB_pred),
]
for heading, predictions in report_inputs:
    print(heading, classification_report(y_test, predictions))