<div style="text-align:center">
    <img src="../../files/monolearn-logo.png" height="150px">
    <h1>ML course</h1>
    <h3>Session 12: Churn modeling project</h3>
    <h4><a href="https://amzenterprise.ir/">Ali Momenzadeh</a></h4>
</div>

#### Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
#When using the 'inline' backend, your matplotlib graphs will be included in your notebook, next to the code.

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

#### Load and prepare data

In [None]:
# Read the churn dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="Churn.csv")

#### EDA

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# Summary statistics for the DataFrame's columns.
df.describe()

In [None]:
# Skewness of every numeric column. The frame still contains string columns at
# this point (they are dropped / one-hot encoded further down), and pandas >= 2.0
# raises on them unless the computation is restricted to numeric data.
df.skew(numeric_only=True)

#### Data Preprocessing

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Look at the first five rows again after the missing-value check.
df.head(n=5)

#### Storytelling - Visualization

In [None]:
# Histogram of customer ages, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(df["Age"])
ax.set_xlabel("Age")
ax.set_title("Age Distribution")
plt.show()

In [None]:
# Scatter plot of account balance against customer age.
# Axis labels are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(df["Age"], df["Balance"], edgecolors="Red")
ax.set_xlabel("Age")
ax.set_ylabel("Balance")
ax.set_title("Age Vs Balance")
plt.show()

In [None]:
# Distribution of CreditScore with its skewness shown in the legend.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement recommended by seaborn.
credit_skew = df["CreditScore"].skew()
sns.histplot(
    data=df,
    x="CreditScore",
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
    label="Skewness : %.2f" % credit_skew,
)
plt.legend(loc="best");

In [None]:
# Kernel density of Age, split by churn status (Exited == 0 vs Exited == 1).
# `fill=` replaces the deprecated `shade=` keyword of sns.kdeplot.
age_not_churn = df.loc[df["Exited"] == 0, "Age"].dropna()
age_churn = df.loc[df["Exited"] == 1, "Age"].dropna()

ax = sns.kdeplot(age_not_churn, color="Red", fill=True)
ax = sns.kdeplot(age_churn, ax=ax, color="Blue", fill=True)
ax.set_xlabel("Age")
# A KDE curve shows a probability density, not a count.
ax.set_ylabel("Density")
# Legend entries follow the drawing order: not-churn first, churn second.
ax.legend(["Not Churn", "Churn"]);

In [None]:
# Kernel density of CreditScore, split by churn status (Exited == 0 vs Exited == 1).
# `fill=` replaces the deprecated `shade=` keyword of sns.kdeplot.
score_not_churn = df.loc[df["Exited"] == 0, "CreditScore"].dropna()
score_churn = df.loc[df["Exited"] == 1, "CreditScore"].dropna()

ax = sns.kdeplot(score_not_churn, color="Red", fill=True)
ax = sns.kdeplot(score_churn, ax=ax, color="Blue", fill=True)
ax.set_xlabel("Credit Score")
# A KDE curve shows a probability density, not a count.
ax.set_ylabel("Density")
# Legend entries follow the drawing order: not-churn first, churn second.
ax.legend(["Not Churn", "Churn"]);

In [None]:
# Count of customers per gender.
sns.histplot(data=df, x="Gender")

In [None]:
# Count of customers per geography.
sns.histplot(data=df, x="Geography")

In [None]:
# Point estimate of Age per geography, with one line per gender.
sns.pointplot(data=df, x="Geography", y="Age", hue="Gender")

In [None]:
# Distribution of Balance with its skewness shown in the legend.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement recommended by seaborn.
balance_skew = df["Balance"].skew()
sns.histplot(
    data=df,
    x="Balance",
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
    label="Skewness : %.2f" % balance_skew,
)
plt.legend(loc="best");

In [None]:
# Distribution of EstimatedSalary with its skewness shown in the legend.
# The legend previously reported the skewness of Balance (copy-paste slip);
# it now reports the skewness of the column that is actually plotted.
# sns.distplot is deprecated; histplot with a KDE overlay replaces it.
salary_skew = df["EstimatedSalary"].skew()
sns.histplot(
    data=df,
    x="EstimatedSalary",
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
    label="Skewness : %.2f" % salary_skew,
)
plt.legend(loc="best");

In [None]:
# Share of customers per geography.
sizes = df['Geography'].value_counts(sort = True)
# The legend labels must come from the value_counts index so that they line up
# with the wedges; taking the raw column would label the wedges with the
# values of the first few rows instead.
labels = sizes.index.tolist()
colors = ["grey","purple","red"]

plt.pie(sizes,colors=colors,autopct='%1.1f%%',shadow=True,startangle=270)
plt.title('Geographical Area - Churn in Dataset')
plt.legend(labels)
plt.show()

In [None]:
# Share of churned vs retained customers.
sizes = df['Exited'].value_counts(sort = True)
# Build the legend from the value_counts index so each label matches its wedge
# (the raw column would yield the values of the first few rows instead), and
# translate the 0/1 codes into readable names.
labels = sizes.index.map({0: "Not Churn", 1: "Churn"}).tolist()
colors = ["purple","red"]

plt.pie(sizes,colors=colors,autopct='%1.1f%%',shadow=True,startangle=270)
plt.title('Exit Customers - Churn in Dataset')
plt.legend(labels)
plt.show()

In [None]:
# Box plot of customer age. The axis label now names the plotted column
# (it previously read "Tenure" although Age is what is drawn).
plt.boxplot(df['Age'])
plt.xlabel("Age")
plt.show()

In [None]:
# Histogram of the target column (default of 10 bins made explicit).
df["Exited"].hist(bins=10)

In [None]:
# Pairwise correlation of the numeric columns only: the frame still holds
# string columns here, and DataFrame.corr raises on them under pandas >= 2.0.
corr_data = df.select_dtypes(include="number").corr()
plt.figure(figsize=(18,10))
sns.heatmap(corr_data,annot=True,linewidths=2)

#### Data Preprocessing

In [None]:
# Remove the identifier columns, which are dropped before encoding and modelling.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [None]:
# First five rows after removing the identifier columns.
df.head(n=5)

#### Encoding

In [None]:
# One-hot encode the categorical columns; drop_first drops one level per
# column so the dummies are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# First five rows after one-hot encoding.
df.head(n=5)

Convert Exited to categorical

In [None]:
# Store the target column with the pandas 'category' dtype.
df = df.astype({"Exited": "category"})

In [None]:
# Print column names, non-null counts and dtypes after encoding and the category cast.
df.info()

#### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; the target variable is left unscaled.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split performed further down, so test-set statistics leak into the features.
# Consider fitting on the training split only.
target = df['Exited']
features = df.drop(columns=['Exited'])
columns = features.columns
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
# Reuse the original index so that re-attaching `target` aligns row by row even
# when the frame does not carry a default RangeIndex.
df = pd.DataFrame(scaled_features, columns=columns, index=features.index)
df['Exited'] = target

In [None]:
# First five rows after feature scaling.
df.head(n=5)

#### Train and test

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Exited'),
    df['Exited'],
    test_size=0.2,
    random_state=0,
)

##### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic-regression classifier and predict the hold-out set.
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)

# Hold-out accuracy, mean 10-fold cross-validation score on the training
# split, and the per-class report.
lr_test_accuracy = model_lr.score(X_test, y_test)
print("Logistic Regression Accuracy: ", lr_test_accuracy)
lr_cv_scores = cross_val_score(model_lr, X_train, y_train, cv=10)
print("Logistic Regression Cross validation score: ", np.mean(lr_cv_scores))
lr_report = metrics.classification_report(y_test, y_pred_lr)
print("Logistic Regression Classification report:\n", lr_report)

lr_confusion = confusion_matrix(y_test, y_pred_lr)
sns.heatmap(lr_confusion, annot=True, fmt="d")
plt.title('Logistic Regression Confusion matrix', y=1.05, size=15)

##### XGBoost

In [None]:
from xgboost import XGBClassifier

# Fit an XGBoost classifier with default settings and predict the hold-out set.
model_xgb = XGBClassifier()
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)

# Hold-out accuracy, mean 10-fold cross-validation score on the training
# split, and the per-class report.
xgb_test_accuracy = model_xgb.score(X_test, y_test)
print("XGBoost Accuracy: ", xgb_test_accuracy)
xgb_cv_scores = cross_val_score(model_xgb, X_train, y_train, cv=10)
print("XGBoost Cross validation score: ", np.mean(xgb_cv_scores))
xgb_report = metrics.classification_report(y_test, y_pred_xgb)
print("XGBoost Classification report:\n", xgb_report)

xgb_confusion = confusion_matrix(y_test, y_pred_xgb)
sns.heatmap(xgb_confusion, annot=True, fmt="d")
plt.title('XGBoost Confusion matrix', y=1.05, size=15)

##### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a default gradient-boosting classifier and predict the hold-out set.
model_gb = GradientBoostingClassifier()
model_gb.fit(X_train, y_train)
y_pred_gb = model_gb.predict(X_test)

# Hold-out accuracy, mean 10-fold cross-validation score on the training
# split, and the per-class report.
gb_test_accuracy = model_gb.score(X_test, y_test)
print("Gradient Boosting Accuracy: ", gb_test_accuracy)
gb_cv_scores = cross_val_score(model_gb, X_train, y_train, cv=10)
print("Gradient Boosting Cross validation score: ", np.mean(gb_cv_scores))
gb_report = metrics.classification_report(y_test, y_pred_gb)
print("Gradient Boosting Classification report:\n", gb_report)

gb_confusion = confusion_matrix(y_test, y_pred_gb)
sns.heatmap(gb_confusion, annot=True, fmt="d")
plt.title('Gradient Boosting Confusion matrix', y=1.05, size=15)

##### KNN or k-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a default k-nearest-neighbours classifier and predict the hold-out set.
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_test)

# Hold-out accuracy, mean 10-fold cross-validation score on the training
# split, and the per-class report.
knn_test_accuracy = model_knn.score(X_test, y_test)
print("KNN Accuracy: ", knn_test_accuracy)
knn_cv_scores = cross_val_score(model_knn, X_train, y_train, cv=10)
print("KNN Cross validation score: ", np.mean(knn_cv_scores))
knn_report = metrics.classification_report(y_test, y_pred_knn)
print("KNN Classification report:\n", knn_report)

knn_confusion = confusion_matrix(y_test, y_pred_knn)
sns.heatmap(knn_confusion, annot=True, fmt="d")
plt.title('KNN Confusion matrix', y=1.05, size=15)

##### Support Vector Machines (SVC)

In [None]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier. `gamma` only affects the 'rbf',
# 'poly' and 'sigmoid' kernels, so the former gamma=0.1 had no effect and has
# been removed to avoid suggesting that it is a tuned hyperparameter.
model_svm = SVC(kernel="linear", C=1)
model_svm.fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_test)

# Hold-out accuracy, mean 10-fold cross-validation score on the training
# split, and the per-class report.
print("SVM Accuracy: ", model_svm.score(X_test, y_test))
print("SVM Cross validation score: ", np.mean(cross_val_score(model_svm, X_train, y_train, cv=10)))
print("SVM Classification report:\n", metrics.classification_report(y_test, y_pred_svm))

sns.heatmap(confusion_matrix(y_test, y_pred_svm), annot=True, fmt="d")
plt.title('SVM Confusion matrix', y=1.05, size=15)

##### Naive Bayes classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier. Names follow the convention used by the
# other model cells: `model_*` holds the estimator, `y_pred_*` the predictions
# (the estimator used to be stored in a variable called `y_pred_nb`).
model_nb = GaussianNB()
model_nb.fit(X_train, y_train)
y_pred_nb = model_nb.predict(X_test)

# Hold-out accuracy, mean 10-fold cross-validation score on the training
# split, and the per-class report.
print("Naive Bayes Accuracy: ", model_nb.score(X_test, y_test))
print("Naive Bayes Cross validation score: ", np.mean(cross_val_score(model_nb, X_train, y_train, cv=10)))
print("Naive Bayes Classification report:\n", metrics.classification_report(y_test, y_pred_nb))

sns.heatmap(confusion_matrix(y_test, y_pred_nb), annot=True, fmt="d")
plt.title('Naive Bayes Confusion matrix', y=1.05, size=15)

##### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with the Gini criterion. A fixed random_state makes the fitted
# tree reproducible across runs.
model_dt = DecisionTreeClassifier(criterion = "gini", random_state=0)
model_dt.fit(X_train, y_train)
# Predictions get their own name; they used to be written to `y_pred_rf`,
# the variable that belongs to the random-forest cell.
y_pred_dt = model_dt.predict(X_test)

# Hold-out accuracy, mean 10-fold cross-validation score on the training
# split, and the per-class report.
print("Decision Tree Accuracy: ", model_dt.score(X_test, y_test))
print("Decision Tree Cross validation score: ", np.mean(cross_val_score(model_dt, X_train, y_train, cv=10)))
print("Decision Tree Classification report:\n", metrics.classification_report(y_test, y_pred_dt))

sns.heatmap(confusion_matrix(y_test, y_pred_dt), annot=True, fmt="d")
plt.title('Decision Tree Confusion matrix', y=1.05, size=15)

##### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, depth-limited to 10, with a fixed seed.
model_rf = RandomForestClassifier(random_state=100, max_depth=10, n_estimators=100)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

# Hold-out accuracy, mean 10-fold cross-validation score on the training
# split, and the per-class report.
rf_test_accuracy = model_rf.score(X_test, y_test)
print("Random Forest Accuracy: ", rf_test_accuracy)
rf_cv_scores = cross_val_score(model_rf, X_train, y_train, cv=10)
print("Random Forest Cross validation score: ", np.mean(rf_cv_scores))
rf_report = metrics.classification_report(y_test, y_pred_rf)
print("Random Forest Classification report:\n", rf_report)

rf_confusion = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(rf_confusion, annot=True, fmt="d")
plt.title('Random Forest Confusion matrix', y=1.05, size=15)

##### Artificial neural network (ANN)

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU, PReLU, ELU
from keras.layers import Dropout

In [None]:
# Feed-forward binary classifier: three ReLU hidden layers with dropout and a
# single sigmoid output unit.
# The input width is taken from the training matrix instead of a hard-coded 11,
# so the network keeps working if the feature set changes.
n_features = X_train.shape[1]

model_ann = Sequential()
model_ann.add(Dense(units = 10, kernel_initializer = "he_normal", activation = "relu", input_dim = n_features))
model_ann.add(Dropout(0.3))
model_ann.add(Dense(units=20, kernel_initializer="he_normal", activation = "relu"))
model_ann.add(Dropout(0.4))
model_ann.add(Dense(units=15, kernel_initializer="he_normal", activation = "relu"))
model_ann.add(Dropout(0.2))
model_ann.add(Dense(units=1, kernel_initializer = "glorot_uniform", activation = "sigmoid"))
model_ann.summary()

In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked per epoch.
model_ann.compile(
    loss='binary_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 100 epochs in mini-batches of 10, holding back 25% of the training
# data for validation; the returned History object is kept for the plots below.
model_ann_fit = model_ann.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=100,
    validation_split=0.25,
)

In [None]:
# List the metric names recorded during training.
history_keys = model_ann_fit.history.keys()
print(history_keys)

In [None]:
# Training vs validation accuracy per epoch.
# The second curve comes from the validation split held back during fit (not
# from the test set), so the legend now says "validation".
plt.plot(model_ann_fit.history['accuracy'])
plt.plot(model_ann_fit.history['val_accuracy'])
plt.title("Model Accuracy")
plt.ylabel("Accuracy")
plt.xlabel("Epochs")
plt.legend(['train','validation'], loc = 'upper left')
plt.show()

In [None]:
# Training vs validation loss per epoch.
# The title used to read "Model Accuracy" although this figure shows the loss;
# the legend names the validation split rather than "test".
plt.plot(model_ann_fit.history['loss'])
plt.plot(model_ann_fit.history['val_loss'])
plt.title("Model Loss")
plt.ylabel("Loss")
plt.xlabel("Epochs")
plt.legend(['train','validation'], loc = 'upper left')
plt.show()

In [None]:
# Raw network outputs for the hold-out set (the output layer is a single sigmoid unit).
y_pred_ann = model_ann.predict(X_test)

In [None]:
# Inspect the raw outputs before thresholding.
y_pred_ann

In [None]:
# Turn the sigmoid outputs into boolean class predictions with a 0.5 cut-off.
y_pred_ann = np.greater(y_pred_ann, 0.5)

In [None]:
# Inspect the boolean predictions after thresholding.
y_pred_ann

In [None]:
# Confusion matrix of the ANN predictions on the hold-out set.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_ann)
cm

In [None]:
# Accuracy of the ANN predictions on the hold-out set.
acc_sc = accuracy_score(y_true=y_test, y_pred=y_pred_ann)
acc_sc