In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Data Exploration and Pre-processing

### 1) load the given dataset

In [2]:
# Load the telecom churn dataset into `df`. The path is relative to the
# notebook's working directory; the recorded run of this cell failed with
# FileNotFoundError because the file was not found there.
df = pd.read_csv('churn_telecom.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'churn_telecom.csv'

In [None]:
df.shape

In [None]:
pd.set_option('display.max_columns',None)

In [None]:
df

In [None]:
df.info()

### 2) print all the column names

In [None]:
df.columns

In [None]:
df['State'].unique()

### 3) describe the data

In [None]:
df.describe()

### 4) find all the Null values

In [None]:
df.isnull().sum()

### 5) plot the customers who have international plans

In [None]:
# Bar chart of customers with / without an international plan, followed by the
# exact counts. The column is passed through the `x` keyword because recent
# seaborn releases no longer accept data variables as positional arguments.
sns.countplot(x='International plan', data=df)
print(df['International plan'].value_counts())

### 6) plot the customers who have Voice mail plan

In [None]:
# Bar chart of customers with / without a voice mail plan, followed by the
# exact counts. The column is passed through the `x` keyword because recent
# seaborn releases no longer accept data variables as positional arguments.
sns.countplot(x='Voice mail plan', data=df)
print(df['Voice mail plan'].value_counts())

### 7) Plot the total day calls

In [None]:
# Distribution of the number of day calls per customer.
sns.set(rc={'figure.figsize':(12,6)})
# `distplot` is deprecated; a density-normalised histogram with a KDE overlay
# is its documented replacement.
sns.histplot(df['Total day calls'], kde=True, stat='density')

### 8) Plot the total day charge

In [None]:
# Distribution of the day charge per customer.
sns.set(rc={'figure.figsize':(12,6)})
# `distplot` is deprecated; a density-normalised histogram with a KDE overlay
# is its documented replacement.
sns.histplot(df['Total day charge'], kde=True, stat='density', color='red')

### 9) Display pie chart for value count in Churn column

In [None]:
# Only the last expression of a cell is displayed, so the counts are printed
# explicitly; the distinct labels are then shown as the cell output.
print(df['Churn'].value_counts())
df['Churn'].unique()

In [None]:
# Pie chart of the Churn class balance.
# Sizes and labels are both taken from the same value_counts() result so each
# slice is guaranteed to carry its own label. (`unique()` returns labels in
# order of first appearance, which need not match a hard-coded False/True order.)
churn_counts = df['Churn'].value_counts().sort_index()
values = churn_counts.tolist()
plt.pie(values, labels=churn_counts.index.astype(str))

### 10) Display a scatter plot between total day calls and total day charges

In [None]:
plt.xlabel('Total Day Calls')
plt.ylabel('Total Day Charge')
plt.scatter(df['Total day calls'],df['Total day charge'])

In [None]:
sns.set(rc={'figure.figsize':(12,8)})
sns.scatterplot(data=df,x="Total day calls",y="Total day charge",hue="Churn")

### 11) Display a scatter plot between total day calls and total night calls

In [None]:
plt.xlabel('Total Day Calls')
plt.ylabel('Total Night Calls')
plt.scatter(df['Total day calls'],df['Total night calls'],c='green')

In [None]:
# Day calls against night calls, coloured by churn. The y variable is
# 'Total night calls' (not 'Total night charge') to match this section's task
# and the matplotlib scatter drawn in the previous cell.
sns.scatterplot(data=df,x="Total day calls",y="Total night calls",hue="Churn")

In [None]:
#sns.set(rc={'figure.figsize':(12,8)})
#sns.relplot(x='Total day calls',y='Total night calls',hue='Churn',data=df)

### 12) Display a boxplot of Total day minutes with respect to Churn

In [None]:
# Spread of total day minutes for churned vs. retained customers.
sns.set(rc={'figure.figsize':(8,6)})
# x / y are passed as keywords: recent seaborn releases no longer accept data
# variables as positional arguments.
sns.boxplot(x='Churn', y='Total day minutes', data=df)

### 13) Display a boxplot of Total day charge with respect to Churn

In [None]:
# Spread of total day charge for churned vs. retained customers.
sns.set(rc={'figure.figsize':(8,6)})
# x / y are passed as keywords: recent seaborn releases no longer accept data
# variables as positional arguments.
sns.boxplot(x='Churn', y='Total day charge', data=df)

# Working with models

### 1) Perform encoding on churn

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
df1 = df.copy()

In [None]:
# Label-encode the Churn column.
# NOTE: the chained assignment writes the encoded values into the original
# `df` as well as into the working copy `df1`; later cells read the target
# back from `df['Churn']`.
df1['Churn'] = df['Churn'] = LabelEncoder().fit_transform(df1['Churn'])

In [None]:
df1['Churn']

### 2) Perform encoding on International Plan

In [None]:
df1['International plan'] = LabelEncoder().fit_transform(df1['International plan'])

In [None]:
df1['International plan']

### 3) Perform encoding on voice mail plan using sklearn

In [None]:
df1['Voice mail plan'] = LabelEncoder().fit_transform(df1['Voice mail plan'])

In [None]:
df1['Voice mail plan']

### 4) Check the correlation among all the columns

In [None]:
# Pairwise correlation of the numeric columns. `df1` still contains columns
# that were never encoded (e.g. 'State'), and pandas >= 2.0 raises if
# non-numeric columns are passed to corr(), so they are filtered out first.
df1.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Heatmap of the correlation matrix. Only numeric columns are used: `df1` still
# contains columns that were never encoded (e.g. 'State'), and pandas >= 2.0
# raises if non-numeric columns are passed to corr().
plt.figure(figsize = (12,8))
sns.heatmap(df1.select_dtypes(include=['number', 'bool']).corr(), cmap = "rainbow")

### 5) Create features and target data. Only select features data that are highly correlated with target data.

In [None]:
from sklearn.feature_selection import SelectKBest , chi2

In [None]:
# Candidate features: every column except the state label and the target.
X = df1.drop(['State','Churn'],axis=1)
# The target is selected by name rather than by position (`iloc[:, -1]`), so
# this cell does not depend on 'Churn' being the last column of the frame.
Y = df['Churn']

# Score every candidate feature against the target with the chi-squared test.
Features = SelectKBest(score_func=chi2 , k=10)
Best_feat = Features.fit(X,Y)

dfscores = pd.DataFrame(Best_feat.scores_)
df_X = pd.DataFrame(X.columns)

# One row per feature: its name and its chi-squared score.
best_feat = pd.concat([df_X,dfscores],axis=1)
best_feat.columns = ['Features','Scores']

# Keep the ten highest-scoring features; nlargest already returns a DataFrame.
BF = best_feat.nlargest(10,'Scores')
print(BF)

### 6) Scale the target data (churn)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
STD = StandardScaler()

In [None]:
# Standardise the Churn column of `df1` with the StandardScaler created above,
# then display it. Only `df1` is modified here; the modelling cells below take
# their target from `df['Churn']` instead.
df1['Churn'] = STD.fit_transform(df1[['Churn']])
df1['Churn']

### 7) Check the shape of both training data and testing data

In [None]:
# Feature matrix: the columns of df1 named in BF (the top-scoring features).
X = df1[BF['Features']]
# Target: taken from `df`, not `df1`, because df1['Churn'] was overwritten with
# standard-scaled values in the previous section.
Y = df['Churn']

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2, random_state = 42)

In [None]:
print("X Training size : " , X_train.shape)
print("Y Training size : " , Y_train.shape)
print("X Testing size : " , X_test.shape)
print("Y Testing size : " , Y_test.shape)

In [None]:
# Five highest-scoring features. The earlier `BF1['Churn'] = ...` assignment
# was dropped: it attached index-aligned target values to the feature-score
# table, which had no meaning and was never used.
BF1 = best_feat.nlargest(5,'Scores')

# Feature subsets plus the target. `.assign` returns a new frame, so no column
# is written into a slice of df1 (which triggers SettingWithCopyWarning).
df2 = df1[BF['Features']].assign(Churn=df['Churn'])
df3 = df1[BF1['Features']].assign(Churn=df['Churn'])
df3

In [None]:
sns.pairplot(df3,hue="Churn")

### 8) Apply Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
Logit = LogisticRegression()

In [None]:
Logit.fit(X_train,Y_train)

In [None]:
# Mean accuracy of the fitted logistic regression on both splits
# (label typo "Trainng" corrected).
print("Training score : ",Logit.score(X_train,Y_train))
print("Testing score : ",Logit.score(X_test,Y_test))

### 9) Display confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
Y_pred = Logit.predict(X_test)

In [None]:
confusion = confusion_matrix(Y_test,Y_pred)
confusion

In [None]:
# Annotated heatmap of the confusion matrix (rows: actual, columns: predicted).
plt.figure(figsize=(8,5))
# fmt='d' prints the cell counts as plain integers; the default annotation
# format switches to scientific notation for counts of three or more digits.
sns.heatmap(confusion,annot=True,fmt='d')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')

### 10) Perform Hyper parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
BF1 = pd.DataFrame(best_feat.nlargest(5,'Scores'))
X1 = df1[BF1['Features']]
Y1 = df['Churn'].copy()

In [None]:
from sklearn.model_selection import train_test_split
X1_train,X1_test,Y1_train,Y1_test = train_test_split(X1,Y1,test_size = 0.2, random_state = 42)

In [None]:
X1_train.shape

In [None]:
Y1_train.shape

In [None]:
# Grid Search CV

# Exhaustive 5-fold search over the SVC regularisation strength and kernel.
# `random_state` is intentionally not part of the grid: SVC ignores it unless
# probability=True, so searching over it only tripled the number of fits and
# produced duplicate rows in cv_results_.
grid = GridSearchCV(
    SVC(gamma='auto'),
    {
        'C' : [1,10,20],
        'kernel' : ['rbf','linear'],
    },
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

### 11) Create a model

In [None]:
grid.fit(X1_train,Y1_train)
score = pd.DataFrame(grid.cv_results_)

In [None]:
score

### 12) Check the model score of both training and testing data

In [None]:
score[['params','mean_test_score','mean_train_score']].head()

### 13) Perform cross validation technique with SVM Classifier

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validated accuracy of an SVC for every (kernel, C) combination.
kernels = ['linear','poly']
C = [1,10,20]
Avg_score = {}

for kval in kernels:
    for cval in C:
        cv_score = cross_val_score(SVC(C=cval , kernel=kval , gamma='auto'),X_train,Y_train,cv=5,n_jobs=-1)
        # The key includes the kernel as well as C; keyed on C alone, the
        # 'poly' results silently overwrote the 'linear' ones.
        Avg_score['Kernel : ' + kval + ' , C Value : ' + str(cval)] = np.average(cv_score)

Avg_score

### 14) Perform hyperparameter tuning with different classifier models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Search space for the model comparison: one entry per classifier, holding the
# estimator instance ('Model') and the parameter grid to search ('Params').
model_params = {
    'SVM' : {
        'Model' : SVC(gamma='auto'),
        'Params' : {
            'C' : [1,5,10],
          # 'kernel' : ['rbf','linear']
        }
    },

    'Random Forest' : {
        'Model' : RandomForestClassifier(),
        'Params' : {
            'n_estimators' : [10,20,30,40],
            'random_state' : [0,5,42]
        }
    },

    'Decision Tree' : {
        'Model' : DecisionTreeClassifier(),
        'Params' : {
            'criterion' : ['gini','entropy'],
            'splitter' : ['best','random'],
            'random_state' : [0,5,42]
        }
    },

    'Logistic Regression' : {
        # `multi_class` is no longer passed: the parameter is deprecated in
        # recent scikit-learn releases and 'auto' was already its default.
        'Model' : LogisticRegression(solver='liblinear'),
        'Params' : {
            'C' : [1,5,10]
        }
    }
}

In [None]:
# Run a 5-fold grid search for every entry of model_params on the training
# split and record each model's best cross-validated score and parameters.
scores = []
summary_keys = ('Model', 'Best Score', 'Best Parameters')

for model_name, spec in model_params.items():
    clf = GridSearchCV(
        spec['Model'],
        spec['Params'],
        cv=5,
        n_jobs=-1,
        return_train_score=False,
    )
    clf.fit(X_train, Y_train)

    summary_values = (model_name, clf.best_score_, clf.best_params_)
    scores.append(dict(zip(summary_keys, summary_values)))

In [None]:
HPT = pd.DataFrame(scores)
HPT

### 15) Perform k-means clustering on dataset and divide it into four clusters

In [None]:
from sklearn.cluster import KMeans

In [None]:
df2 = X.copy()
df2['Churn'] = df['Churn'].copy()

In [None]:
# Fit a four-cluster k-means model on df2.
# random_state fixes the centroid initialisation so the clustering is
# reproducible on re-run (42 matches the seed used for the train/test splits);
# n_init is set explicitly because its default differs between scikit-learn
# releases.
KM = KMeans(n_clusters=4, n_init=10, random_state=42)
KM.fit(df2)

In [None]:
# Reuse the labels from the fit performed in the previous cell rather than
# calling fit_predict again: that refitted the model a second time, and on
# re-run it would also have clustered on the 'Cluster' column added below.
Predicted = KM.labels_
df2['Cluster'] = Predicted

In [None]:
cluster_1 = df2[df2.Cluster==0]
cluster_2 = df2[df2.Cluster==1]
cluster_3 = df2[df2.Cluster==2]
cluster_4 = df2[df2.Cluster==3]

In [None]:
# Scatter of the four clusters on two of the clustered columns, with centroids.
# df2's columns follow the chi-squared ranking of the features, so the plotted
# columns are addressed by position and the axis labels are read from the frame
# instead of being hard-coded.
x_pos, y_pos = 0, 6

cluster_colours = (
    (cluster_1, 'red'),
    (cluster_2, 'green'),
    (cluster_3, 'blue'),
    (cluster_4, 'purple'),
)
for members, colour in cluster_colours:
    plt.scatter(members.iloc[:, x_pos], members.iloc[:, y_pos], color=colour)

# Centroids must use the same two coordinates as the points; the y coordinate
# was previously taken from column 1 while the points used column 6.
plt.scatter(KM.cluster_centers_[:, x_pos], KM.cluster_centers_[:, y_pos],
            color='black', marker='D', label='Centroid')
plt.xlabel(df2.columns[x_pos])
plt.ylabel(df2.columns[y_pos])
plt.legend()  # required for the 'Centroid' label to be shown

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Silhouette score of the clustering. The 'Cluster' column holds the labels
# being evaluated, so it is excluded from the data the score is computed on;
# leaving it in feeds the labels back in as a feature and biases the score.
print(silhouette_score(df2.drop(columns=['Cluster']),Predicted))

### 16) Apply PCA with n_components set to 3 and show that only 3 columns remain after applying PCA

In [None]:
# PCA input: a fresh frame holding every df1 column except the state label and
# the target. `drop` returns a new object, so df1 itself is left untouched.
df3 = df1.drop(['State','Churn'],axis=1)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit a standard scaler on df3 and apply it to the same frame.
Scaler = StandardScaler().fit(df3)
Scaled_data = Scaler.transform(df3)

In [None]:
Scaled_data

In [None]:
# Fit a 3-component PCA on the scaled data and project that data onto it.
pca = PCA(n_components=3).fit(Scaled_data)
x_pca = pca.transform(Scaled_data)

In [None]:
x_pca.shape