In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import model_selection
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('telecom.csv')
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7853-GVUDZ,Female,0,Yes,Yes,1,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Credit card (automatic),20.15,20.15,Yes
1,6893-ODYYE,Male,0,No,No,50,Yes,No,Fiber optic,Yes,...,Yes,Yes,No,No,One year,Yes,Credit card (automatic),90.1,4549.45,No
2,0486-HECZI,Male,0,Yes,No,55,Yes,Yes,Fiber optic,Yes,...,No,No,Yes,No,Month-to-month,Yes,Electronic check,96.75,5238.9,Yes
3,8972-HJWNV,Female,1,Yes,No,7,Yes,Yes,Fiber optic,No,...,No,No,No,Yes,Month-to-month,Yes,Electronic check,84.55,646.85,Yes
4,9317-WZPGV,Female,1,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,Yes,Month-to-month,Yes,Bank transfer (automatic),79.75,159.4,Yes


In [17]:
final_test=df.sample(n=1000,random_state=10).reset_index(drop=True)

In [18]:
dt=df.drop(final_test.index).reset_index(drop=True)

## Preprocessing

In [19]:
print("Missing values:", dt.isnull().sum().values.sum())
print("\nUnique values:\n",dt.nunique())
dt.shape[0],df.shape[1]

Missing values: 0

Unique values:
 customerID          5000
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1440
TotalCharges        4716
Churn                  2
dtype: int64


(5000, 21)

In [20]:
def preprocessing(dt):
    
    df = dt.copy()
    df.drop(['customerID', 'gender'], axis =1, inplace=True)
    df['TotalCharges']=df["TotalCharges"].replace(" " ,np.nan)
    df=df.fillna(0)
    df['TotalCharges']=df["TotalCharges"].astype(float)
    
    column_list = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection','TechSupport','StreamingTV', 'StreamingMovies']
    for i in column_list:
        df[i]  = df[i].replace({'No internet service' : 'No'})
    
    df['MultipleLines']= df['MultipleLines'].replace({'No phone service':'No'})
    df['Contract']= df['Contract'].replace({'Month-to-month':1, 'One year':12, 'Two year':24})

    
    column_list = ['Partner', 'Dependents','OnlineSecurity', 'PhoneService', 'OnlineBackup', 'DeviceProtection',
                'TechSupport','StreamingTV', 'StreamingMovies', 'MultipleLines', 'PaperlessBilling']
    for i in column_list:
        df[i]  = df[i].replace({'Yes': 1,'No': 0})
    
    y = df['Churn'].replace({'Yes':1,'No':0})
    df.drop(['Churn'], axis = 1, inplace=True)

    dt = pd.get_dummies(df)
    dt = pd.DataFrame(columns = dt.columns, data = MinMaxScaler().fit(dt).transform(dt))
    
    return dt, y

In [21]:
X_train, y_train=preprocessing(dt)

## Building 5 classification models

In [22]:
def train(model, dt, y, folds=10):
    kf = KFold(n_splits=folds, random_state=1)
    test_score = []
    train_score = []
    precision = []
    recall =[]
    

    for train_index, test_index in kf.split(dt):
        xTrain, xTest = dt.loc[train_index], dt.loc[test_index]
        yTrain, yTest = y[train_index], y[test_index]

        m_clf = model.fit(xTrain, yTrain)
        predict_test = model.predict(xTest)
        predict_train = model.predict(xTrain)
        
        test_score.append(metrics.accuracy_score(yTest, predict_test.round()))
        train_score.append(metrics.accuracy_score(yTrain, predict_train.round()))
        precision.append(metrics.precision_score(yTest, predict_test.round()))
        recall.append(metrics.recall_score(yTest, predict_test.round()))
    
    print("Test Accuracy:",np.mean(test_score))
    print("Train Accuracy:",np.mean(train_score))
    print("Precision:", np.mean(precision))
    print("Recall:", np.mean(recall))

In [23]:
def compare_models(X,y):

    models = []
    models.append(('LR', LogisticRegression()))
    models.append(('KNN', KNeighborsClassifier(n_neighbors=4)))
    models.append(('CART', DecisionTreeClassifier(max_depth=5)))
    models.append(('SVM',svm.SVC()))
    models.append(('RF', RandomForestRegressor(n_estimators=100, max_depth=5)))

    for i, j in models:
        print('\n'+ i+':')
        train(j, X, y)

In [24]:
compare_models(X_train,y_train)


LR:
Test Accuracy: 0.8054
Train Accuracy: 0.8086666666666666
Precision: 0.6543782539049802
Recall: 0.5687217683429007

KNN:
Test Accuracy: 0.7584
Train Accuracy: 0.8353555555555555
Precision: 0.5709893158688898
Recall: 0.36014722934795035

CART:
Test Accuracy: 0.791
Train Accuracy: 0.8082888888888888
Precision: 0.6361167681730627
Recall: 0.5218121186345164

SVM:
Test Accuracy: 0.7974
Train Accuracy: 0.8208444444444444
Precision: 0.6576067896982721
Recall: 0.49588131949948056

RF:
Test Accuracy: 0.8013999999999999
Train Accuracy: 0.8148444444444445
Precision: 0.6630068255613685
Recall: 0.5186384083953602


In [25]:
X_test, y_test = preprocessing(final_test)

In [26]:
def test(dt, y):
    models = []
    stats=['Test Accuracy','Train Accuracy', 'Precision', 'Recall']
    names=['LR','KNN', 'CART', 'SVM', 'RF']
    data=[]
    models.append(LogisticRegression())
    models.append(KNeighborsClassifier(n_neighbors=4))
    models.append(DecisionTreeClassifier(max_depth=5))
    models.append(svm.SVC(kernel='linear'))
    models.append(RandomForestRegressor(n_estimators=100, max_depth=5))
    
    for model in models:
        
        m_clf = model.fit(X_train, y_train)
        predict_test = model.predict(dt)
        predict_train =model.predict(X_train)
        
        data.append(metrics.accuracy_score(y, predict_test.round()))    
        data.append(metrics.accuracy_score(y_train, predict_train.round()))
        data.append(metrics.precision_score(y, predict_test.round()))
        data.append(metrics.recall_score(y, predict_test.round()))
        

    data=np.array(data).reshape(5,4)    
    return pd.DataFrame(data, index = names, columns = stats)

In [27]:
test(X_test,y_test)

Unnamed: 0,Test Accuracy,Train Accuracy,Precision,Recall
LR,0.81,0.8078,0.666667,0.571429
KNN,0.836,0.835,0.822785,0.488722
CART,0.812,0.8068,0.654762,0.620301
SVM,0.811,0.8056,0.675799,0.556391
RF,0.815,0.8118,0.703518,0.526316


For final model I am going to choose Random forest. Although KNN had higher accuracy score on my test data, I feel like RF would be more consistent for any sample of data as per cross validation results.

## Final Test

In [None]:
file = input('Input your csv file here: ')
test_dt = pd.read_csv(file)
test_dt = test_dt[set(test_dt.columns).intersection(df.columns)]

In [None]:
xTest,yTest = preprocessing(test_dt)
xTrain,yTrain = preprocessing(df)

final_model = RandomForestRegressor(n_estimators=100, max_depth=5).fit(xTrain,yTrain)
predict_test = final_model.predict(xTest)
predict_train = final_model.predict(xTrain)

print('Test Accuracy:', metrics.accuracy_score(yTest, predict_test.round()))
print('Train Accuracy:', metrics.accuracy_score(yTrain, predict_train.round()))
print('Precision', metrics.precision_score(yTest, predict_test.round()))
print('Recall:', metrics.recall_score(yTest, predict_test.round()))