# Lab | Cross Validation

In [1]:
import pymysql
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import getpass
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder, Normalizer, LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn import tree

In [2]:
data = pd.read_csv("files_for_lab/Customer-Churn.csv")

In [3]:
data.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes


In [4]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

In [5]:
def checking_nulls(df):
    # This function shows which columns have null values and returns a df with only nulls
    for c in df.columns:
        null_count = df[c].isnull().sum()
        if null_count > 0:
            print ("The column ", c, " has ", null_count, " null values")
    nulls = df[df.isna().any(axis=1)]
    return nulls.head(3)

checking_nulls(data)

The column  TotalCharges  has  11  null values


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
488,Female,0,Yes,Yes,0,No,Yes,No,Yes,Yes,Yes,No,Two year,52.55,,No
753,Male,0,No,Yes,0,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,20.25,,No
936,Female,0,Yes,Yes,0,Yes,Yes,Yes,Yes,No,Yes,Yes,Two year,80.85,,No


In [6]:
def replace_nulls_mean(df):
    # This function replaces null values with the mean of the column.
    for c in df.columns:
        null_count = df[c].isnull().sum()
        if null_count > 0:
            df[c].fillna((df[c].mean()), inplace=True)
replace_nulls_mean(data)

In [7]:
X = pd.DataFrame(data[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']])
y = pd.DataFrame(data["Churn"])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [9]:
transformer = Normalizer().fit(X_train)
transformer
X_train_norm = transformer.transform(X_train)
X_train_norm = pd.DataFrame(X_train, columns=data.columns, index=data.index)

## 1. Apply SMOTE for upsampling the data

In [10]:
smote = SMOTE()

X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

### Use logistic regression to fit the model and compute the accuracy of the model.

In [11]:
logreg = LogisticRegression()
logreg.fit(X_train_sm, y_train_sm)

  return f(*args, **kwargs)


LogisticRegression()

In [12]:
logreg.fit(X_train_sm, y_train_sm)
logreg.score(X_test, y_test)

  return f(*args, **kwargs)


0.7126632595116411

### Use decision tree classifier to fit the model and compute the accuracy of the model.

In [13]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
score_tree = clf.score(X_test, y_test)
    
score_tree

0.7064168086314594

## 2. Apply TomekLinks for downsampling

### It is important to remember that it does not make the two classes equal but only removes the points from the majority class that are close to other points in minority class.

In [14]:
X_train_tl = X_train.copy()
y_train_tl = y_train.copy()

tl = TomekLinks('majority')
X_train_tl, y_train_tl = tl.fit_resample(X_train_tl, y_train_tl)



### Use logistic regression to fit the model and compute the accuracy of the model.

In [15]:
logreg.fit(X_train_tl, y_train_tl)
logreg.score(X_test, y_test)

  return f(*args, **kwargs)


0.7597955706984668

### Use decision tree classifier to fit the model and compute the accuracy of the model.

In [16]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_tl, y_train_tl)
clf.score(X_test, y_test)

0.7103918228279387

### You can also apply this algorithm one more time and check the how the imbalance in the two classes changed from the last time.