In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix

In [2]:
ChurnData = pd.read_csv('.\\files_for_lab\\Customer-Churn.csv')
ChurnData.head(40)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes
5,Female,0,No,No,8,Yes,No,No,Yes,No,Yes,Yes,Month-to-month,99.65,820.5,Yes
6,Male,0,No,Yes,22,Yes,No,Yes,No,No,Yes,No,Month-to-month,89.1,1949.4,No
7,Female,0,No,No,10,No,Yes,No,No,No,No,No,Month-to-month,29.75,301.9,No
8,Female,0,Yes,No,28,Yes,No,No,Yes,Yes,Yes,Yes,Month-to-month,104.8,3046.05,Yes
9,Male,0,No,Yes,62,Yes,Yes,Yes,No,No,No,No,One year,56.15,3487.95,No


In [3]:
ChurnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [4]:
ChurnData.shape

(7043, 16)

In [5]:
ChurnData

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [6]:
ChurnData['TotalCharges']

0         29.85
1        1889.5
2        108.15
3       1840.75
4        151.65
         ...   
7038     1990.5
7039     7362.9
7040     346.45
7041      306.6
7042     6844.5
Name: TotalCharges, Length: 7043, dtype: object

In [7]:
ChurnData['TotalCharges']= ChurnData['TotalCharges'].str.replace(' ', '0')
#ChurnData['TotalCharges'].fillna(0, inplace = True)

In [8]:
ChurnData["TotalCharges"] = pd.to_numeric(ChurnData["TotalCharges"])

In [9]:
ChurnData['TotalCharges'] = pd.to_numeric(ChurnData['TotalCharges'], errors = 'coerce')

In [10]:
ChurnData.isna().sum()/len(ChurnData)*100

gender              0.0
SeniorCitizen       0.0
Partner             0.0
Dependents          0.0
tenure              0.0
PhoneService        0.0
OnlineSecurity      0.0
OnlineBackup        0.0
DeviceProtection    0.0
TechSupport         0.0
StreamingTV         0.0
StreamingMovies     0.0
Contract            0.0
MonthlyCharges      0.0
TotalCharges        0.0
Churn               0.0
dtype: float64

###### Scale the features either by using normalizer or a standard scaler.

In [12]:
churn_num = ChurnData.select_dtypes(include = np.number)
churn_cat = ChurnData.select_dtypes(include = object)

In [13]:
scaler = MinMaxScaler()
numerical_scaled = scaler.fit_transform(churn_num)
numerical_scaled = pd.DataFrame(numerical_scaled, columns = churn_num.columns)
numerical_scaled

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0.0,0.013889,0.115423,0.003437
1,0.0,0.472222,0.385075,0.217564
2,0.0,0.027778,0.354229,0.012453
3,0.0,0.625000,0.239303,0.211951
4,0.0,0.027778,0.521891,0.017462
...,...,...,...,...
7038,0.0,0.333333,0.662189,0.229194
7039,0.0,1.000000,0.845274,0.847792
7040,0.0,0.152778,0.112935,0.039892
7041,1.0,0.055556,0.558706,0.035303


###### Split the data into a training set and a test set.

In [15]:
X = numerical_scaled
y = ChurnData['Churn']
X_train, X_test, y_train, y_test = train_test_split(X,y)

###### Fit a logistic regression model on the training data.

In [16]:
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train, y_train)

LogisticRegression(random_state=0)

###### Check the accuracy on the test data.

In [17]:
print('accuracy:', LR.score(X_train, y_train))
og_accuracy= LR.score(X_train, y_train)

accuracy: 0.7926921620598258


In [18]:
y_pred = LR.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[1180,  111],
       [ 264,  206]], dtype=int64)

###### Check for the imbalance.

In [20]:
ChurnData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

###### Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.

In [21]:
churn = pd.concat([X, y], axis = 1)
churn

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0.0,0.013889,0.115423,0.003437,No
1,0.0,0.472222,0.385075,0.217564,No
2,0.0,0.027778,0.354229,0.012453,Yes
3,0.0,0.625000,0.239303,0.211951,No
4,0.0,0.027778,0.521891,0.017462,Yes
...,...,...,...,...,...
7038,0.0,0.333333,0.662189,0.229194,No
7039,0.0,1.000000,0.845274,0.847792,No
7040,0.0,0.152778,0.112935,0.039892,No
7041,1.0,0.055556,0.558706,0.035303,Yes


In [22]:
category_0 = churn[churn['Churn'] == 'No']
category_1 = churn[churn['Churn'] == 'Yes']

In [23]:
category_0_undersampled = resample(category_0, 
                              replace=False, 
                              n_samples = len(category_1))

In [24]:
print(category_0_undersampled.shape)
print(category_1.shape)

(1869, 5)
(1869, 5)


###### Downsampling

In [25]:
data_downsampled = pd.concat([category_0_undersampled, category_1], axis=0)

In [26]:
data_downsampled['Churn'].value_counts()

No     1869
Yes    1869
Name: Churn, dtype: int64

In [27]:
X = data_downsampled.drop('Churn', axis = 1)
y = data_downsampled['Churn']
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [28]:
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [29]:
print('downsampled accuracy:', LR.score(X_train, y_train))
d_accuracy = LR.score(X_train, y_train)

downsampled accuracy: 0.7313592579379237


In [30]:
y_pred = LR.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[354, 124],
       [100, 357]], dtype=int64)

In [31]:
category_1_oversampled = resample(category_1, 
                                  replace=True, 
                                  n_samples = len(category_0))

In [32]:
print(category_0.shape)
print(category_1_oversampled.shape)

(5174, 5)
(5174, 5)


##### upsampling

In [33]:
data_upsampled = pd.concat([category_0, category_1_oversampled], axis=0)

In [34]:
data_upsampled['Churn'].value_counts()

No     5174
Yes    5174
Name: Churn, dtype: int64

In [35]:
X = data_upsampled.drop('Churn', axis = 1)
y = data_upsampled['Churn']
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [36]:
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [37]:
print('upsampled accuracy:', LR.score(X_train, y_train))
u_accuracy = LR.score(X_train, y_train)

upsampled accuracy: 0.7295451617059657


In [38]:
y_pred = LR.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[944, 366],
       [320, 957]], dtype=int64)

In [39]:
model = ['original', 'uppsampled', 'downsampled']
accuracy = [og_accuracy, u_accuracy, d_accuracy]
accuracy = pd.DataFrame([accuracy], columns = model).T
accuracy.columns = ['accuracy score']
accuracy

Unnamed: 0,accuracy score
original,0.792692
uppsampled,0.729545
downsampled,0.731359
