In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the raw churn data; row 0 of the CSV supplies the column names.
df = pd.read_csv(
    'ChurnData.csv',
    header=0,
)

In [None]:
# df.dtypes

In [None]:
# #Listing all categorical variables: 
# for variable in df: 
#     if df[variable].dtype == 'O':
#         print(variable)

In [3]:
#Creating function that names the dummy variables as I want
def dummyWithNames(variable, prefix_name, data=None):
    """Dummy-encode one column and name the result with a chosen prefix.

    Parameters
    ----------
    variable : str
        Name of the column to encode.
    prefix_name : str
        Prefix passed to ``pd.get_dummies`` for the generated column names.
    data : pandas.DataFrame, optional
        Frame that holds ``variable``. When omitted, the notebook-level ``df``
        is used, which is what the original two-argument calls relied on.

    Returns
    -------
    pandas.DataFrame
        Indicator columns for ``variable``; the first level is dropped
        (``drop_first=True``).
    """
    if data is None:
        # Backward-compatible default: fall back to the global churn frame.
        data = df
    return pd.get_dummies(data[variable], drop_first=True, prefix=prefix_name)

In [4]:
# Dummy-encode every object-dtype column of the churn data, collecting one
# frame of indicator columns per source column (prefix = the column's name).
dummies = [
    dummyWithNames(column, str(column))
    for column in df
    if df[column].dtype == 'O'
]

In [5]:
# Join the per-column indicator frames into one wide frame.
# `dummies` is rebound from a list of frames to a single DataFrame here, so a
# second run of this cell would pass a DataFrame to pd.concat instead of a
# list of frames; the guard makes the cell safe to re-run.
if not isinstance(dummies, pd.DataFrame):
    dummies = pd.concat(dummies, axis = 'columns')

In [6]:
# Place the indicator columns to the left of the original columns.
merged = pd.concat(
    [dummies, df],
    axis=1,
)

In [7]:
# The raw categorical columns were dummy-encoded above, so drop the originals.
encoded_source_columns = [
    'churn',
    'voice_mail_plan',
    'state',
    'area_code',
    'international_plan',
]
merged = merged.drop(columns=encoded_source_columns)
merged.shape

(5000, 70)

In [64]:
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed; 'churn_yes' is the target and every other
# column of `merged` is a feature.
features = merged.drop(columns=['churn_yes'])
target = merged.churn_yes
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=10,
)

In [65]:
# Plain NumPy views of the training features and target.
X, y = X_train.values, y_train.values

## PCA with Dummy Variables

In [66]:
# Standardize the training features: fit a StandardScaler on the training
# matrix only (the recorded repr shows with_mean=True, with_std=True).
from sklearn.preprocessing import StandardScaler # per-column standardization
scaler = StandardScaler()
scaler.fit(X)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [67]:
# Apply the fitted scaler to the training matrix; the bare name on the last
# line displays the resulting array as the cell output.
scaled_data = scaler.transform(X)
scaled_data

array([[-0.15294382, -0.13970132, -0.12635334, ..., -0.18480092,
         0.89971603, -0.41649683],
       [-0.15294382, -0.13970132, -0.12635334, ...,  0.2197039 ,
         1.73008302, -1.18292108],
       [-0.15294382, -0.13970132, -0.12635334, ..., -0.18480092,
         2.26580365,  0.34992741],
       ...,
       [-0.15294382, -0.13970132, -0.12635334, ...,  1.43321837,
        -0.69405287, -1.18292108],
       [ 6.53834842, -0.13970132, -0.12635334, ...,  1.43321837,
        -1.0556643 , -0.41649683],
       [-0.15294382, -0.13970132, -0.12635334, ..., -0.18480092,
         0.06934904, -1.18292108]])

In [52]:
# Reminder of the (rows, columns) shape of the scaled training matrix
scaled_data.shape

(3500, 69)

In [68]:
from sklearn.decomposition import PCA

# Project the standardized features onto 55 principal components and report
# how much variance they retain.
# NOTE: `X` is rebound here from the raw training matrix to the PCA scores.
pca = PCA(n_components=55)
X = pca.fit_transform(scaled_data)
variance_ratios = pca.explained_variance_ratio_
print('Total Explained Variance: ', variance_ratios.sum())
print('Variance per PC: ', variance_ratios)

Total Explained Variance:  0.9029107050220371
Variance per PC:  [0.03048895 0.02979029 0.02963843 0.02900095 0.02782798 0.02322009
 0.0171704  0.01666466 0.01632486 0.01624009 0.01622069 0.01583487
 0.01553846 0.0149656  0.0149235  0.01484958 0.01484257 0.01483882
 0.01483408 0.01482455 0.01482224 0.01481843 0.01481317 0.01480942
 0.0148052  0.01480195 0.01479708 0.01479479 0.01479019 0.01478638
 0.01478082 0.01477911 0.01477609 0.0147714  0.01476967 0.01476798
 0.01476355 0.01476317 0.01476003 0.01475624 0.01475174 0.01474849
 0.01474596 0.01474095 0.01473584 0.01473216 0.01472792 0.01471311
 0.01467813 0.01463212 0.01456407 0.01448807 0.0144251  0.0143181
 0.01414261]


In [69]:
# Wrap the PCA scores in a DataFrame with one column per component, numbered
# from 1. No labels are attached here; the target stays in y_train.
# The column count is read from X so the labels cannot drift out of sync with
# the n_components chosen in the PCA cell.
n_components_kept = X.shape[1]
pc_df = pd.DataFrame(data = X,
        columns = range(1, n_components_kept + 1))
# Keep a dedicated name for this feature set: `pc_df` is reused further down.
dummy_pca_data_X = pc_df

## PCA Without Dummy Variables

In [None]:
# Inspect the column labels of the merged (indicator + original numeric) frame.
merged.columns

In [70]:
# Columns whose dtype is exactly float64 or int64; these are the ones that go
# through scaling + PCA in this section.
keep_columns = [
    name
    for name in merged
    if merged[name].dtype in ('float64', 'int64')
]

In [33]:
# (rows, columns) of the merged frame, for reference.
merged.shape

(5000, 70)

In [71]:
# Training rows restricted to the float64/int64 columns, as a NumPy array.
no_dummies = X_train.loc[:, keep_columns].values

In [72]:
# Standardize the float64/int64 training columns only.
# NOTE: this rebinds `scaler`; the scaler fitted in the previous section is no
# longer reachable under that name.
from sklearn.preprocessing import StandardScaler # per-column standardization
scaler = StandardScaler()
scaler.fit(no_dummies)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [73]:
# Apply the scaler fitted on `no_dummies`; this rebinds `scaled_data`, replacing
# the array produced in the previous section.
scaled_data = scaler.transform(no_dummies)

In [74]:
from sklearn.decomposition import PCA

# Reduce the standardized float64/int64 columns to 10 principal components and
# report the retained variance.
# NOTE: `pca` and `X` are rebound here, replacing the objects from the previous section.
pca = PCA(n_components=10)
X = pca.fit_transform(scaled_data)
variance_ratios = pca.explained_variance_ratio_
print('Total Explained Variance: ', variance_ratios.sum())
print('Variance per PC: ', variance_ratios)

Total Explained Variance:  0.9359726222999394
Variance per PC:  [0.13664569 0.13557782 0.13341811 0.12841654 0.07039171 0.06859363
 0.06722037 0.0660819  0.06550855 0.06411831]


In [76]:
# Wrap the PCA scores in a DataFrame with one column per component, numbered
# from 1. No labels are attached here; the target stays in y_train.
# The column range is derived from X so it always matches the number of
# components produced by the PCA cell.
pc_df = pd.DataFrame(data = X,
        columns = range(1, X.shape[1] + 1))
pc_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,1.117264,0.860249,-3.267951,-1.514983,-0.848635,-0.662204,-0.184492,-0.354839,-0.288507,-0.057933
1,-0.534473,-1.771828,0.220257,-2.372565,-0.104382,1.877681,0.018228,-0.185421,-0.989952,-0.383239
2,-0.111025,-2.534659,-2.640587,-0.84614,0.974067,-0.18106,-0.695494,0.463899,-0.011414,0.5563
3,0.002757,0.249261,-1.242818,0.047066,-0.926719,-0.521447,0.18817,-1.870896,-0.398013,0.059174
4,1.837456,1.002739,-1.367623,2.167416,-0.471698,2.252486,-0.057525,-0.656867,-1.621744,0.047067


In [77]:
# Feature columns that did not go through PCA (everything in X_train that is
# not in keep_columns); they are appended unchanged to the PC scores next.
add_columns = [name for name in X_train if name not in keep_columns]
len(add_columns)

54

In [79]:
# Final "PCA then dummies" feature matrix: the untouched columns first, then
# the principal-component scores, joined column-wise.
add_X = X_train[add_columns].values
add_pc = pc_df.values
nodummies_pca_data_X = np.hstack((add_X, add_pc))

# Models

### Logistic Regression

In [44]:
from sklearn.model_selection import GridSearchCV

In [81]:
# Logistic regression on the "dummies then PCA" features
# (PC scores of the full dummy-encoded training matrix).
lr = LogisticRegression(solver = 'lbfgs')
lr.fit(dummy_pca_data_X, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [82]:
# NOTE: scored on the same rows the model was fitted on, so this is training
# accuracy; X_test / y_test are not used here.
lr.score(dummy_pca_data_X, y_train)

0.8765714285714286

In [84]:
# Logistic regression on the "PCA then dummies" features (untouched columns
# stacked with 10 PC scores). This rebinds `lr`, replacing the previous model.
lr = LogisticRegression(solver = 'lbfgs')
lr.fit(nodummies_pca_data_X, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [85]:
# Training accuracy again (scored on the fitting rows, not on X_test / y_test).
lr.score(nodummies_pca_data_X, y_train) # nearly the same score as the dummies-then-PCA model

0.8757142857142857

### Random Forest

In [94]:
# Hyper-parameter grid for the random forest (per the original note, chosen
# from an earlier random search that is not part of this notebook).
param_grid = {
    'max_depth': [80, 90, 100, 110],
    # 'sqrt' is what 'auto' meant for RandomForestClassifier; 'auto' is
    # deprecated and later removed in newer scikit-learn releases.
    'max_features': ['sqrt'],
    # Further dimensions left disabled by the author:
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Seed the forest so the grid-search results are reproducible between runs
# (same seed as the train/test split).
rf = RandomForestClassifier(random_state=10)

In [95]:
# Exhaustive search over param_grid with 10-fold cross-validation, ranked by
# accuracy; training-fold scores are not recorded.
grid = GridSearchCV(
    rf,
    param_grid,
    cv=10,
    scoring='accuracy',
    return_train_score=False,
)

In [None]:
# Grid search on the "dummies then PCA" features.
# Expensive: 16 parameter combinations x 10 folds = 160 forest fits, with up to
# 1000 trees each.
grid.fit(dummy_pca_data_X, y_train)

In [None]:
# Save the fitted grid search to disk, then report its score on the training data.
import pickle
filename = 'Churn_RF1.sav'
# Context manager so the file handle is flushed and closed even if dump fails.
with open(filename, 'wb') as model_file:
    pickle.dump(grid, model_file)
# NOTE: scored on the rows used for fitting, so this is training accuracy.
grid.score(dummy_pca_data_X, y_train)

In [None]:
# Per-combination cross-validation summary: mean and std of the test-fold
# score alongside the parameter settings.
pd.DataFrame(grid.cv_results_)[['mean_test_score', 'std_test_score', 'params']]

In [None]:
# Grid search on the "PCA then dummies" features.
# NOTE: this re-fits the same `grid` object, so the results of the previous
# search are no longer available from it after this cell runs.
grid.fit(nodummies_pca_data_X, y_train)

In [None]:
# Save the second fitted grid search to disk, then report its score on the training data.
import pickle
filename = 'Churn_RF2.sav'
# Context manager so the file handle is flushed and closed even if dump fails.
with open(filename, 'wb') as model_file:
    pickle.dump(grid, model_file)
# NOTE: scored on the rows used for fitting, so this is training accuracy.
grid.score(nodummies_pca_data_X, y_train)