# EDA and Initial model building

Loading in libraries

In [72]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, train_test_split
from sklearn.linear_model import Lasso, Ridge, LinearRegression, LogisticRegression
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.metrics import mean_squared_error, accuracy_score, recall_score, precision_score, f1_score
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE

Loading in data in a pandas dataframe

In [73]:
df = pd.read_csv('../data/bigml_59c28831336c6604c800002a.csv')

Classes

In [74]:
 class ModelWithCV(): #class acquired through Flatiron School
    '''Structure to save the model and more easily see its crossvalidation'''
    
    def __init__(self, model, model_name, X, y, cv_now=True):
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()
        
    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and return results.
        
        Args: 
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)  
        '''
        
        cv_X = X if X else self.X
        cv_y = y if y else self.y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)

        
    def print_cv_summary(self):
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)

        
    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given 
        Axis for plotting.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

Looking over in the Original dataframe before I alter

In [75]:
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [76]:
df.describe()

Unnamed: 0,account length,area code,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,101.064806,437.182418,8.09901,179.775098,100.435644,30.562307,200.980348,100.114311,17.08354,200.872037,100.107711,9.039325,10.237294,4.479448,2.764581,1.562856
std,39.822106,42.37129,13.688365,54.467389,20.069084,9.259435,50.713844,19.922625,4.310668,50.573847,19.568609,2.275873,2.79184,2.461214,0.753773,1.315491
min,1.0,408.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.2,33.0,1.04,0.0,0.0,0.0,0.0
25%,74.0,408.0,0.0,143.7,87.0,24.43,166.6,87.0,14.16,167.0,87.0,7.52,8.5,3.0,2.3,1.0
50%,101.0,415.0,0.0,179.4,101.0,30.5,201.4,100.0,17.12,201.2,100.0,9.05,10.3,4.0,2.78,1.0
75%,127.0,510.0,20.0,216.4,114.0,36.79,235.3,114.0,20.0,235.3,113.0,10.59,12.1,6.0,3.27,2.0
max,243.0,510.0,51.0,350.8,165.0,59.64,363.7,170.0,30.91,395.0,175.0,17.77,20.0,20.0,5.4,9.0


In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account length          3333 non-null   int64  
 2   area code               3333 non-null   int64  
 3   phone number            3333 non-null   object 
 4   international plan      3333 non-null   object 
 5   voice mail plan         3333 non-null   object 
 6   number vmail messages   3333 non-null   int64  
 7   total day minutes       3333 non-null   float64
 8   total day calls         3333 non-null   int64  
 9   total day charge        3333 non-null   float64
 10  total eve minutes       3333 non-null   float64
 11  total eve calls         3333 non-null   int64  
 12  total eve charge        3333 non-null   float64
 13  total night minutes     3333 non-null   float64
 14  total night calls       3333 non-null   

In [78]:
df.columns

Index(['state', 'account length', 'area code', 'phone number',
       'international plan', 'voice mail plan', 'number vmail messages',
       'total day minutes', 'total day calls', 'total day charge',
       'total eve minutes', 'total eve calls', 'total eve charge',
       'total night minutes', 'total night calls', 'total night charge',
       'total intl minutes', 'total intl calls', 'total intl charge',
       'customer service calls', 'churn'],
      dtype='object')

In [79]:
columns = list(df.columns)
[{(x, (len(df[x].value_counts()))): [df[x].value_counts()]} for x in columns]  
#Able to view the value_counts in each column

[{('state',
   51): [WV    106
   MN     84
   NY     83
   AL     80
   OR     78
   WI     78
   OH     78
   VA     77
   WY     77
   CT     74
   MI     73
   ID     73
   VT     73
   TX     72
   UT     72
   IN     71
   MD     70
   KS     70
   NJ     68
   NC     68
   MT     68
   CO     66
   WA     66
   NV     66
   MA     65
   RI     65
   MS     65
   AZ     64
   MO     63
   FL     63
   ME     62
   ND     62
   NM     62
   DE     61
   NE     61
   OK     61
   SC     60
   SD     60
   KY     59
   IL     58
   NH     56
   AR     55
   GA     54
   DC     54
   TN     53
   HI     53
   AK     52
   LA     51
   PA     45
   IA     44
   CA     34
   Name: state, dtype: int64]},
 {('account length',
   212): [105    43
   87     42
   93     40
   101    40
   90     39
          ..
   191     1
   199     1
   215     1
   221     1
   2       1
   Name: account length, Length: 212, dtype: int64]},
 {('area code',
   3): [415    1655
   510     840
   408     

Thoughts after a brief lookover in `.head()` , `.describe()` , and `.info()`
-  state and area code need to be one hot encoded
-  international plan and voice mail plan need to be converted from yes/no to binary 1/0
-  churn needs to be converted from True/False to 1/0???
-  don't need phone number column for making models.  Will `.drop()` because it will not provide helpful information for the model
-  All other columns will probably use in making model
-  Will need to scale depending on which estimators used in model building
-  Nothing seems too extreme or out of the normal in the `.describe()` so probably wont drop any outliers
-  No null values seen in `.info()`
    -Will do further exploration looking for empty ( ' ' ) cells or other variations of missing values
-  Will still do further exploration on other columns    

### Looking at the categorical columns that will need to be One Hot Encoded

In [80]:
len(df['state'].value_counts())

51

51 state???? Will need to make sure one is not repeated or typo.  If so, will need to combine with the correct state.

In [81]:
df['state'].value_counts().sort_index() 

AK     52
AL     80
AR     55
AZ     64
CA     34
CO     66
CT     74
DC     54
DE     61
FL     63
GA     54
HI     53
IA     44
ID     73
IL     58
IN     71
KS     70
KY     59
LA     51
MA     65
MD     70
ME     62
MI     73
MN     84
MO     63
MS     65
MT     68
NC     68
ND     62
NE     61
NH     56
NJ     68
NM     62
NV     66
NY     83
OH     78
OK     61
OR     78
PA     45
RI     65
SC     60
SD     60
TN     53
TX     72
UT     72
VA     77
VT     73
WA     66
WI     78
WV    106
WY     77
Name: state, dtype: int64

51 because including DC??? I believe they are refering to the District of Columbia.  This is fine.  I will leave this column alone until One hot encoding 

In [82]:
df['area code'].value_counts(normalize=False), df['area code'].value_counts(normalize=True),

(415    1655
 510     840
 408     838
 Name: area code, dtype: int64,
 415    0.496550
 510    0.252025
 408    0.251425
 Name: area code, dtype: float64)

Looks fine to leave this column alone until one hot encoding

.

.

.

### Looking at the Binary columns that will need to be converted to 1s and 0s

In [83]:
df.churn.value_counts(), df.churn.value_counts(normalize=True) #confirming binary before converting

(False    2850
 True      483
 Name: churn, dtype: int64,
 False    0.855086
 True     0.144914
 Name: churn, dtype: float64)

In [84]:
df.loc[df['churn'] == True, 'churn'] = 1  #converting True to 1
df.loc[df['churn'] == False, 'churn'] = 0  #converting False to 0
df['churn'].value_counts()

0    2850
1     483
Name: churn, dtype: int64

In [85]:
df['international plan'].value_counts(), df['international plan'].value_counts(normalize=True) 
#confirming binary before converting

(no     3010
 yes     323
 Name: international plan, dtype: int64,
 no     0.90309
 yes    0.09691
 Name: international plan, dtype: float64)

In [86]:
df['international plan'] = df['international plan'].str.replace('yes','1') #replacing yes with 1
df['international plan'] = df['international plan'].str.replace('no','0') #replacing no with 0
df['international plan'].value_counts()

0    3010
1     323
Name: international plan, dtype: int64

In [87]:
df['voice mail plan'].value_counts(), df['voice mail plan'].value_counts(normalize=True) #confirming binary before converting

(no     2411
 yes     922
 Name: voice mail plan, dtype: int64,
 no     0.723372
 yes    0.276628
 Name: voice mail plan, dtype: float64)

In [88]:
df['voice mail plan'] = df['voice mail plan'].str.replace('yes','1') #replacing yes with 1
df['voice mail plan'] = df['voice mail plan'].str.replace('no','0') #replacing no with 0
df['voice mail plan'].value_counts()

0    2411
1     922
Name: voice mail plan, dtype: int64

In [89]:
df.head()  # looking at dataframe to make sure changes were made

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,0,1,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,OH,107,415,371-7191,0,1,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
2,NJ,137,415,358-1921,0,0,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
3,OH,84,408,375-9999,1,0,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,OK,75,415,330-6626,1,0,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


.

.

.

In [90]:
df.columns

Index(['state', 'account length', 'area code', 'phone number',
       'international plan', 'voice mail plan', 'number vmail messages',
       'total day minutes', 'total day calls', 'total day charge',
       'total eve minutes', 'total eve calls', 'total eve charge',
       'total night minutes', 'total night calls', 'total night charge',
       'total intl minutes', 'total intl calls', 'total intl charge',
       'customer service calls', 'churn'],
      dtype='object')

In [91]:
df.drop(columns='phone number', inplace=True) 
#removing the column 'phone number' because I don't think an ID equivilant information will be useful for model

Checking to make sure 'phone number' is removed

In [22]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account length          3333 non-null   int64  
 2   area code               3333 non-null   int64  
 3   international plan      3333 non-null   object 
 4   voice mail plan         3333 non-null   object 
 5   number vmail messages   3333 non-null   int64  
 6   total day minutes       3333 non-null   float64
 7   total day calls         3333 non-null   int64  
 8   total day charge        3333 non-null   float64
 9   total eve minutes       3333 non-null   float64
 10  total eve calls         3333 non-null   int64  
 11  total eve charge        3333 non-null   float64
 12  total night minutes     3333 non-null   float64
 13  total night calls       3333 non-null   int64  
 14  total night charge      3333 non-null   

.

.

.

### Checking for other variations of missing values in dataframe 

In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account length          3333 non-null   int64  
 2   area code               3333 non-null   int64  
 3   international plan      3333 non-null   object 
 4   voice mail plan         3333 non-null   object 
 5   number vmail messages   3333 non-null   int64  
 6   total day minutes       3333 non-null   float64
 7   total day calls         3333 non-null   int64  
 8   total day charge        3333 non-null   float64
 9   total eve minutes       3333 non-null   float64
 10  total eve calls         3333 non-null   int64  
 11  total eve charge        3333 non-null   float64
 12  total night minutes     3333 non-null   float64
 13  total night calls       3333 non-null   int64  
 14  total night charge      3333 non-null   

In [93]:
np.where(df.applymap(lambda x: x == ' '))

(array([], dtype=int64), array([], dtype=int64))

In [96]:
np.where(df.applymap(lambda x: x == 'NA'))

(array([], dtype=int64), array([], dtype=int64))

In [97]:
np.where(df.applymap(lambda x: x == 'nan'))

(array([], dtype=int64), array([], dtype=int64))

In [98]:
np.where(df.applymap(lambda x: x == 'NaN'))

(array([], dtype=int64), array([], dtype=int64))

In [94]:
df = df.replace(' ', np.nan)  

In [95]:
df.isna().sum()

state                     0
account length            0
area code                 0
international plan        0
voice mail plan           0
number vmail messages     0
total day minutes         0
total day calls           0
total day charge          0
total eve minutes         0
total eve calls           0
total eve charge          0
total night minutes       0
total night calls         0
total night charge        0
total intl minutes        0
total intl calls          0
total intl charge         0
customer service calls    0
churn                     0
dtype: int64

After further examination, there still appears to be no missing values in this dataset.

.

.

.

### One Hot Encoding

In [100]:
df[['state','area code']] #Looking at the Categorical columns

Unnamed: 0,state,area code
0,KS,415
1,OH,415
2,NJ,415
3,OH,408
4,OK,415
...,...,...
3328,AZ,415
3329,WV,415
3330,RI,510
3331,CT,510


In [106]:
cat_cols = df[['state','area code']]  #subsetting columns that will be transformed

ohe = OneHotEncoder(
    drop='first',
    sparse=False)  #instantiate encoder

dums = ohe.fit_transform(cat_cols)
dums_df = pd.DataFrame(dums,
                       columns=ohe.get_feature_names(),
                       index=cat_cols.index)
df_clean = pd.concat([df, dums_df], axis=1)  #concatinating created OHE dataframe with old datafra
df_clean.drop(columns=cat_cols, inplace=True) #remooving orignal columns after beinhg OHE
df_clean #looking at dataframe if it performed correctly 



Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,...,x0_TX,x0_UT,x0_VA,x0_VT,x0_WA,x0_WI,x0_WV,x0_WY,x1_415,x1_510
0,128,0,1,25,265.1,110,45.07,197.4,99,16.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,107,0,1,26,161.6,123,27.47,195.5,103,16.62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,137,0,0,0,243.4,114,41.38,121.2,110,10.30,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,84,1,0,0,299.4,71,50.90,61.9,88,5.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,75,1,0,0,166.7,113,28.34,148.3,122,12.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,192,0,1,36,156.2,77,26.55,215.5,126,18.32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3329,68,0,0,0,231.1,57,39.29,153.4,55,13.04,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3330,28,0,0,0,180.8,109,30.74,288.8,58,24.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3331,184,1,0,0,213.8,105,36.35,159.6,84,13.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [30]:
# X_train_cat = X_train.select_dtypes('object')

# ohe = OneHotEncoder(
#     drop='first',
#     sparse=False)

# dums = ohe.fit_transform(X_train_cat)
# dums_df = pd.DataFrame(dums,
#                        columns=ohe.get_feature_names(),
#                        index=X_train_cat.index)
# X_train_nums = X_train.select_dtypes('float64')

# ss = StandardScaler()

# ss.fit(X_train_nums)
# nums_df = pd.DataFrame(ss.transform(X_train_nums),
#                       index=X_train_nums.index)
# X_train_clean = pd.concat([nums_df, dums_df], axis=1)

In [103]:
X = df_clean.drop(columns='churn')
y = df_clean.churn

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=3)

Baseline model/ Dummy Model:

In [105]:
dr = DummyClassifier()

dummy_model = dr.fit(X_train, y_train)
dummy_train =dr.score(X_train,y_train)
dummy_test = dr.score(X_test,y_test)

print(f'Train: {dummy_train}')
print(f'Test : {dummy_test}')

Train: 0.8486341244961935
Test : 0.8681818181818182


Looking into target having a class imbalance

In [110]:
df.churn.value_counts(), df.churn.value_counts(normalize=True)

(0    2850
 1     483
 Name: churn, dtype: int64,
 0    0.855086
 1    0.144914
 Name: churn, dtype: float64)

There is a class imbalance so will use SMOTE to generate synthetic data to supplement the minority class

In [34]:
sm = SMOTE(sampling_strategy='minority', random_state=3)
X_resmp, y_resmp = sm.fit_resample(X_train, y_train)

y_resmp.value_counts()

1    1895
0    1895
Name: churn, dtype: int64

First simple model/ LogisticRegression Model :

In [35]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train), lr.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.8472906403940886, 0.8636363636363636)

First model is slightly worse than the Baseline model

First complex model / RandomForestClassifier:

In [39]:
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
forest = ModelWithCV(rfc, 'forest_pipe', X_train, y_train)
rfc.score(X_train,y_train) , forest.cv_results

(1.0,
 array([0.92410714, 0.93303571, 0.92410714, 0.94170404, 0.93273543,
        0.92825112, 0.95067265, 0.94618834, 0.94618834, 0.9103139 ]))

Better accuracy than baseline and previous model but model is overfit.

Hyperparamter tuning:

In [40]:
rfc = RandomForestClassifier(max_depth=5)
rfc.fit(X_train,y_train)
forest = ModelWithCV(rfc, 'forest_pipe', X_train, y_train)
rfc.score(X_train,y_train) , forest.cv_results

(0.8831168831168831,
 array([0.86160714, 0.875     , 0.86607143, 0.89686099, 0.88340807,
        0.86098655, 0.86995516, 0.90134529, 0.87443946, 0.85650224]))

Better fit than the previous model and better accuracy than the baseline