In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score

%matplotlib inline

In [9]:
data = pd.read_csv('/Users/jennihawk/Documents/Data Science/Classification/Churn Project/Models/chatr_clean.csv')

In [10]:
data.head()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,InternetService_Fiber,InternetService_No,Contract_One_Year,Contract_Two_year,PaymentMethod_Crcard,...,DeviceProtection_No_internet_serv,DeviceProtection_Yes,TechSupport_No_internet_serv,TechSupport_Yes,StreamingTV_No_internet_serv,StreamingTV_Yes,StreamingMovies_No_internet_serv,StreamingMovies_Yes,PaperlessBilling_Yes,Churn_Yes
0,7590-VHVEG,0,1.0,29.85,29.85,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,5575-GNVDE,0,34.0,56.95,1889.5,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
2,3668-QPYBK,0,2.0,53.85,108.15,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,7795-CFOCW,0,45.0,42.3,1840.75,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,0
4,9237-HQITU,0,2.0,70.7,151.65,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [11]:
data.columns

Index(['customerID', 'SeniorCitizen', 'tenure', 'MonthlyCharges',
       'TotalCharges', 'InternetService_Fiber', 'InternetService_No',
       'Contract_One_Year', 'Contract_Two_year', 'PaymentMethod_Crcard',
       'PaymentMethod_Electr_Check', 'PaymentMethod_Mailed_check',
       'MultipleLines_No_phone_serv', 'MultipleLines_Yes', 'Dependents_Yes',
       'gender_Male', 'Partner_Yes', 'PhoneService_Yes',
       'OnlineSecurity_No_internet_serv', 'OnlineSecurity_Yes',
       'OnlineBackup_No_Internet_Serv', 'OnlineBackup_Yes',
       'DeviceProtection_No_internet_serv', 'DeviceProtection_Yes',
       'TechSupport_No_internet_serv', 'TechSupport_Yes',
       'StreamingTV_No_internet_serv', 'StreamingTV_Yes',
       'StreamingMovies_No_internet_serv', 'StreamingMovies_Yes',
       'PaperlessBilling_Yes', 'Churn_Yes'],
      dtype='object')

### Sklearn defaults to keep in mind
- Decision Boundary: 50% cutoff value for positive / negative class
- Regularization built in
- Hyperparameter C controls effect of model term. Default C=1.0
- Set C through validation or cross-validation. Larger C allows model to be more complex.
- It's just like the alpha parameter from linear regression regularization, except it multiplies the model term instead of the regularization term. 
- the regularization term can be the squares of the coefficient betas, like ridge regression, - or it could be the absolute values of those betas, like lasso.
- Option to set penalty. Penalty refers to type of regularization penalty:
- L2 default means we're squaring the coefficients
- L1 set for absolute values. You can also set to elastic net penalty. Or completely turn off regularization. 

In [12]:
features_in = ['SeniorCitizen', 'tenure', 'MonthlyCharges',
       'TotalCharges', 'InternetService_Fiber', 'InternetService_No',
       'Contract_One_Year', 'Contract_Two_year', 'PaymentMethod_Crcard',
       'PaymentMethod_Electr_Check', 'PaymentMethod_Mailed_check',
       'MultipleLines_No_phone_serv', 'MultipleLines_Yes', 'Dependents_Yes',
       'gender_Male', 'Partner_Yes', 'PhoneService_Yes',
       'OnlineSecurity_No_internet_serv', 'OnlineSecurity_Yes',
       'OnlineBackup_No_Internet_Serv', 'OnlineBackup_Yes',
       'DeviceProtection_No_internet_serv', 'DeviceProtection_Yes',
       'TechSupport_No_internet_serv', 'TechSupport_Yes',
       'StreamingTV_No_internet_serv', 'StreamingTV_Yes',
       'StreamingMovies_No_internet_serv', 'StreamingMovies_Yes',
       'PaperlessBilling_Yes']

y = data['Churn_Yes']
X = data[features_in]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
pipe = make_pipeline(StandardScaler(), LogisticRegression())

In [13]:
pipe.fit(X_train, y_train)  # apply scaling on training data
#not sure what this line is doing:
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

NameError: name 'Pipeline' is not defined

### Accuracy metric on test data

In [14]:
pipe.score(X_test, y_test)  # apply scaling on testing data, without leaking training data.

0.7895335608646189

In [15]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5274, 30)
(1758, 30)
(5274,)
(1758,)


#### Fit model to training data

In [16]:
#logreg.fit(X_train,y_train)

### See how our training data is doing

##### Average Rate of Churn on Test Data

In [17]:
np.mean(y_train)

0.26753886992794845

#### Hard Class Predictions

In [18]:
pipe.predict(X_train)[:5]

array([0, 0, 0, 0, 0])

#### Soft Class Predictions

In [19]:
pipe.predict_proba(X_train)[:5]

array([[0.79618131, 0.20381869],
       [0.75795558, 0.24204442],
       [0.62957169, 0.37042831],
       [0.89523733, 0.10476267],
       [0.94672815, 0.05327185]])

### Accuracy

In [20]:
pipe.score(X_train, y_train)

0.8065984072810012

### Confusion Matrix

In [21]:
# y_pred = pipe.predict(y_train)
# confusion_matrix(y, y_pred)

### Interpreting Coefficients
- One unit of increase in x actually increases the log odds by beta units. 
- In other words: One unit of increase in x, increases the odds by an exponential factor of beta.
- If the features coefficient beta is positive, increasing that feature makes the positive class more likely
- If beta is negative, increasing the feature does the opposite and the positive class becomes less likely

In [22]:
pipe.coef_, pipe.intercept_

AttributeError: 'Pipeline' object has no attribute 'coef_'