In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance
from sklearn.tree import export_graphviz

import os
from env import host, user, password

## Acquire

In [24]:
# The following functions will be used to acquire all the data from the SQL database 'telco_churn'
# These functions will also be stored in the acquire.py file

def get_connection(db, user=user, host=host, password=password):
    '''
    Build the connection url for one database on the Codeup db server.

    db is the database name as a string. user, host and password default to
    the credentials imported from the local env file.
    Returns the url as a string.
    '''
    connection_url = f'mysql+pymysql://{user}:{password}@{host}/{db}'
    return connection_url

def new_telco_data():
    '''
    This function reads the telco data from the Codeup db into a df
    and returns the df.

    The query selects customer columns from `customers` and joins the
    contract_types, internet_service_types and payment_types tables to pull
    in contract_type, internet_service_type and payment_type.
    Nothing is written to disk here.
    '''

    # Create SQL query.
    sql_query = """
           SELECT c.customer_id,
                c.gender, 
                c.senior_citizen,
                c.partner,
                c.dependents,
                c.tenure,
                c.phone_service,
                c.multiple_lines,
                c.online_security,
                c.device_protection,
                c.tech_support,
                c.streaming_tv,
                c.streaming_movies,
                c.paperless_billing,
                c.monthly_charges,
                c.total_charges,
                c.churn,
                ct.contract_type,
                i.internet_service_type,
                p.payment_type
FROM customers as c
JOIN contract_types as ct USING (contract_type_id)
JOIN internet_service_types as i USING (internet_service_type_id)
JOIN payment_types as p USING (payment_type_id);
                """
    
    # Read in DataFrame from Codeup db.
    df = pd.read_sql(sql_query, get_connection('telco_churn'))
    
    return df

def get_telco_data(cache_path='telco_df.csv'):
    '''
    Return the telco data as a df, using a local csv file as a cache.

    cache_path is the csv file to read from / write to (defaults to
    'telco_df.csv' in the working directory).

    If the file exists it is read with its first column as the index.
    Otherwise the data is pulled fresh with new_telco_data(), written to
    cache_path, and returned.
    '''
    if os.path.isfile(cache_path):
        # Cache hit: no database round trip needed.
        return pd.read_csv(cache_path, index_col=0)

    # Cache miss: read fresh data from the db and store it for next time.
    df = new_telco_data()
    df.to_csv(cache_path)
    return df

telco_db = get_telco_data()
df = telco_db.copy()

In [25]:
df[df.total_charges==' ']

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
85,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,Yes,Yes,Yes,Yes,No,No,56.05,,No,Two year,DSL,Credit card (automatic)
156,2775-SEFEE,Male,0,No,Yes,0,Yes,Yes,Yes,No,Yes,No,No,Yes,61.9,,No,Two year,DSL,Bank transfer (automatic)
236,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,No,Yes,Yes,Yes,No,No,73.35,,No,Two year,DSL,Mailed check
255,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,Yes,Yes,Yes,Yes,No,Yes,52.55,,No,Two year,DSL,Bank transfer (automatic)
339,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,Yes,Yes,No,Yes,Yes,No,80.85,,No,Two year,DSL,Mailed check
5681,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No,20.0,,No,Two year,,Mailed check
5717,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No,20.25,,No,Two year,,Mailed check
5727,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No,25.35,,No,Two year,,Mailed check
5798,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No,25.75,,No,Two year,,Mailed check
6007,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No,19.85,,No,Two year,,Mailed check


In [26]:
def gen_view(df):
    """
    This function will give a general overview of a dataframe.
    This includes:
        - statistical description of the df's numerical values
        - info about df's columns and their values
        - dimensions (rows x columns) of df
        - if any null values exist in each column
        - if any customer_id is duplicated (falls back to fully duplicated
          rows when the df has no customer_id column)

    Everything is printed; nothing is returned.
    """
    print('------------------------------')
    print('General overview of dataframe.')
    print('------------------------------\n')
    print('Descriptive stats:\n')
    print(df.describe())
    print('\n')
    print('Column and row info:')
    # df.info() prints its own report and returns None, so wrapping it in
    # print() would add a stray "None" line to the output.
    df.info()
    print('\n')
    print('Dimensions of df:')
    print(df.shape)
    print('\n')
    print('Null values:')
    print(df.isnull().sum())
    print('\n')
    if 'customer_id' in df.columns:
        dups = df['customer_id'].duplicated().any()
    else:
        dups = df.duplicated().any()
    print('Any duplicates:', dups)

In [27]:
gen_view(df)

------------------------------
General overview of dataframe.
------------------------------

Descriptive stats:

       senior_citizen       tenure  monthly_charges
count     7043.000000  7043.000000      7043.000000
mean         0.162147    32.371149        64.761692
std          0.368612    24.559481        30.090047
min          0.000000     0.000000        18.250000
25%          0.000000     9.000000        35.500000
50%          0.000000    29.000000        70.350000
75%          0.000000    55.000000        89.850000
max          1.000000    72.000000       118.750000


Column and row info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   customer_id            7043 non-null   object 
 1   gender                 7043 non-null   object 
 2   senior_citizen         7043 non-null   int64  
 3   partner                

## Prepare

In [28]:
df.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0016-QLJIS,Female,0,Yes,Yes,65,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,90.45,5957.9,No,Two year,DSL,Mailed check
1,0017-DINOC,Male,0,No,No,54,No,No phone service,Yes,No,Yes,Yes,No,No,45.2,2460.55,No,Two year,DSL,Credit card (automatic)
2,0019-GFNTW,Female,0,No,No,56,No,No phone service,Yes,Yes,Yes,No,No,No,45.05,2560.1,No,Two year,DSL,Bank transfer (automatic)
3,0056-EPFBG,Male,0,Yes,Yes,20,No,No phone service,Yes,Yes,Yes,No,No,Yes,39.4,825.4,No,Two year,DSL,Credit card (automatic)
4,0078-XZMHT,Male,0,Yes,No,72,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,85.15,6316.2,No,Two year,DSL,Bank transfer (automatic)


In [15]:
def check_v_counts(df):
    '''
    Print value counts for every object and int64 column of df.

    - object columns: plain value counts
    - int64 columns: counts in 5 equal-width bins, in bin order
    Columns of any other dtype (e.g. float) are skipped.
    '''
    for col in df.columns:
        values = df[col]
        if values.dtype == 'object':
            print(f'{col}:\n{values.sort_values().value_counts()}\n')
            print('----------\n')
        elif values.dtype == 'int64':
            print(f'{col}:\n{values.value_counts(bins=5, sort=False)}\n')
            print('----------\n')
check_v_counts(df)

customer_id:
5074-FBGHB    1
8429-XIBUM    1
0133-BMFZO    1
2468-SJFLM    1
5275-PMFUT    1
             ..
8945-GRKHX    1
6933-FHBZC    1
7206-GZCDC    1
2215-ZAFGX    1
7841-FCRQD    1
Name: customer_id, Length: 7043, dtype: int64

----------

gender:
Male      3555
Female    3488
Name: gender, dtype: int64

----------

senior_citizen:
(-0.002, 0.2]    5901
(0.2, 0.4]          0
(0.4, 0.6]          0
(0.6, 0.8]          0
(0.8, 1.0]       1142
Name: senior_citizen, dtype: int64

----------

partner:
No     3641
Yes    3402
Name: partner, dtype: int64

----------

dependents:
No     4933
Yes    2110
Name: dependents, dtype: int64

----------

tenure:
(-0.073, 14.4]    2371
(14.4, 28.8]      1126
(28.8, 43.2]       989
(43.2, 57.6]       947
(57.6, 72.0]      1610
Name: tenure, dtype: int64

----------

phone_service:
Yes    6361
No      682
Name: phone_service, dtype: int64

----------

multiple_lines:
No                  3390
Yes                 2971
No phone service     682
Name: 

In [16]:
df[df.columns[14]].value_counts(bins=5, sort=False)

(18.148999999999997, 38.35]    1797
(38.35, 58.45]                 1005
(58.45, 78.55]                 1367
(78.55, 98.65]                 1826
(98.65, 118.75]                1048
Name: monthly_charges, dtype: int64

Clean:

gender: genderMale
- remove gender
- no female column

internet_service_type: 
- remove i.s.t.
- no DSL column

payment_type: 
- remove p.t.
- no bank transfer (automatic) column

In [30]:
def prep_data(df):
    '''
    Clean the raw telco df and return a new, prepared df. The df passed in
    is left untouched, so the function can be re-run on the raw data.

    Steps:
        - converts total_charges to a numeric dtype and drops the rows whose
          total_charges is blank / not a number
        - replaces "Yes" / "No" values with 1 / 0 ("No internet service" and
          "No phone service" also become 0)
        - creates dummy vars for gender and payment_type, dropping the first
          dummy of each
        - creates dummy vars for internet_service_type and contract_type,
          keeping all of them except 'contract_type_Two year'
        - drops customer_id and the original categorical columns
        - creates new column 'tenure_years' (tenure / 12, one decimal)
    '''
    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    # Blank strings such as ' ' cannot be parsed and become NaN, then get dropped.
    df['total_charges'] = pd.to_numeric(df['total_charges'], errors='coerce')
    df = df.dropna(subset=['total_charges'])

    # replace yes/no with 1/0
    yes_no = {'Yes': 1, 'No': 0}
    yes_no_cols = ['paperless_billing', 'partner', 'dependents', 'phone_service', 'churn']
    internet_cols = ['online_security', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies']
    replacements = {col: yes_no for col in yes_no_cols}
    replacements.update({col: {**yes_no, 'No internet service': 0} for col in internet_cols})
    replacements['multiple_lines'] = {**yes_no, 'No phone service': 0}
    df = df.replace(replacements)

    # dummy vars for gender and payment_type, first level of each dropped
    first_dropped = pd.get_dummies(df[['gender', 'payment_type']], dummy_na=False, drop_first=True)

    # dummy vars for internet_service_type and contract_type, all levels kept for now
    all_levels = pd.get_dummies(df[['internet_service_type', 'contract_type']], dummy_na=False)

    df = pd.concat([df, first_dropped, all_levels], axis=1)

    # drops unusable or unnecessary columns and the source columns of the dummy vars
    df = df.drop(columns=['customer_id', 'gender', 'contract_type', 'payment_type',
                          'internet_service_type', 'contract_type_Two year'])

    # adds tenure years column
    df['tenure_years'] = (df['tenure'] / 12).round(1)

    return df
df = prep_data(df)

In [None]:
# df was already prepared in the cell that defines prep_data. Running prep_data on
# the prepared frame again fails because the raw categorical columns are gone,
# so this cell only previews the result instead of dumping the whole frame.
df.head()

No contract_type column
- No dummy month-to-month column for contract type

No payment_type column
- No dummy bank transfer payment type column

No internet_service_type column
- No dummy female gender column

In [None]:
df.info()

In [None]:
def train_validate_test_split(df, target, seed=123):
    '''
    Stratified three-way split of df on the target column.

    20% of the rows are held out as test first; the remaining 80% is then
    divided 70/30 into train and validate. Relative to the original df that
    is 56% train, 24% validate and 20% test. The same seed is used for both
    splits.

    Returns the train, validate and test dataframes, in that order.
    '''
    # Stage 1: carve off the test rows.
    remainder, test = train_test_split(
        df, test_size=0.2, random_state=seed, stratify=df[target]
    )
    # Stage 2: divide what is left into train and validate.
    train, validate = train_test_split(
        remainder, test_size=0.3, random_state=seed, stratify=remainder[target]
    )
    return train, validate, test
train, validate, test = train_validate_test_split(df, 'churn', seed=123)

In [None]:
train.shape

In [None]:
validate.shape

In [None]:
test.shape

In [None]:
# Separate variables into target, categorical, and quantitative for easier exploration and viz

target = 'churn'

# tenure_years is tenure / 12, so it is continuous like tenure itself
quant_vars = ['monthly_charges',
             'tenure',
             'total_charges',
             'tenure_years']

# Everything else is categorical -- except the target, which must not be
# explored against itself.
cat_vars = [col for col in df.columns if col not in quant_vars and col != target]

In [None]:
pd.set_option('display.max_columns', None)
df.head(5)

In [None]:
df.total_charges

In [None]:
df.monthly_charges.value_counts(bins=5, sort=False)

In [None]:
df.total_charges.value_counts(bins=5, sort=False)

In [None]:
df.tenure.value_counts(bins=5, sort=False)

We will investigate the quantitative variables against churn

In [None]:
plt.figure(figsize=(8,8))
sns.scatterplot(x=df.tenure, y=df.monthly_charges, hue=df.churn)

In [None]:
plt.figure(figsize=(8,8))
sns.scatterplot(x=df.tenure, y=df.total_charges, hue=df.churn)

In [None]:
plt.figure(figsize=(8,8))
sns.scatterplot(x=df.monthly_charges, y=df.total_charges, hue=df.churn)

In [None]:
for quant in quant_vars:
    plt.title(f'Histogram for {quant}')
    sns.histplot(x=df[quant], hue=df.churn)
    plt.show()

In [None]:
# Same histograms, restricted to customers in their first year (tenure <= 12)
first_year = df[df.tenure <= 12]
for quant in quant_vars:
    plt.title(f'Histogram for {quant}')
    sns.histplot(x=first_year[quant], hue=first_year.churn)
    plt.show()

In [None]:
x = df[df.monthly_charges >= 100]

In [None]:
for quant in quant_vars:
    plt.title(f'Histogram for {quant}')
    sns.histplot(x=x[quant], hue=x.churn)
    plt.show()

In [None]:
# Indexing df with the monthly_charges values themselves raises a KeyError
# (the filter condition was never written). Summarise the column instead.
# TODO(review): replace with the intended filter if one was planned here.
df.monthly_charges.describe()

In [None]:
train.describe().T

In [None]:
def run_chi2(train, cat_var, target):
    '''
    Chi-square test of independence between cat_var and target.

    Returns (chi2_summary, observed, expected):
        - chi2_summary: one-row df with chi2, p-value and degrees of freedom
        - observed: crosstab of cat_var x target (no margins)
        - expected: df of expected frequencies, labelled like observed
    '''
    observed = pd.crosstab(train[cat_var], train[target])
    chi2, p, degf, expected = stats.chi2_contingency(observed)
    chi2_summary = pd.DataFrame({'chi2': [chi2],
                                 'p-value': [p],
                                 'degrees of freedom': [degf]})
    expected = pd.DataFrame(expected, index=observed.index, columns=observed.columns)
    return chi2_summary, observed, expected

def plot_cat_by_target(train, target, cat_var):
    '''
    Barplot of the mean of target for each level of cat_var, with a dashed
    horizontal line at the overall mean of target. Returns the line artist.
    '''
    plt.figure(figsize=(2,2))
    # x / y are passed by keyword: recent seaborn versions no longer accept them positionally
    sns.barplot(x=cat_var, y=target, data=train, alpha=.8, color='lightseagreen')
    overall_rate = train[target].mean()
    p = plt.axhline(overall_rate, ls='--', color='gray')
    return p
def explore_bivariate_categorical(train, target, cat_var):
    '''
    takes in categorical variable and binary target variable, 
    returns a crosstab of frequencies
    runs a chi-square test for the proportions
    and creates a barplot, adding a horizontal line of the overall rate of the target. 
    '''
    print(cat_var, "\n_____________________\n")
    ct = pd.crosstab(train[cat_var], train[target], margins=True)
    chi2_summary, observed, expected = run_chi2(train, cat_var, target)
    plot_cat_by_target(train, target, cat_var)

    print(chi2_summary)
    print("\nobserved:\n", ct)
    print("\nexpected:\n", expected)
    # plt.show() renders the current figure; it does not take the artist as an argument
    plt.show()
    print("\n_____________________\n")
explore_bivariate_categorical(train, target, 'partner')

In [None]:
chi2_summary, observed, expected = run_chi2(train, 'partner', target)

In [None]:
def compare_means(train, target, quant_var, alt_hyp='two-sided'):
    '''
    Mann-Whitney U test of quant_var between the target == 0 rows and the
    target == 1 rows. Returns the scipy test result.
    '''
    group_0 = train[train[target] == 0][quant_var]
    group_1 = train[train[target] == 1][quant_var]
    return stats.mannwhitneyu(group_0, group_1, use_continuity=True, alternative=alt_hyp)

def plot_boxen(train, target, quant_var):
    '''
    Boxenplot of quant_var per target class, titled with quant_var, with a
    dashed line at the overall mean of quant_var. Returns the axes.
    '''
    ax = sns.boxenplot(data=train, x=target, y=quant_var, color='lightseagreen')
    ax.set_title(quant_var)
    ax.axhline(train[quant_var].mean(), ls='--', color='black')
    return ax

def plot_swarm(train, target, quant_var):
    '''
    Swarmplot of quant_var per target class, drawn on the current axes.
    Returns the axes.
    '''
    return sns.swarmplot(data=train, x=target, y=quant_var, color='lightgray')

def explore_bivariate_quant(train, target, quant_var):
    '''
    descriptive stats by each target class. 
    compare means across 2 target groups 
    boxenplot of target x quant
    swarmplot of target x quant
    '''
    print(quant_var, "\n____________________\n")
    descriptive_stats = train.groupby(target)[quant_var].describe()
    mann_whitney = compare_means(train, target, quant_var)
    plt.figure(figsize=(4,4))
    plot_boxen(train, target, quant_var)
    plot_swarm(train, target, quant_var)
    plt.show()
    print(descriptive_stats, "\n")
    print("\nMann-Whitney Test:\n", mann_whitney)
    print("\n____________________\n")

In [None]:
def explore_bivariate(train, target, cat_vars, quant_vars):
    '''
    Run the bivariate exploration of every variable against the target:
    explore_bivariate_categorical for each name in cat_vars, then
    explore_bivariate_quant for each name in quant_vars.
    '''
    for cat in cat_vars:
        # pass the single column name, not the whole list
        explore_bivariate_categorical(train, target, cat)
    for quant in quant_vars:
        explore_bivariate_quant(train, target, quant)
explore_bivariate(train, 'churn', cat_vars, quant_vars)


**Immediate takeaways**
- Although senior citizens make up only a minority of all customers, they are churning at a high rate (~60%)
- 49% of those without dependents churn within 2.5 years
- 

## Random Forest

In [None]:
# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['churn'])
y_train = train.churn

X_validate = validate.drop(columns=['churn'])
y_validate = validate.churn

X_test = test.drop(columns=['churn'])
y_test = test.churn

In [None]:
# 0 (not churned) is the most common outcome so that will be my baseline
y_train.value_counts()

In [None]:
# 1. Create the baseline
baseline = DummyClassifier(strategy='constant', constant=0) # baseline predicts no churns

# 2. Fit the baseline
baseline.fit(X_train, y_train)

# Baseline accuracy: the score every real model has to beat
print('Baseline accuracy (always predict no churn) on training set: {:.2f}'
     .format(baseline.score(X_train, y_train)))

In [None]:
# Create the model
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=123)

In [None]:
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_train)
y_pred_proba = rf.predict_proba(X_train)

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))
print('----------')
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))
print('----------')

In [None]:
# Grid search over random forest hyperparameters, using all features

# Seeded so the search returns the same best estimator on every run
rf1 = RandomForestClassifier(random_state=123)
# 'auto' is left out of max_features: for a classifier it means the same as
# 'sqrt', and newer scikit-learn versions no longer accept it.
param_grid = {'max_depth': [4,5,6,7],
             'min_samples_split': [3, 6, 9, 12],
             'max_features': ['sqrt', 'log2'],
             'min_samples_leaf': [4, 8, 12]}

clf = GridSearchCV(rf1, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)
best = clf.fit(X_train,y_train)
best.best_estimator_

In [None]:
print('Confusion Matrix:')
print(confusion_matrix(y_train, y_pred))
print()
print('---------')
print()
print('Classification Report:')
print(classification_report(y_train, y_pred))

In [None]:
# columns: predicted not churned (0), predicted churned (1)
# row1: actual not churned (0)
# row2: actual churned (1)

con_mat = confusion_matrix(y_train, y_pred)
con_mat

In [None]:
# confusion_matrix puts actual classes on the rows and predicted classes on the
# columns, so the layout is [[tn, fp], [fn, tp]].
# True positive = churned and predicted to churn
tp = con_mat[1][1]
# True negative = did not churn and predicted not to churn
tn = con_mat[0][0]
# False positive = did not churn but predicted to churn
fp = con_mat[0][1]
# False negative = churned but predicted not to churn
fn = con_mat[1][0]

In [None]:
tp_rate = tp / (tp+fn)
print(' True positive rate =',round(tp_rate, 4))

tn_rate = tn / (tn+fp)
print(' True negative rate =',round(tn_rate, 4))

fp_rate = fp / (tn+fp)
print('False positive rate =',round(fp_rate, 4))

fn_rate = fn / (tp+fn)
print('False negative rate =',round(fn_rate, 4))

In [None]:
def most_imp_feat(clf, df, threshold=.03):
    '''
    Return the names of the features a fitted model considers important.

    clf is a fitted model exposing feature_importances_; df is the feature
    dataframe the model was fitted on (its columns must line up with the
    importances). A column is kept when its importance is >= threshold.
    Returns a list of column names in column order.
    '''
    return [col for col, importance in zip(df.columns, clf.feature_importances_)
            if importance >= threshold]
z = most_imp_feat(rf, X_train)
z

In [None]:
X_train_rf = X_train[z].copy()

X_validate_rf = X_validate[z].copy()

X_test_rf = X_test[z].copy()

In [None]:
# Grid search over random forest hyperparameters, using only the selected features (X_train_rf)

# Seeded so the search returns the same best estimator on every run
rf1 = RandomForestClassifier(random_state=123)
# 'auto' is left out of max_features: for a classifier it means the same as
# 'sqrt', and newer scikit-learn versions no longer accept it.
param_grid = {'max_depth': [4,5,6,7],
             'min_samples_split': [3, 6, 9, 12],
             'max_features': ['sqrt', 'log2'],
             'min_samples_leaf': [4, 8, 12]}

clf = GridSearchCV(rf1, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)
best = clf.fit(X_train_rf, y_train)
best.best_estimator_

In [None]:
# Refit the random forest on the selected features with fixed hyperparameters.
# random_state is fixed so the reported accuracies are reproducible.
best_rf = RandomForestClassifier(max_depth=6, max_features='sqrt', min_samples_leaf=8,
                                 min_samples_split=3, random_state=123)
best_rf.fit(X_train_rf, y_train)
y_pred = best_rf.predict(X_train_rf)
y_pred_proba = best_rf.predict_proba(X_train_rf)
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(best_rf.score(X_train_rf, y_train)))
print('----------')
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(best_rf.score(X_validate_rf, y_validate)))
print('----------')
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(best_rf.score(X_test_rf, y_test)))

# K-Nearest Neighbor

In [None]:
# weights options: 'uniform' or 'distance'
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))
print()
print(classification_report(y_train, y_pred))

In [None]:
# This score is computed on the validate split, so the label says validate
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

In [None]:
# Compare values of k on the validate split. The test split is kept out of
# model selection so it can still give an unbiased final estimate.
k_range = range(1, 20)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train, y_train)
    scores.append(knn.score(X_validate, y_validate))
plt.figure()
plt.xlabel('k')
plt.ylabel('accuracy (validate)')
plt.scatter(k_range, scores)
plt.xticks([0,5,10,15,20])
plt.show()

In [None]:
model = KNeighborsClassifier()
# fit the model
model.fit(X_train, y_train)
results = permutation_importance(model, X_train, y_train, scoring='accuracy')
# get importance
importance = results.importances_mean
# summarize feature importance
for i,v in enumerate(importance):
    print(X_train.columns[i], [v])

In [None]:
def most_imp_knn(X, y, random_state=123):
    '''
    Select features for KNN by permutation importance.

    Fits a default KNeighborsClassifier on X, y, computes permutation
    importance (scored by accuracy) on the same data, and keeps every column
    whose mean importance is greater than 0.

    random_state seeds the permutations so the selected features are the
    same on every run. Returns a list of column names in column order.
    '''
    model = KNeighborsClassifier()
    # fit the model
    model.fit(X, y)
    results = permutation_importance(model, X, y, scoring='accuracy',
                                     random_state=random_state)
    # keep the features whose shuffling hurts accuracy on average
    return [col for col, importance in zip(X.columns, results.importances_mean)
            if importance > 0]
feats = most_imp_knn(X_train, y_train)
feats

In [None]:
X_train_knn = X_train[feats].copy()

X_validate_knn = X_validate[feats].copy()

X_test_knn = X_test[feats].copy()

In [None]:
# Using all features with  optimized hyperparameters

knn1 = KNeighborsClassifier()
param_grid = {'n_neighbors': [5, 10, 15, 20, 25],
             'weights': ['uniform', 'distance'],
             'p': [1, 2]}

clf = GridSearchCV(knn1, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)
best_clf = clf.fit(X_train_knn,y_train)
best_clf.best_estimator_

In [None]:
bestknn = KNeighborsClassifier(n_neighbors=25, p=1, weights='uniform')
bestknn.fit(X_train_knn, y_train)

In [None]:
y_pred = bestknn.predict(X_train_knn)
y_pred_proba = bestknn.predict_proba(X_train_knn)
# Each label names the split that is actually being scored
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(bestknn.score(X_train_knn, y_train)))
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(bestknn.score(X_validate_knn, y_validate)))
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(bestknn.score(X_test_knn, y_test)))
print('----------')

In [None]:
print(confusion_matrix(y_train, y_pred))
print()
print(classification_report(y_train, y_pred))

## Logistic Regression

In [None]:
# Create the model

logit = LogisticRegression(C=1, random_state=123)


# Fit the model

logit.fit(X_train, y_train)

print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

In [None]:
# Make predictions on the training set
y_pred = logit.predict(X_train)

# Class probabilities for each customer (not churned / churned)
y_pred_proba = logit.predict_proba(X_train)

In [None]:
# Evaluate the model

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))
print()
print('---------')
print()
print('Confusion Matrix:')
print(confusion_matrix(y_train, y_pred))
print()
print('---------')
print()
print('Classification Report:')
print(classification_report(y_train, y_pred))

In [None]:
# Grid search over logistic regression hyperparameters, using all features
# NOTE: not every penalty / solver pair is supported; GridSearchCV scores the
# unsupported combinations as NaN and they never win.

# Seeded because some of the solvers shuffle the data
logit1 = LogisticRegression(random_state=123)
param_grid = {'C': [.5, 1, 2,  10],
              'penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
              # class_weight takes None, 'balanced' or an actual dict of weights;
              # the string 'dict' is not a valid option, so None (no weighting) is used
              'class_weight': [None, 'balanced']}
clf = GridSearchCV(logit1, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)
best_clf = clf.fit(X_train,y_train)
best_clf.best_estimator_

In [None]:
# class_weight=None means no class weighting; the string 'dict' is not a valid value
bestlogit = LogisticRegression(C=0.5, class_weight=None, penalty='none', random_state=123)
bestlogit.fit(X_train, y_train)
# Make predictions
y_pred = bestlogit.predict(X_train)

# Class probabilities for each customer (not churned / churned)
y_pred_proba = bestlogit.predict_proba(X_train)

# Evaluate the model

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(bestlogit.score(X_train, y_train)))

In [None]:
print('---------')
print()
print('Confusion Matrix:')
print(confusion_matrix(y_train, y_pred))
print()
print('---------')
print()
print('Classification Report:')
print(classification_report(y_train, y_pred))

In [None]:
print('Accuracy of Logistic Regression classifier on validate set: {:.2f}'
     .format(bestlogit.score(X_validate, y_validate)))
print('Accuracy of Logistic Regression classifier on test set: {:.2f}'
     .format(bestlogit.score(X_test, y_test)))

In [None]:
X_train_knn.columns

In [None]:
# X_train_logit does not exist yet at this point of the notebook: it is built
# from the logistic-regression feature selection further down. Referencing it
# here raises a NameError on a fresh kernel, so inspect its columns after it
# has been created.

In [None]:
X_train_rf.columns

In [None]:
def most_imp_logit(X, y, threshold=.05):
    '''
    Select features for logistic regression by coefficient size.

    Fits a default LogisticRegression on X, y and keeps every column whose
    coefficient has an absolute value of at least threshold.
    Returns a list of column names in column order.

    NOTE(review): coefficients are only comparable across features that are
    on a similar scale -- confirm whether X should be scaled first.
    '''
    model = LogisticRegression()
    # fit the model
    model.fit(X, y)
    # coef_ has one row per fitted class boundary; the binary case has a single row
    coefficients = model.coef_[0]
    return [col for col, coef in zip(X.columns, coefficients)
            if abs(coef) >= threshold]
feats = most_imp_logit(X_train,y_train)
feats

In [None]:
X_train[feats]

In [None]:
X_train_logit = X_train[feats].copy()

X_validate_logit = X_validate[feats].copy()

X_test_logit = X_test[feats].copy()

In [None]:
# Grid search over logistic regression hyperparameters, using only the selected features (X_train_logit)
# NOTE: not every penalty / solver pair is supported; GridSearchCV scores the
# unsupported combinations as NaN and they never win.

# Seeded because some of the solvers shuffle the data
logit1 = LogisticRegression(random_state=123)
param_grid = {'C': [.5, 1, 2, 10, 15],
              'penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
              # class_weight takes None, 'balanced' or an actual dict of weights;
              # the string 'dict' is not a valid option, so None (no weighting) is used
              'class_weight': [None, 'balanced']}
clf = GridSearchCV(logit1, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)
best_clf = clf.fit(X_train_logit, y_train)
best_clf.best_estimator_

In [None]:
# Fit on the selected features (X_train_logit), matching the data the grid
# search above was run on.
bestlogit = LogisticRegression(C=1, random_state=250)
bestlogit.fit(X_train_logit, y_train)
# Make predictions
y_pred = bestlogit.predict(X_train_logit)

# Class probabilities for each customer (not churned / churned)
y_pred_proba = bestlogit.predict_proba(X_train_logit)

# Evaluate the model

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(bestlogit.score(X_train_logit, y_train)))