In [2]:
## Importing libraries ##

# Data manipulation
import pandas as pd
import numpy as np

# Data visualisation
import seaborn as sns
import matplotlib as plt

In [1]:
## Scikit-learn packages ##

# Validation framework
from sklearn.model_selection import train_test_split
# Feature matrix formatter (dictionary vectoriser)
from sklearn.feature_extraction import DictVectorizer
# Logistic regression (sigmoid ver. linear regression)
from sklearn.linear_model import LogisticRegression

In [17]:
# Importing the data
df = pd.read_csv(r'..\Week 3\telco-customer-churn\WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Cleaning column names
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Storing categorical variables
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

# Cleaning contents of categorical series
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

# Parsing totalcharges as numeric (falsely parsed as an object)
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')
df.totalcharges = df.totalcharges.fillna(0)

# Converting yes/no to 1/0 (binary)
df.churn = (df.churn == 'yes').astype(int)

In [18]:
# Validation framework using sklearn
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Defining feature matrices
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Defining target-vector
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

# Removing target variable from feature matrices
del df_train['churn']
del df_val['churn']
del df_test['churn']

In [19]:
# Numerical series
numerical = ['tenure', 'monthlycharges', 'totalcharges']

# Categorical series
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [20]:
# Initialising sklearn dictionary vectoriser (sparse matric == False)
dv = DictVectorizer(sparse=False)

# Converting train feature matrix to a dictionary, combining categorical and numerical series
train_dict = df_train[categorical + numerical].to_dict(orient='records')
# Transforming dictionary to formatted feature matrix using DictVectoriser
X_train = dv.fit_transform(train_dict)

# Initialising logistic regression model
model = LogisticRegression()
# Training model on train dataset
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# Converting validation feature matrix to a dictionary, combining categorical and numerical series
val_dict = df_val[categorical + numerical].to_dict(orient='records')
# Transforming dictionary to formatted feature matrix using DictVectoriser
X_val = dv.transform(val_dict)

# Obtaining predictions for validation set using predict_proba
y_pred = model.predict_proba(X_val)[:, 1]
# Classifying probability predictions into binary 
churn_decision = (y_pred >= 0.5)
# Evaluating binary predictions against actual outcomes (the target vector, y_val)
(y_val == churn_decision).mean()

0.8034066713981547

## Week 4