In [392]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV and Perform Basic Data Cleaning

In [352]:
df = pd.read_csv("selected_df.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Number of Dependents,Number of Referrals,Tenure in Months,Avg Monthly Long Distance Charges,Avg Monthly GB Download,Monthly Charge,Total Charges,Total Long Distance Charges,...,Online Security code,Online Backup code,Device Protection Plan code,Premium Tech Support code,Streaming TV code,Streaming Movies code,Unlimited Data code,Payment Method code,Customer Status code,Churn Category code
0,0,37,0,2,9,42.39,16.0,65.6,593.3,381.51,...,1,1,1,1,1,1,1,1,1,1
1,1,46,0,0,9,10.69,10.0,-4.0,542.4,96.21,...,1,2,1,2,2,2,2,1,1,1
2,2,50,0,0,4,33.65,30.0,73.9,280.85,134.6,...,1,2,2,2,2,1,1,2,2,2
3,3,78,0,1,13,27.82,4.0,98.0,1237.85,361.66,...,1,1,2,2,1,2,1,2,2,3
4,4,75,0,3,3,7.38,11.0,83.9,267.4,22.14,...,1,2,1,1,1,1,1,1,2,3


In [353]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4601 entries, 0 to 6587
Data columns (total 26 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         4601 non-null   int64  
 1   Age                                4601 non-null   int64  
 2   Number of Dependents               4601 non-null   int64  
 3   Number of Referrals                4601 non-null   int64  
 4   Tenure in Months                   4601 non-null   int64  
 5   Avg Monthly Long Distance Charges  4601 non-null   float64
 6   Avg Monthly GB Download            4601 non-null   float64
 7   Monthly Charge                     4601 non-null   float64
 8   Total Charges                      4601 non-null   float64
 9   Total Long Distance Charges        4601 non-null   float64
 10  Gender code                        4601 non-null   int64  
 11  Married code                       4601 non-null   int64

In [354]:
print(df.columns)

Index(['Unnamed: 0', 'Age', 'Number of Dependents', 'Number of Referrals',
       'Tenure in Months', 'Avg Monthly Long Distance Charges',
       'Avg Monthly GB Download', 'Monthly Charge', 'Total Charges',
       'Total Long Distance Charges', 'Gender code', 'Married code',
       'Offer code', 'Multiple Lines code', 'Internet Service code',
       'Internet Type code', 'Online Security code', 'Online Backup code',
       'Device Protection Plan code', 'Premium Tech Support code',
       'Streaming TV code', 'Streaming Movies code', 'Unlimited Data code',
       'Payment Method code', 'Customer Status code', 'Churn Category code'],
      dtype='object')


# Select your features (columns)

In [385]:
# Set features. This will also be used as your x values.
model_df = df.drop(['Unnamed: 0','Customer Status code','Churn Category code'], axis=1)

print(model_df.columns)

Index(['Age', 'Number of Dependents', 'Number of Referrals',
       'Tenure in Months', 'Avg Monthly Long Distance Charges',
       'Avg Monthly GB Download', 'Monthly Charge', 'Total Charges',
       'Total Long Distance Charges', 'Gender code', 'Married code',
       'Offer code', 'Multiple Lines code', 'Internet Service code',
       'Internet Type code', 'Online Security code', 'Online Backup code',
       'Device Protection Plan code', 'Premium Tech Support code',
       'Streaming TV code', 'Streaming Movies code', 'Unlimited Data code',
       'Payment Method code'],
      dtype='object')


# Create a Train Test Split

Use `koi_disposition` for the y values

In [405]:
from sklearn.model_selection import train_test_split
X = pd.get_dummies(model_df["Gender code"], drop_first = True)
# X = model_df
y = df["Customer Status code"].values.reshape(-1, 1)
print(X.shape, y.shape)
print(X)
print(y)

(4601, 1) (4601, 1)
      2
0     0
1     1
2     1
3     1
4     0
...  ..
6582  0
6583  0
6585  0
6586  1
6587  1

[4601 rows x 1 columns]
[[1]
 [1]
 [2]
 ...
 [1]
 [2]
 [1]]


In [395]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

X_train

Unnamed: 0,Age,Number of Dependents,Number of Referrals,Tenure in Months,Avg Monthly Long Distance Charges,Avg Monthly GB Download,Monthly Charge,Total Charges,Total Long Distance Charges,Gender code,...,Internet Service code,Internet Type code,Online Security code,Online Backup code,Device Protection Plan code,Premium Tech Support code,Streaming TV code,Streaming Movies code,Unlimited Data code,Payment Method code
1385,22,0,0,22,15.77,42.0,69.70,1490.40,346.94,2,...,1,2,1,2,1,2,2,1,1,1
2200,32,0,7,40,24.30,28.0,50.15,2058.50,972.00,2,...,1,3,1,1,1,2,2,1,1,1
948,22,0,0,56,35.59,69.0,73.85,4092.85,1993.04,1,...,1,2,1,1,1,2,2,1,1,2
2149,69,0,0,29,10.55,2.0,79.30,2414.55,305.95,2,...,1,2,1,2,1,1,2,1,1,2
5515,69,0,1,32,33.74,6.0,93.20,2931.00,1079.68,1,...,1,2,1,2,1,2,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6347,37,0,0,13,3.84,4.0,81.15,952.30,49.92,1,...,1,2,1,1,2,2,2,1,1,2
657,49,0,0,26,37.51,30.0,69.05,1815.65,975.26,1,...,1,2,1,2,1,2,2,1,1,1
4429,53,1,5,69,21.56,17.0,79.20,5420.65,1487.64,2,...,1,2,1,2,2,2,2,1,1,1
5403,42,1,1,4,37.23,23.0,69.55,284.90,148.92,2,...,1,2,1,2,1,2,2,1,1,1


In [380]:
print(X_test)

      Age  Number of Dependents  Number of Referrals  Tenure in Months  \
5271   29                     0                    0                60   
6322   53                     0                    0                17   
3699   46                     0                    0                34   
98     22                     0                    0                71   
2644   43                     0                    0                33   
...   ...                   ...                  ...               ...   
4776   64                     2                    6                22   
1861   68                     0                    0                57   
634    30                     0                    2                69   
5406   36                     0                    7                72   
3150   72                     0                    0                 2   

      Avg Monthly Long Distance Charges  Avg Monthly GB Download  \
5271                              37.95    

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [358]:
print (X_train)

      Age  Number of Dependents  Number of Referrals  Tenure in Months  \
1385   22                     0                    0                22   
2200   32                     0                    7                40   
948    22                     0                    0                56   
2149   69                     0                    0                29   
5515   69                     0                    1                32   
...   ...                   ...                  ...               ...   
6347   37                     0                    0                13   
657    49                     0                    0                26   
4429   53                     1                    5                69   
5403   42                     1                    1                 4   
1246   52                     0                    6                72   

      Avg Monthly Long Distance Charges  Avg Monthly GB Download  \
1385                              15.77    

In [359]:
# Scale your data
from sklearn.preprocessing import StandardScaler,LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# # X_train & scale the data
# X_scaler = MinMaxScaler().fit(X_train)
# y_scaler = MinMaxScaler().fit(y_train)
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)


print(len(X_test_scaled),len(encoded_y_test))



1381 1381


  return f(*args, **kwargs)


# Train the Model



In [384]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

  return f(*args, **kwargs)


Training Data Score: 0.779192546583851
Testing Data Score: 0.77697320782042


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [361]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [1, 5, 10, 50],
            'max_iter':[1000, 3000, 10000]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [362]:
# Train the model with GridSearch
grid.fit(X_train_scaled, encoded_y_train)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END ................C=1, max_iter=1000;, score=0.784 total time=   0.0s
[CV 2/5] END ................C=1, max_iter=1000;, score=0.814 total time=   0.0s
[CV 3/5] END ................C=1, max_iter=1000;, score=0.792 total time=   0.0s
[CV 4/5] END ................C=1, max_iter=1000;, score=0.825 total time=   0.0s
[CV 5/5] END ................C=1, max_iter=1000;, score=0.815 total time=   0.0s
[CV 1/5] END ................C=1, max_iter=3000;, score=0.784 total time=   0.0s
[CV 2/5] END ................C=1, max_iter=3000;, score=0.814 total time=   0.0s
[CV 3/5] END ................C=1, max_iter=3000;, score=0.792 total time=   0.0s
[CV 4/5] END ................C=1, max_iter=3000;, score=0.825 total time=   0.0s
[CV 5/5] END ................C=1, max_iter=3000;, score=0.815 total time=   0.0s
[CV 1/5] END ...............C=1, max_iter=10000;, score=0.784 total time=   0.0s
[CV 2/5] END ...............C=1, max_iter=10000;

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [1, 5, 10, 50], 'max_iter': [1000, 3000, 10000]},
             verbose=3)

In [363]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 1, 'max_iter': 1000}
0.8059006211180124


In [378]:
print(X_train)

      Age  Number of Dependents  Number of Referrals  Tenure in Months  \
1385   22                     0                    0                22   
2200   32                     0                    7                40   
948    22                     0                    0                56   
2149   69                     0                    0                29   
5515   69                     0                    1                32   
...   ...                   ...                  ...               ...   
6347   37                     0                    0                13   
657    49                     0                    0                26   
4429   53                     1                    5                69   
5403   42                     1                    1                 4   
1246   52                     0                    6                72   

      Avg Monthly Long Distance Charges  Avg Monthly GB Download  \
1385                              15.77    

TypeError: Cannot interpret '1' as a data type

In [365]:
# # Visualizing both classes
# scatterplot?

# Save the Model

In [366]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'LogisticRegression.sav'
joblib.dump(model, filename)

['LogisticRegression.sav']