In [1]:
# %pip install imbalanced-learn  # PyPI distribution that provides the `imblearn` module

import os

import pandas as pd
import numpy as np

# store elements as dictionary keys and their counts as dictionary values
from collections import Counter

# scikit-learn
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline

# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report

# Function for creating model pipelines - sklearn
from sklearn.pipeline import make_pipeline

# Function for creating model pipelines - imblearn
from imblearn.pipeline import make_pipeline as imbl_pipe

# Over-sampling using SMOTE
from imblearn.over_sampling import SMOTE


ModuleNotFoundError: No module named 'imblearn'

In [None]:
# Load the analytical base table from the Resources folder (path is relative
# to the notebook's working directory) and preview its first rows
abt = pd.read_csv("../Resources/analytical_base_table.csv")
abt.head()

## Models Training

## Let's start by splitting our dataframe into separate objects:

y for the target variable

X for the input features

In [None]:
# Split the base table into the label column and the feature matrix

# Target: the 'Exited' column
y = abt['Exited']

# Features: every remaining column
X = abt.drop(columns=['Exited'])

# Show the dimensions of both objects
print(X.shape, y.shape)


In [None]:
# Names of the numeric input columns
num_columns = list(X.select_dtypes(include='number').columns)
num_columns

In [None]:
# Names of the categorical (object-dtype) input columns
cat_columns = list(X.select_dtypes(include='object').columns)
cat_columns

In [None]:
def class_count(a):
    """Tabulate how often each class label occurs in ``a``.

    Parameters
    ----------
    a : iterable of hashable
        Class labels, e.g. a pandas Series, a NumPy array or a plain list.

    Returns
    -------
    pandas.DataFrame
        One row per distinct label with the columns ``'Exited'`` (the label),
        ``'Count'`` (int64 number of occurrences) and ``'%'`` (share of all
        observations in percent, rounded to 2 decimals), sorted by
        ``'Count'`` in descending order.
    """
    counter = Counter(a)
    # Build the two columns separately so the labels keep their own dtype;
    # stacking labels and counts into one NumPy array would coerce both to a
    # common type (e.g. counts turned into strings for string labels).
    abt2 = pd.DataFrame({
        'Exited': list(counter.keys()),
        'Count': pd.Series(list(counter.values()), dtype='int64'),
    })
    # Derive the total from the counts so inputs without `.shape`
    # (lists, tuples, generators) are supported as well.
    total = sum(counter.values())
    abt2['%'] = (abt2['Count'] / total * 100).round(2)
    return abt2.sort_values('Count', ascending=False)

In [None]:
# Class distribution of the target over the full dataset (before splitting)
class_count(y)


## Create a Train Test Split

We will continue with splitting our data into separate training and test sets.

30% of observations will be set aside for the test set

the rest, 70%, will be used as the training set

In [None]:
# Seed reused by the split, SMOTE and the classifier so runs are repeatable
random_state = 10

# Hold out 30% of the rows for testing; stratify on the target so both
# splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=random_state,
)

# Report the number of observations in each of the four splits
print(len(X_train), len(X_test), len(y_train), len(y_test))

In [None]:
# Column dtypes and non-null counts of the training features
X_train.info()

## Pre-processing Pipeline
 
## Scale numerical data and encode categorical data

Construct a pre-processing pipeline from the given transformers: `MinMaxScaler` and `OneHotEncoder`. First, create lists of column indexes from the lists of column names: because the features are later passed to the model as NumPy arrays, the column transformer must select columns by numeric position, not by name.

In [None]:
# Positional index of every numeric column within X
num_features = [X.columns.get_loc(name) for name in num_columns]
print(num_features)

In [None]:
# Positional index of every categorical column within X
cat_features = [X.columns.get_loc(name) for name in cat_columns]
print(cat_features)

In [None]:
# Define column transformer
# Columns are addressed by integer position (not by name) because the
# features are handed to the model as NumPy arrays, which carry no column names.
try:
    # scikit-learn >= 1.2: `sparse` was renamed to `sparse_output`
    # (the old keyword was removed in 1.4 and raises TypeError there).
    one_hot = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn releases only know the original keyword.
    one_hot = OneHotEncoder(sparse=False)

preprocess = make_column_transformer(
    (MinMaxScaler(), num_features),   # scale numeric columns to [0, 1]
    (one_hot, cat_features)           # dense one-hot encoding of categoricals
)
preprocess

In [None]:
# Import classifier
from sklearn.linear_model import LogisticRegression 

# Define model with pipeline: pre-process the features, oversample with SMOTE,
# then fit a logistic regression. The imblearn pipeline (rather than the
# sklearn one) is used because it accepts a sampler step such as SMOTE;
# per the imbalanced-learn docs, samplers are applied during fit only.
model = imbl_pipe(preprocess,
                  SMOTE(sampling_strategy='auto', random_state=random_state),
                  LogisticRegression(random_state=random_state))

model

In [None]:
 # Create the GridSearchCV model
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters, addressed as <pipeline step>__<parameter>
lr_param_grid = dict(
    logisticregression__C=[0.01, 0.05, 0.1, 0.5, 1, 5],
    logisticregression__solver=['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
)

# 5-fold cross-validated search over the grid, scored by accuracy
lr_grid = GridSearchCV(
    estimator=model,
    param_grid=lr_param_grid,
    scoring='accuracy',
    cv=5,
    verbose=3,
)

In [None]:
# Convert the feature splits to NumPy arrays (the column transformer selects
# columns by position). np.asarray leaves an existing array untouched, so
# re-running this cell is safe, whereas `.values` only exists on the
# DataFrame and would raise AttributeError on a second run.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

In [None]:
# Run the cross-validated grid search on the training data
lr_grid.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination selected by the grid search
print(lr_grid.best_params_)

In [None]:
# Mean cross-validated accuracy achieved by the selected combination
print(lr_grid.best_score_)

In [None]:
# Accuracy of the best estimator on the training data and on the held-out test data
print(f"Training Data Score: {lr_grid.score(X_train, y_train)}")
print(f"Testing Data Score: {lr_grid.score(X_test, y_test)}")

In [None]:
# Predict the class of every test observation and compare the first ten
# predictions with the corresponding true labels
predictions = lr_grid.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

In [None]:
# Predictions next to the true labels; the original row index of y_test is
# dropped so the table is numbered from 0
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [None]:
# Confusion matrix of raw counts (true labels passed first, predictions second)
cm = confusion_matrix(y_test, predictions)
print(cm)

In [None]:
# Divide every row by its own total so each row sums to 1, then round
# to two decimals (note: this overwrites the raw-count matrix in `cm`)
cm = np.round(cm / cm.sum(axis=1, keepdims=True), decimals=2)
print(cm)

In [None]:
# Per-class classification metrics on the test set
print(classification_report(y_test, predictions))

In [None]:
# Predict the class of the first test observation only
pred = lr_grid.predict(X_test[:1])

In [None]:
# Compare the single prediction with the true label of the same observation
print(f"Predicted classes: {pred}")
print(f"Actual Labels: {list(y_test[:1])}")

In [None]:
import joblib

filename = '../models/nate_logistic_regression.sav'
# Make sure the target folder exists: joblib.dump does not create missing
# directories and would otherwise fail with FileNotFoundError.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# Persist the whole fitted grid-search object (pipeline included)
joblib.dump(lr_grid, filename)

In [None]:
# Reload the saved model from disk and check that it still scores the test data.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
lr_model = joblib.load(filename)
print(lr_model.score(X_test, y_test))

### Predict class for new data

In [None]:
# Use the first X_test record as a stand-in for new, unseen data
X_test[:1]

In [None]:
# Predict the class of that single record with the fitted grid-search model
pred_new = lr_grid.predict(X_test[:1])

In [None]:
# Show the prediction for the record next to its true label
print(f"Predicted classes: {pred_new}")
print(f"Actual Labels: {list(y_test[:1])}")

In [None]:
# X_test was converted to a NumPy array earlier in the notebook, and arrays
# have no `.values` attribute (accessing it raises AttributeError).
# np.asarray returns the raw values whether X_test is an array or a DataFrame.
np.asarray(X_test[:1])

In [None]:
# Predict the same single record again and display the resulting array
pred_new1 = lr_grid.predict(X_test[:1])
pred_new1