In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, f1_score, 
                             precision_score, recall_score, 
                             roc_auc_score, classification_report)

import warnings
warnings.filterwarnings('ignore')

In [2]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Attrition")

<Experiment: artifact_location='/home/godwin/Documents/Workflow/Attriton/notebooks/mlruns/1', creation_time=1703084781950, experiment_id='1', last_update_time=1703084781950, lifecycle_stage='active', name='Attrition', tags={}>

In [7]:
train_data = pd.read_csv('../processed_data/attrition.csv')

numerical_col = train_data.select_dtypes(exclude=['object']).columns.tolist()
numerical_col.remove('id')
numerical_col.remove('attrition')

categorical_col = train_data.select_dtypes(include=['object']).columns.tolist()

In [8]:
train_df, test_df = train_test_split(train_data, test_size = 0.25, random_state=0)
train_y, test_y = train_df.pop('attrition'), test_df.pop("attrition")

In [9]:
vectorizer = DictVectorizer()

train_dicts = train_df[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_df[categorical_col + numerical_col].to_dict(orient='records')

vectorizer.fit(train_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

X_train = vectorizer.transform(train_dicts)
X_val = vectorizer.transform(val_dicts)

In [10]:
for c in(0.1, 0.5, 1, 3, 10):
    with mlflow.start_run():
        mlflow.set_tag('model', 'Logisticreg')
        mlflow.log_param("C", c)
        model = LogisticRegression(C = c)
        model.fit(X_train, train_y)
        prediction0 = model.predict_proba(X_val)[:,1]
        prediction = model.predict(X_val)
        output = {"acc":accuracy_score(test_y, prediction), 
                "f1_score":f1_score(test_y, prediction), 
                "precision":precision_score(test_y, prediction), 
                "recall":recall_score(test_y, prediction),
                "aroc": roc_auc_score(test_y, prediction0)}
        mlflow.log_metrics(output)
        output["C_value"] = c
        print(output)

{'acc': 0.8809523809523809, 'f1_score': 0.0, 'precision': 0.0, 'recall': 0.0, 'aroc': 0.7095135135135135, 'C_value': 0.1}
{'acc': 0.8809523809523809, 'f1_score': 0.0, 'precision': 0.0, 'recall': 0.0, 'aroc': 0.758972972972973, 'C_value': 0.5}
{'acc': 0.8809523809523809, 'f1_score': 0.0, 'precision': 0.0, 'recall': 0.0, 'aroc': 0.7053513513513513, 'C_value': 1}
{'acc': 0.8809523809523809, 'f1_score': 0.0, 'precision': 0.0, 'recall': 0.0, 'aroc': 0.7511351351351351, 'C_value': 3}
{'acc': 0.8809523809523809, 'f1_score': 0.0, 'precision': 0.0, 'recall': 0.0, 'aroc': 0.7087027027027027, 'C_value': 10}


In [11]:
scaler = StandardScaler()
train_df[numerical_col] = scaler.fit_transform(train_df[numerical_col])
test_df[numerical_col] = scaler.transform(test_df[numerical_col])

train_dicts = train_df[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_df[categorical_col + numerical_col].to_dict(orient='records')

vectorizer.fit(train_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

X_train = vectorizer.transform(train_dicts)
X_val = vectorizer.transform(val_dicts)

In [12]:
for c in(0.1, 0.5, 1, 3, 10):
    with mlflow.start_run():
        mlflow.set_tag('model', 'Logisticreg')
        mlflow.set_tag("Scaler", "StandardScaler")
        mlflow.log_param("C", c)

        model = LogisticRegression(C = c)
        model.fit(X_train, train_y)
        prediction0 = model.predict_proba(X_val)[:,1]
        prediction = model.predict(X_val)
        output = {"acc":accuracy_score(test_y, prediction), 
                "f1_score":f1_score(test_y, prediction), 
                "precision":precision_score(test_y, prediction), 
                "recall":recall_score(test_y, prediction),
                "aroc": roc_auc_score(test_y, prediction0)}
        mlflow.log_metrics(output)
        output["C_value"] = c
        print(output)

{'acc': 0.9, 'f1_score': 0.3823529411764706, 'precision': 0.7222222222222222, 'recall': 0.26, 'aroc': 0.8568108108108108, 'C_value': 0.1}
{'acc': 0.8904761904761904, 'f1_score': 0.36111111111111116, 'precision': 0.5909090909090909, 'recall': 0.26, 'aroc': 0.8571351351351352, 'C_value': 0.5}
{'acc': 0.888095238095238, 'f1_score': 0.3561643835616438, 'precision': 0.5652173913043478, 'recall': 0.26, 'aroc': 0.8574594594594593, 'C_value': 1}
{'acc': 0.888095238095238, 'f1_score': 0.3561643835616438, 'precision': 0.5652173913043478, 'recall': 0.26, 'aroc': 0.8571351351351352, 'C_value': 3}
{'acc': 0.888095238095238, 'f1_score': 0.3561643835616438, 'precision': 0.5652173913043478, 'recall': 0.26, 'aroc': 0.8568648648648648, 'C_value': 10}


In [13]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [14]:
undersample = RandomUnderSampler(sampling_strategy=0.4, random_state= 0)
new_train_data = train_data.copy()
y = new_train_data.pop('attrition')
X_train_new, y = undersample.fit_resample(new_train_data, y)
train_x,  test_x,train_y, test_y = train_test_split(X_train_new, y, 
                                                    test_size = 0.25, random_state=0)

In [15]:
scaler = StandardScaler()
train_x[numerical_col] = scaler.fit_transform(train_x[numerical_col])
test_x[numerical_col] = scaler.transform(test_x[numerical_col])

train_dicts = train_x[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_x[categorical_col + numerical_col].to_dict(orient='records')

vectorizer = DictVectorizer()
vectorizer.fit(train_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

X_train = vectorizer.transform(train_dicts)
X_val = vectorizer.transform(val_dicts)

In [16]:
for c in(0.1, 0.5, 1, 3, 10):
    with mlflow.start_run():
        mlflow.set_tag('model', 'Logisticreg')
        mlflow.set_tag("scaler", "StandardScaler")
        mlflow.set_tag("sampling", "undersampled")
        mlflow.log_param("C", c)
        model = LogisticRegression(C = c)
        model.fit(X_train, train_y)
        prediction0 = model.predict_proba(X_val)[:,1]
        prediction = model.predict(X_val)
        output = {"acc":accuracy_score(test_y, prediction), 
                "f1_score":f1_score(test_y, prediction), 
                "precision":precision_score(test_y, prediction), 
                "recall":recall_score(test_y, prediction),
                "aroc": roc_auc_score(test_y, prediction0)}
        mlflow.log_metrics(output)
        output["C_value"] = c
        print(output)

{'acc': 0.7828571428571428, 'f1_score': 0.5476190476190476, 'precision': 0.6388888888888888, 'recall': 0.4791666666666667, 'aroc': 0.8307086614173228, 'C_value': 0.1}
{'acc': 0.7828571428571428, 'f1_score': 0.5681818181818181, 'precision': 0.625, 'recall': 0.5208333333333334, 'aroc': 0.8259514435695539, 'C_value': 0.5}
{'acc': 0.7771428571428571, 'f1_score': 0.5617977528089888, 'precision': 0.6097560975609756, 'recall': 0.5208333333333334, 'aroc': 0.8231627296587927, 'C_value': 1}
{'acc': 0.7828571428571428, 'f1_score': 0.5681818181818181, 'precision': 0.625, 'recall': 0.5208333333333334, 'aroc': 0.8179133858267716, 'C_value': 3}
{'acc': 0.7828571428571428, 'f1_score': 0.5681818181818181, 'precision': 0.625, 'recall': 0.5208333333333334, 'aroc': 0.8154527559055118, 'C_value': 10}


In [17]:
print(classification_report(test_y, prediction))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85       127
           1       0.62      0.52      0.57        48

    accuracy                           0.78       175
   macro avg       0.73      0.70      0.71       175
weighted avg       0.77      0.78      0.78       175



In [18]:
oversample = SMOTE()
new_train_data = train_data.copy()
y = new_train_data.pop('attrition')
train_x,  test_x,train_y, test_y = train_test_split(new_train_data, y, test_size = 0.25, random_state=0)

In [19]:
vectorizer = DictVectorizer()

train_dicts = train_x[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_x[categorical_col + numerical_col].to_dict(orient='records')
vectorizer.fit(train_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

X_train = vectorizer.transform(train_dicts)
X_val = vectorizer.transform(val_dicts)
X_train_new, train_y_new = oversample.fit_resample(X_train, train_y)

In [20]:
for c in(0.1, 0.5, 1, 3, 10):
    with mlflow.start_run():
        mlflow.set_tag('model', 'Logisticreg')
        mlflow.set_tag("sampling", "oversampled")
        mlflow.log_param("C", c)
        model = LogisticRegression(C = c)
        model.fit(X_train_new, train_y_new)
        prediction0 = model.predict_proba(X_val)[:,1]
        prediction = model.predict(X_val)
        output = {"acc":accuracy_score(test_y, prediction), 
                "f1_score":f1_score(test_y, prediction), 
                "precision":precision_score(test_y, prediction), 
                "recall":recall_score(test_y, prediction),
                "aroc": roc_auc_score(test_y, prediction0)}
        mlflow.log_metrics(output)
        output["C_value"] = c
        print(output)

{'acc': 0.6761904761904762, 'f1_score': 0.33980582524271846, 'precision': 0.22435897435897437, 'recall': 0.7, 'aroc': 0.7465405405405406, 'C_value': 0.1}
{'acc': 0.6547619047619048, 'f1_score': 0.30622009569377995, 'precision': 0.20125786163522014, 'recall': 0.64, 'aroc': 0.7128108108108108, 'C_value': 0.5}
{'acc': 0.65, 'f1_score': 0.30331753554502366, 'precision': 0.19875776397515527, 'recall': 0.64, 'aroc': 0.701081081081081, 'C_value': 1}
{'acc': 0.65, 'f1_score': 0.30331753554502366, 'precision': 0.19875776397515527, 'recall': 0.64, 'aroc': 0.7185945945945946, 'C_value': 3}
{'acc': 0.6166666666666667, 'f1_score': 0.296943231441048, 'precision': 0.18994413407821228, 'recall': 0.68, 'aroc': 0.6805945945945945, 'C_value': 10}


In [21]:
scaler = StandardScaler()
train_x[numerical_col] = scaler.fit_transform(train_x[numerical_col])
test_x[numerical_col] = scaler.transform(test_x[numerical_col])

train_dicts = train_x[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_x[categorical_col + numerical_col].to_dict(orient='records')

vectorizer.fit(train_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

X_train = vectorizer.transform(train_dicts)
X_val = vectorizer.transform(val_dicts)

X_train_new, train_y_new = oversample.fit_resample(X_train, train_y)

In [22]:
for c in(0.1, 0.5, 1, 3, 10):
    with mlflow.start_run():
        mlflow.set_tag('model', 'Logisticreg')
        mlflow.set_tag('scaler', 'StandardScaler')
        mlflow.set_tag("sampling", "oversampled")
        mlflow.log_param("C", c)
        model = LogisticRegression(C = c)
        model.fit(X_train_new, train_y_new)
        prediction0 = model.predict_proba(X_val)[:,1]
        prediction = model.predict(X_val)
        output = {"acc":accuracy_score(test_y, prediction), 
                "f1_score":f1_score(test_y, prediction), 
                "precision":precision_score(test_y, prediction), 
                "recall":recall_score(test_y, prediction),
                "aroc": roc_auc_score(test_y, prediction0)}
        mlflow.log_metrics(output)
        output["C_value"] = c
        print(output)

{'acc': 0.7833333333333333, 'f1_score': 0.46153846153846156, 'precision': 0.3277310924369748, 'recall': 0.78, 'aroc': 0.8525945945945946, 'C_value': 0.1}
{'acc': 0.7928571428571428, 'f1_score': 0.4790419161676647, 'precision': 0.3418803418803419, 'recall': 0.8, 'aroc': 0.8528648648648649, 'C_value': 0.5}
{'acc': 0.7904761904761904, 'f1_score': 0.47619047619047616, 'precision': 0.3389830508474576, 'recall': 0.8, 'aroc': 0.8527027027027027, 'C_value': 1}
{'acc': 0.7904761904761904, 'f1_score': 0.47619047619047616, 'precision': 0.3389830508474576, 'recall': 0.8, 'aroc': 0.8533513513513513, 'C_value': 3}
{'acc': 0.7904761904761904, 'f1_score': 0.47619047619047616, 'precision': 0.3389830508474576, 'recall': 0.8, 'aroc': 0.8535675675675676, 'C_value': 10}


In [23]:
test_data = pd.read_csv('../data/bct-data-summit/test.csv')

test_data['newage'] = pd.cut(x = test_data['Age'], bins = [17, 30, 42, 61 ], labels = ['18 - 30', '31 - 42', '43 - 60'])
test_data['oldyoung'] = pd.cut(x = test_data['Age'], bins = [17, 30, 61], labels = ['young', 'old'])
test_data['loyal'] = pd.cut(x = test_data['YearsAtCompany'], bins = [-1, 3, 42], labels = ['fairly', 'loyal'])

test_data['masterylevel'] = pd.cut(x = test_data['TotalWorkingYears'], bins = [-1, 3, 10, 421], labels = ['entry', 'intermediate', 'master'])
test_data['loyaltylevel'] = pd.cut(x = test_data['YearsAtCompany'], bins = [-1, 3, 10, 42], labels = ['fairly', 'loyal', 'very-loyal'])
test_data['dueforprom'] = pd.cut(x = test_data['YearsSinceLastPromotion'], bins = [-1, 5,  16], labels = ['due', 'overdue'])

In [24]:
test_data[numerical_col] = scaler.transform(test_data[numerical_col])
test_dat  = test_data[categorical_col + numerical_col].to_dict(orient='record')
X_test = vectorizer.transform(test_dat)

prediction = model.predict_proba(X_test)[:,1]
dicts = {'id': test_data['id'], 'Attrition': prediction}
output_frame = pd.DataFrame(dicts)
output_frame.to_csv('../submissions/lnreg001.csv', index = False)

292