In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, f1_score, 
                             precision_score, recall_score, 
                             roc_auc_score, classification_report)

import warnings
warnings.filterwarnings('ignore')

In [2]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Attrition")

<Experiment: artifact_location='/home/godwin/Documents/Workflow/Attriton/notebooks/mlruns/1', creation_time=1701070848011, experiment_id='1', last_update_time=1701070848011, lifecycle_stage='active', name='Attrition', tags={}>

In [3]:
train_data = pd.read_csv('../data/newtrain1.csv')
test_data = pd.read_csv('../data/bct-data-summit/test.csv')

numerical_col = train_data.select_dtypes(exclude=['object']).columns.tolist()
numerical_col.remove('id')
numerical_col.remove( 'Attrition')
numerical_col.remove('EmployeeCount')
numerical_col.remove('StandardHours')

categorical_col = train_data.select_dtypes(include=['object']).columns.tolist()
categorical_col.remove('Over18')


train_data = train_data[train_data['TrainingTimesLastYear'] <= 4]
train_data = train_data[train_data['TrainingTimesLastYear'] > 0]
train_data = train_data[train_data['YearsSinceLastPromotion'] <= 5]
train_data = train_data[train_data['YearsWithCurrManager'] <= 13]

In [4]:
categorical_col = ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus', 
                   'OverTime','newage', 'masterylevel', 'loyaltylevel', 'oldyoung', 'loyal']
numerical_col = ['DailyRate', 'DistanceFromHome',  'Education',  'EnvironmentSatisfaction',
            'HourlyRate', 'JobInvolvement', 'JobSatisfaction',  'MonthlyIncome',  'NumCompaniesWorked', 
            'RelationshipSatisfaction',  'StockOptionLevel',  'TrainingTimesLastYear',  'WorkLifeBalance',]

In [5]:
train_df, test_df = train_test_split(train_data, test_size = 0.25, random_state=0)
train_y, test_y = train_df.pop('Attrition'), test_df.pop("Attrition")

In [6]:
vectorizer = DictVectorizer()

train_dicts = train_df[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_df[categorical_col + numerical_col].to_dict(orient='records')

vectorizer.fit(train_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

X_train = vectorizer.transform(train_dicts)
X_val = vectorizer.transform(val_dicts)

In [7]:
for c in(0.1, 0.5, 1, 3, 10):
    with mlflow.start_run():
        mlflow.set_tag('model', 'Logisticreg')
        mlflow.set_tag("data", "Full")
        mlflow.log_param("C", c)
        model = LogisticRegression(C = c)
        model.fit(X_train, train_y)
        prediction0 = model.predict_proba(X_val)[:,1]
        prediction = model.predict(X_val)
        output = {"acc":accuracy_score(test_y, prediction), 
                "f1_score":f1_score(test_y, prediction), 
                "precision":precision_score(test_y, prediction), 
                "recall":recall_score(test_y, prediction),
                "aroc": roc_auc_score(test_y, prediction0)}
        mlflow.log_metrics(output)
        print(c)
        print(output)
        print('\n')

0.1
{'acc': 0.8562300319488818, 'f1_score': 0.11764705882352941, 'precision': 0.25, 'recall': 0.07692307692307693, 'aroc': 0.7689500280741157}


0.5
{'acc': 0.865814696485623, 'f1_score': 0.3225806451612903, 'precision': 0.43478260869565216, 'recall': 0.2564102564102564, 'aroc': 0.7664233576642336}


1
{'acc': 0.8626198083067093, 'f1_score': 0.0851063829787234, 'precision': 0.25, 'recall': 0.05128205128205128, 'aroc': 0.7518248175182483}


3
{'acc': 0.8690095846645367, 'f1_score': 0.2545454545454545, 'precision': 0.4375, 'recall': 0.1794871794871795, 'aroc': 0.7698858319296275}


10
{'acc': 0.865814696485623, 'f1_score': 0.3, 'precision': 0.42857142857142855, 'recall': 0.23076923076923078, 'aroc': 0.7670784203630918}




In [8]:
scaler = StandardScaler()
train_df[numerical_col] = scaler.fit_transform(train_df[numerical_col])
test_df[numerical_col] = scaler.transform(test_df[numerical_col])

train_dicts = train_df[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_df[categorical_col + numerical_col].to_dict(orient='records')

vectorizer.fit(train_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

X_train = vectorizer.transform(train_dicts)
X_val = vectorizer.transform(val_dicts)

In [9]:
for c in(0.1, 0.5, 1, 3, 10):
    with mlflow.start_run():
        mlflow.set_tag('model', 'Logisticreg')
        mlflow.set_tag("data", "engineered")
        mlflow.set_tag("Scaler", "StandardScaler")
        mlflow.log_param("C", c)

        model = LogisticRegression(C = c)
        model.fit(X_train, train_y)
        prediction0 = model.predict_proba(X_val)[:,1]
        prediction = model.predict(X_val)
        output = {"acc":accuracy_score(test_y, prediction), 
                "f1_score":f1_score(test_y, prediction), 
                "precision":precision_score(test_y, prediction), 
                "recall":recall_score(test_y, prediction),
                "aroc": roc_auc_score(test_y, prediction0)}
        mlflow.log_metrics(output)
        print(c)
        print(output)
        print('\n\n')

0.1
{'acc': 0.8690095846645367, 'f1_score': 0.3050847457627119, 'precision': 0.45, 'recall': 0.23076923076923078, 'aroc': 0.8072244057645518}



0.5
{'acc': 0.8626198083067093, 'f1_score': 0.37681159420289856, 'precision': 0.43333333333333335, 'recall': 0.3333333333333333, 'aroc': 0.8153658993075051}



1
{'acc': 0.8626198083067093, 'f1_score': 0.37681159420289856, 'precision': 0.43333333333333335, 'recall': 0.3333333333333333, 'aroc': 0.8185476324162455}



3
{'acc': 0.8690095846645367, 'f1_score': 0.4225352112676056, 'precision': 0.46875, 'recall': 0.38461538461538464, 'aroc': 0.8210743028261276}



10
{'acc': 0.8690095846645367, 'f1_score': 0.4225352112676056, 'precision': 0.46875, 'recall': 0.38461538461538464, 'aroc': 0.822758749766049}





In [10]:
test_data = pd.read_csv('../data/bct-data-summit/test.csv')

test_data['newage'] = pd.cut(x = test_data['Age'], bins = [17, 30, 42, 61 ], labels = ['18 - 30', '31 - 42', '43 - 60'])


test_data['oldyoung'] = pd.cut(x = test_data['Age'], bins = [17, 30, 61], labels = ['young', 'old'])
test_data['loyal'] = pd.cut(x = test_data['YearsAtCompany'], bins = [-1, 3, 42], labels = ['fairly', 'loyal'])


test_data['masterylevel'] = pd.cut(x = test_data['TotalWorkingYears'], bins = [-1, 3, 10, 421], labels = ['entry', 'intermediate', 'master'])
test_data['loyaltylevel'] = pd.cut(x = test_data['YearsAtCompany'], bins = [-1, 3, 10, 42], labels = ['fairly', 'loyal', 'very-loyal'])
test_data['dueforprom'] = pd.cut(x = test_data['YearsSinceLastPromotion'], bins = [-1, 5,  16], labels = ['due', 'overdue'])

test_dat  = test_data[categorical_col + numerical_col].to_dict(orient='record')
X_test = vectorizer.transform(test_dat)

prediction = model.predict_proba(X_test)[:,1]
dicts = {'id': test_data['id'], 'Attrition': prediction}
output_frame = pd.DataFrame(dicts)

(output_frame['Attrition'] >=0.5).astype('int').sum()

1119

In [11]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [12]:
undersample = RandomUnderSampler(sampling_strategy=0.4, random_state= 0)
new_train_data = train_data.copy()
y = new_train_data.pop('Attrition')
X_train_new, y = undersample.fit_resample(new_train_data, y)
train_x,  test_x,train_y, test_y = train_test_split(X_train_new, y, 
                                                    test_size = 0.25, random_state=0)

In [13]:
scaler = StandardScaler()
train_x[numerical_col] = scaler.fit_transform(train_x[numerical_col])
test_x[numerical_col] = scaler.transform(test_x[numerical_col])

train_dicts = train_x[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_x[categorical_col + numerical_col].to_dict(orient='records')

vectorizer = DictVectorizer()
vectorizer.fit(train_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

X_train = vectorizer.transform(train_dicts)
X_val = vectorizer.transform(val_dicts)

In [14]:
for c in(0.1, 0.5, 1, 3, 10):
    with mlflow.start_run():
        mlflow.set_tag('model', 'Logisticreg')
        mlflow.set_tag("data", "engineered")
        mlflow.set_tag("scaler", "StandardScaler")
        mlflow.set_tag("sampling", "undersampled")
        mlflow.log_param("C", c)
        model = LogisticRegression(C = c)
        model.fit(X_train, train_y)
        prediction0 = model.predict_proba(X_val)[:,1]
        prediction = model.predict(X_val)
        output = {"acc":accuracy_score(test_y, prediction), 
                "f1_score":f1_score(test_y, prediction), 
                "precision":precision_score(test_y, prediction), 
                "recall":recall_score(test_y, prediction),
                "aroc": roc_auc_score(test_y, prediction0)}
        mlflow.log_metrics(output)
        print(c)
        print(output)
        print('\n\n')

0.1
{'acc': 0.803030303030303, 'f1_score': 0.59375, 'precision': 0.8636363636363636, 'recall': 0.4523809523809524, 'aroc': 0.8465608465608465}



0.5
{'acc': 0.8181818181818182, 'f1_score': 0.6363636363636364, 'precision': 0.875, 'recall': 0.5, 'aroc': 0.8425925925925926}



1
{'acc': 0.803030303030303, 'f1_score': 0.6060606060606061, 'precision': 0.8333333333333334, 'recall': 0.47619047619047616, 'aroc': 0.8338624338624337}



3
{'acc': 0.8257575757575758, 'f1_score': 0.6666666666666667, 'precision': 0.8518518518518519, 'recall': 0.5476190476190477, 'aroc': 0.8224867724867726}



10
{'acc': 0.7954545454545454, 'f1_score': 0.6301369863013699, 'precision': 0.7419354838709677, 'recall': 0.5476190476190477, 'aroc': 0.8052910052910053}





In [15]:
print(classification_report(test_y, prediction))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86        90
           1       0.74      0.55      0.63        42

    accuracy                           0.80       132
   macro avg       0.78      0.73      0.74       132
weighted avg       0.79      0.80      0.79       132



In [16]:
vectorizer = DictVectorizer()

train_dicts = train_x[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_x[categorical_col + numerical_col].to_dict(orient='records')

vectorizer.fit(train_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

X_train = vectorizer.transform(train_dicts)
X_val = vectorizer.transform(val_dicts)

In [17]:
for c in(0.1, 0.5, 1, 3, 10):
    with mlflow.start_run():
        mlflow.set_tag('model', 'Logisticreg')
        mlflow.set_tag("data", "engineered")
        mlflow.set_tag("scaler", "None")
        mlflow.set_tag("sampling", "undersampled")
        mlflow.log_param("C", c)
        model = LogisticRegression(C = c)
        model.fit(X_train, train_y)
        prediction0 = model.predict_proba(X_val)[:,1]
        prediction = model.predict(X_val)
        output = {"acc":accuracy_score(test_y, prediction), 
                "f1_score":f1_score(test_y, prediction), 
                "precision":precision_score(test_y, prediction), 
                "recall":recall_score(test_y, prediction),
                "aroc": roc_auc_score(test_y, prediction0)}
        mlflow.log_metrics(output)
        print(c)
        print(output)
        print('\n\n')

0.1
{'acc': 0.803030303030303, 'f1_score': 0.59375, 'precision': 0.8636363636363636, 'recall': 0.4523809523809524, 'aroc': 0.8465608465608465}



0.5
{'acc': 0.8181818181818182, 'f1_score': 0.6363636363636364, 'precision': 0.875, 'recall': 0.5, 'aroc': 0.8425925925925926}



1
{'acc': 0.803030303030303, 'f1_score': 0.6060606060606061, 'precision': 0.8333333333333334, 'recall': 0.47619047619047616, 'aroc': 0.8338624338624337}



3
{'acc': 0.8257575757575758, 'f1_score': 0.6666666666666667, 'precision': 0.8518518518518519, 'recall': 0.5476190476190477, 'aroc': 0.8224867724867726}



10
{'acc': 0.7954545454545454, 'f1_score': 0.6301369863013699, 'precision': 0.7419354838709677, 'recall': 0.5476190476190477, 'aroc': 0.8052910052910053}





In [18]:
oversample = SMOTE()
new_train_data = train_data.copy()
y = new_train_data.pop('Attrition')
train_x,  test_x,train_y, test_y = train_test_split(new_train_data, y, test_size = 0.25, random_state=0)

In [19]:
vectorizer = DictVectorizer()

train_dicts = train_x[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_x[categorical_col + numerical_col].to_dict(orient='records')
vectorizer.fit(train_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

X_train = vectorizer.transform(train_dicts)
X_val = vectorizer.transform(val_dicts)
X_train_new, train_y_new = oversample.fit_resample(X_train, train_y)

In [20]:
for c in(0.1, 0.5, 1, 3, 10):
    with mlflow.start_run():
        mlflow.set_tag('model', 'Logisticreg')
        mlflow.set_tag("data", "engineered")
        mlflow.set_tag("sampling", "oversampled")
        mlflow.log_param("C", c)
        model = LogisticRegression(C = c)
        model.fit(X_train_new, train_y_new)
        prediction0 = model.predict_proba(X_val)[:,1]
        prediction = model.predict(X_val)
        output = {"acc":accuracy_score(test_y, prediction), 
                "f1_score":f1_score(test_y, prediction), 
                "precision":precision_score(test_y, prediction), 
                "recall":recall_score(test_y, prediction),
                "aroc": roc_auc_score(test_y, prediction0)}
        mlflow.log_metrics(output)
        print(c)
        print(output)
        print('\n\n')

0.1
{'acc': 0.7188498402555911, 'f1_score': 0.38028169014084506, 'precision': 0.2621359223300971, 'recall': 0.6923076923076923, 'aroc': 0.759685569904548}



0.5
{'acc': 0.7188498402555911, 'f1_score': 0.3623188405797102, 'precision': 0.25252525252525254, 'recall': 0.6410256410256411, 'aroc': 0.7575332210368707}



1
{'acc': 0.7188498402555911, 'f1_score': 0.37142857142857144, 'precision': 0.25742574257425743, 'recall': 0.6666666666666666, 'aroc': 0.7579075425790754}



3
{'acc': 0.7092651757188498, 'f1_score': 0.35460992907801414, 'precision': 0.24509803921568626, 'recall': 0.6410256410256411, 'aroc': 0.7583754445068314}



10
{'acc': 0.7252396166134185, 'f1_score': 0.38571428571428573, 'precision': 0.26732673267326734, 'recall': 0.6923076923076923, 'aroc': 0.763148044169942}





In [21]:
scaler = StandardScaler()
train_x[numerical_col] = scaler.fit_transform(train_x[numerical_col])
test_x[numerical_col] = scaler.transform(test_x[numerical_col])

train_dicts = train_x[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_x[categorical_col + numerical_col].to_dict(orient='records')

vectorizer.fit(train_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

X_train = vectorizer.transform(train_dicts)
X_val = vectorizer.transform(val_dicts)

X_train_new, train_y_new = oversample.fit_resample(X_train, train_y)

In [22]:
for c in(0.1, 0.5, 1, 3, 10):
    with mlflow.start_run():
        mlflow.set_tag('model', 'Logisticreg')
        mlflow.set_tag("data", "engineered")
        mlflow.set_tag('scaler', 'StandardScaler')
        mlflow.set_tag("sampling", "oversampled")
        mlflow.log_param("C", c)
        model = LogisticRegression(C = c)
        model.fit(X_train_new, train_y_new)
        prediction0 = model.predict_proba(X_val)[:,1]
        prediction = model.predict(X_val)
        output = {"acc":accuracy_score(test_y, prediction), 
                "f1_score":f1_score(test_y, prediction), 
                "precision":precision_score(test_y, prediction), 
                "recall":recall_score(test_y, prediction),
                "aroc": roc_auc_score(test_y, prediction0)}
        mlflow.log_metrics(output)
        print(c)
        print(output)
        print('\n\n')

0.1
{'acc': 0.7252396166134185, 'f1_score': 0.41891891891891897, 'precision': 0.28440366972477066, 'recall': 0.7948717948717948, 'aroc': 0.8139621935242373}



0.5
{'acc': 0.744408945686901, 'f1_score': 0.43661971830985913, 'precision': 0.30097087378640774, 'recall': 0.7948717948717948, 'aroc': 0.820699981283923}



1
{'acc': 0.7476038338658147, 'f1_score': 0.4397163120567376, 'precision': 0.30392156862745096, 'recall': 0.7948717948717948, 'aroc': 0.822197267452742}



3
{'acc': 0.7476038338658147, 'f1_score': 0.4397163120567376, 'precision': 0.30392156862745096, 'recall': 0.7948717948717948, 'aroc': 0.8228523301516002}



10
{'acc': 0.744408945686901, 'f1_score': 0.43661971830985913, 'precision': 0.30097087378640774, 'recall': 0.7948717948717948, 'aroc': 0.8228523301516002}





In [23]:
test_data = pd.read_csv('../data/bct-data-summit/test.csv')

test_data['newage'] = pd.cut(x = test_data['Age'], bins = [17, 30, 42, 61 ], labels = ['18 - 30', '31 - 42', '43 - 60'])
test_data['oldyoung'] = pd.cut(x = test_data['Age'], bins = [17, 30, 61], labels = ['young', 'old'])
test_data['loyal'] = pd.cut(x = test_data['YearsAtCompany'], bins = [-1, 3, 42], labels = ['fairly', 'loyal'])

test_data['masterylevel'] = pd.cut(x = test_data['TotalWorkingYears'], bins = [-1, 3, 10, 421], labels = ['entry', 'intermediate', 'master'])
test_data['loyaltylevel'] = pd.cut(x = test_data['YearsAtCompany'], bins = [-1, 3, 10, 42], labels = ['fairly', 'loyal', 'very-loyal'])
test_data['dueforprom'] = pd.cut(x = test_data['YearsSinceLastPromotion'], bins = [-1, 5,  16], labels = ['due', 'overdue'])

In [24]:
test_data[numerical_col] = scaler.transform(test_data[numerical_col])
test_dat  = test_data[categorical_col + numerical_col].to_dict(orient='record')
X_test = vectorizer.transform(test_dat)

prediction = model.predict_proba(X_test)[:,1]
dicts = {'id': test_data['id'], 'Attrition': prediction}
output_frame = pd.DataFrame(dicts)

(output_frame['Attrition'] >=0.5).astype('int').sum()

292

In [25]:
output_frame.to_csv('../submissions/lnreg001.csv', index = False)