In [1]:
# Mount Google Drive so the project files under /content/drive are readable
# from this Colab runtime.
from google.colab import drive

# This will prompt for authorization. force_remount=True asks Colab to mount
# again even when the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
ls "/content/drive/My Drive/GDL - Cohort 18 Collaboration Drive/Project Files/C18 - 38 Supervised attrition models/"

Employee_Attrition_predition_using_SVM.ipynb  New_Attrition_data.csv
naive_bayes_model.ipynb                       TTSS_Regression.ipynb
New_Attrition_Data_Cleaning.ipynb


In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import regularizers
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

In [4]:
# Load the TTSS attrition activity data (New_Attrition_data.csv) from the
# mounted project folder on Google Drive.
# NOTE(review): the recorded output starts with a truncated warning fragment
# (`exec(code_obj, ...)`), presumably a pandas DtypeWarning about mixed-type
# columns - confirm, and consider passing explicit dtypes to read_csv.
df = pd.read_csv("/content/drive/My Drive/GDL - Cohort 18 Collaboration Drive/Project Files/C18 - 38 Supervised attrition models/New_Attrition_data.csv")
# Preview the first rows.
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,ActivityDate,ID,ActivityCategoryName,StartTimeInLocal,EndTimeInLocal,FileOrUrlName,TimeSpent,isCore,ActivityTime,Description,Effective Date,JobFamilyName,DepartmentName,TeamName,DoW,Days to termination
0,2021-08-05,14906,Other,2021-08-05 08:57:48,2021-08-05 08:58:03,https://passportprd.tenent39-dns.com/ecomes.zw...,0.25,1,0.004167,0,2021-08-17,Domestic Mandatory Corporate Actions,TTSS,909,3,12
1,2021-07-20,14906,Other,2021-07-20 15:10:48,2021-07-20 15:11:03,microsoft teams,0.25,1,0.004167,0,2021-08-17,Domestic Mandatory Corporate Actions,TTSS,909,1,28
2,2021-09-01,14975,Other,2021-09-01 10:40:50,2021-09-01 10:41:05,excel,0.25,1,0.004167,0,2021-12-03,Domestic Trade Services,TTSS,1168,2,93
3,2022-01-28,14932,Other,2022-01-28 10:00:11,2022-01-28 10:00:26,microsoft teams,0.25,1,0.004167,0,2022-08-05,Income - DTC Debt,TTSS,1177,4,189
4,2021-11-30,14975,Other,2021-11-30 10:26:22,2021-11-30 10:26:52,message (html),0.5,0,0.008333,0,2021-12-03,Domestic Trade Services,TTSS,1168,1,3


In [5]:
# List the column names of the loaded frame.
df.columns

Index(['ActivityDate', 'ID', 'ActivityCategoryName', 'StartTimeInLocal',
       'EndTimeInLocal', 'FileOrUrlName', 'TimeSpent', 'isCore',
       'ActivityTime', 'Description', 'Effective Date', 'JobFamilyName',
       'DepartmentName', 'TeamName', 'DoW', 'Days to termination'],
      dtype='object')

In [6]:
# Count missing values per column.
df.isna().sum()

ActivityDate                  0
ID                            0
ActivityCategoryName          0
StartTimeInLocal        3549804
EndTimeInLocal          3549804
FileOrUrlName           4282907
TimeSpent               3549804
isCore                        0
ActivityTime                  0
Description                   0
Effective Date          3549804
JobFamilyName                 0
DepartmentName                0
TeamName                      0
DoW                           0
Days to termination           0
dtype: int64

In [7]:
# Report the frame's dimensions (rows, columns) and the dtype of each column.
print(df.shape)
print(df.dtypes)

(11526472, 16)
ActivityDate             object
ID                        int64
ActivityCategoryName     object
StartTimeInLocal         object
EndTimeInLocal           object
FileOrUrlName            object
TimeSpent               float64
isCore                    int64
ActivityTime            float64
Description               int64
Effective Date           object
JobFamilyName            object
DepartmentName           object
TeamName                  int64
DoW                       int64
Days to termination       int64
dtype: object


In [8]:
# Drop the columns that contain missing values and are not used as features.
# Reassigning with errors="ignore" (instead of an in-place drop) keeps this
# cell safe to re-run: an in-place drop raises KeyError the second time
# because the columns are already gone.
df = df.drop(
    columns=["StartTimeInLocal", "TimeSpent", "Effective Date", "EndTimeInLocal", "FileOrUrlName"],
    errors="ignore",
)
df.head(10)

Unnamed: 0,ActivityDate,ID,ActivityCategoryName,isCore,ActivityTime,Description,JobFamilyName,DepartmentName,TeamName,DoW,Days to termination
0,2021-08-05,14906,Other,1,0.004167,0,Domestic Mandatory Corporate Actions,TTSS,909,3,12
1,2021-07-20,14906,Other,1,0.004167,0,Domestic Mandatory Corporate Actions,TTSS,909,1,28
2,2021-09-01,14975,Other,1,0.004167,0,Domestic Trade Services,TTSS,1168,2,93
3,2022-01-28,14932,Other,1,0.004167,0,Income - DTC Debt,TTSS,1177,4,189
4,2021-11-30,14975,Other,0,0.008333,0,Domestic Trade Services,TTSS,1168,1,3
5,2021-08-18,14932,Other,1,0.0125,0,Income - DTC Debt,TTSS,1177,2,352
6,2021-11-24,14975,Other,1,0.041667,0,Domestic Trade Services,TTSS,1168,2,9
7,2021-08-04,14906,Other,1,0.004167,0,Domestic Mandatory Corporate Actions,TTSS,909,2,13
8,2021-10-01,14932,Other,1,0.004167,0,Income - DTC Debt,TTSS,1177,4,308
9,2021-08-23,14975,Other,1,0.004167,0,Domestic Trade Services,TTSS,1168,0,102


In [9]:
# Class balance of the "Description" column.
# NOTE(review): in the recorded outputs the count for value 1 (3,549,804)
# equals the missing-value count reported for StartTimeInLocal, EndTimeInLocal,
# TimeSpent and Effective Date, so this label may simply mark rows that lack
# those fields - confirm what "Description" encodes before modelling on it.
df["Description"].value_counts()

0    7976668
1    3549804
Name: Description, dtype: int64

In [10]:
# Name of the label column passed to handle_features as the prediction target.
target_col = "Description"

In [11]:
def handle_features(df, target_col):
    """Impute, encode and index an activity frame, then split it into X and y.

    The input frame is not modified: all work happens on a copy, so the
    function (and the notebook cell calling it) can be run repeatedly on the
    same ``df``.

    Steps:
      1. Fill missing values in int64/float64 columns with the column mean.
      2. Fill missing values in object columns (except 'ActivityDate') with
         the column mode.
      3. Parse 'ActivityDate' to datetime and use it as the index.
      4. Replace every object column (except 'ActivityDate') with integer
         codes assigned in sorted order of its unique values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'ActivityDate' column and ``target_col``.
    target_col : str
        Name of the label column.

    Returns
    -------
    (X, y) : tuple of (pandas.DataFrame, pandas.Series)
        ``X`` holds every column except ``target_col``; ``y`` is the target.
        Both are indexed by the parsed 'ActivityDate'.

    Raises
    ------
    KeyError
        If 'ActivityDate' or ``target_col`` is not a column of ``df``.
    """
    date_col = 'ActivityDate'

    # Copy first: the original implementation mutated the caller's frame
    # (fillna / set_index in place), which made a second call fail because
    # 'ActivityDate' had already been moved into the index.
    data = df.copy()

    # Separate numerical and categorical columns. The date column is filtered
    # out rather than removed with list.remove(), so an already-parsed
    # (non-object) date column does not raise.
    num_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_cols = [
        col for col in data.select_dtypes(include=['object']).columns
        if col != date_col
    ]

    # Impute numerical columns with the mean. Assign the result back instead
    # of calling fillna(inplace=True) on a column selection, which is not
    # guaranteed to write through to the frame.
    for col in num_cols:
        if data[col].isna().any():
            data[col] = data[col].fillna(data[col].mean())

    # Impute categorical columns with the most frequent value.
    for col in cat_cols:
        if data[col].isna().any():
            data[col] = data[col].fillna(data[col].mode()[0])

    data[date_col] = pd.to_datetime(data[date_col])

    # Integer-encode categorical columns. factorize(sort=True) numbers the
    # sorted unique values 0..k-1, the same coding LabelEncoder produces,
    # without using an encoder that scikit-learn documents as target-only.
    for col in cat_cols:
        codes, _ = pd.factorize(data[col], sort=True)
        data[col] = codes

    data = data.set_index(date_col)

    X = data.drop(columns=target_col)
    y = data[target_col]
    return X, y

In [12]:
# Build the feature matrix X and the target y, then preview the first rows of each.
X, y = handle_features(df, target_col)
print(X.head(5))
print(y.head(5))

                 ID  ActivityCategoryName  isCore  ActivityTime  \
ActivityDate                                                      
2021-08-05    14906                     9       1      0.004167   
2021-07-20    14906                     9       1      0.004167   
2021-09-01    14975                     9       1      0.004167   
2022-01-28    14932                     9       1      0.004167   
2021-11-30    14975                     9       0      0.008333   

              JobFamilyName  DepartmentName  TeamName  DoW  \
ActivityDate                                                 
2021-08-05               48               9       909    3   
2021-07-20               48               9       909    1   
2021-09-01               50               9      1168    2   
2022-01-28               87               9      1177    4   
2021-11-30               50               9      1168    1   

              Days to termination  
ActivityDate                       
2021-08-05             

In [13]:
# Distinct label values present in the target.
np.unique(y)

array([0, 1])

In [17]:
def _fit_random_forest(X_train, X_test, y_train, y_test, seed):
    """Oversample the training split, fit a random forest, score the test split."""
    # Oversample only the training data so the test split keeps its natural
    # class balance. Seeded so repeated runs give the same result.
    oversampler = RandomOverSampler(random_state=seed)
    X_balanced, y_balanced = oversampler.fit_resample(X_train, y_train)

    # Shallow trees with a minimum leaf size to limit overfitting.
    model = RandomForestClassifier(
        n_estimators=50, max_depth=5, min_samples_leaf=10, random_state=seed
    )
    model.fit(X_balanced, y_balanced)

    y_pred = model.predict(X_test)
    # ROC AUC needs a ranking score, not hard labels: use the probability of
    # the second class in model.classes_ (the positive class for 0/1 labels).
    y_score = model.predict_proba(X_test)[:, 1]
    return model, accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_score)


def _fit_lstm(X_train, X_test, y_train, y_test):
    """Fit a three-layer LSTM on single-timestep sequences, score the test split."""
    n_features = X_train.shape[1]

    # The LSTM expects (samples, timesteps, features); each row is one timestep.
    X_train_seq = np.asarray(X_train).reshape(-1, 1, n_features)
    X_test_seq = np.asarray(X_test).reshape(-1, 1, n_features)

    # "Balanced" per-sample weights: n_samples / (n_classes * class_count).
    # Their mean is 1, so the loss scale is preserved. Normalising the weights
    # to sum to 1 (as before) shrinks every weight to ~1/n_samples and
    # effectively zeroes the loss on a large training set.
    y_train_arr = np.asarray(y_train)
    classes, class_index, class_counts = np.unique(
        y_train_arr, return_inverse=True, return_counts=True
    )
    per_class_weight = len(y_train_arr) / (len(classes) * class_counts)
    sample_weights = per_class_weight[class_index]

    model = Sequential()
    # Input width follows the data instead of a hard-coded feature count.
    model.add(LSTM(50, input_shape=(1, n_features), return_sequences=True,
                   kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dropout(0.4))
    model.add(LSTM(50, return_sequences=True, kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dropout(0.4))
    model.add(LSTM(50))
    model.add(Dropout(0.4))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(X_train_seq, y_train_arr, epochs=10, batch_size=32,
              sample_weight=sample_weights)

    # Keep the sigmoid outputs for ROC AUC; threshold them only for accuracy.
    y_score = model.predict(X_test_seq).ravel()
    y_pred = (y_score >= 0.5).astype(int)
    return model, accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_score)


def create_base_model(X, y, model_type):
    """Train and evaluate a baseline classifier on an 80/20 random split.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature matrix.
    y : pandas.Series
        Binary target; the positive class is expected to be the larger label
        (1 for 0/1 labels).
    model_type : str
        'Random Forest' or 'LSTM'.

    Returns
    -------
    (model, accuracy, roc_auc)
        The fitted model, test-set accuracy, and test-set ROC AUC computed
        from predicted scores (class probability / sigmoid output).

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names. Raised before any
        data is split so an invalid name fails immediately.
    """
    if model_type not in ('Random Forest', 'LSTM'):
        raise ValueError('Invalid model type')

    seed = 42
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )

    if model_type == 'Random Forest':
        model, accuracy, roc_auc = _fit_random_forest(X_train, X_test, y_train, y_test, seed)
    else:
        model, accuracy, roc_auc = _fit_lstm(X_train, X_test, y_train, y_test)

    print(f'{model_type} Model Performance:')
    print('Accuracy:', accuracy)
    print('ROC AUC:', roc_auc)

    return model, accuracy, roc_auc

In [15]:
# Train and evaluate the Random Forest baseline.
# NOTE(review): the recorded run reports accuracy and ROC AUC of exactly 1.0.
# A perfect score on held-out data usually points to target leakage through
# one of the features - verify the feature set before trusting this result.
model, accuracy, roc_auc = create_base_model(X, y, 'Random Forest')


Random Forest Model Performance:
Accuracy: 1.0
ROC AUC: 1.0


In [None]:
# Train and evaluate the LSTM variant. This rebinds model, accuracy and
# roc_auc, replacing the Random Forest values assigned in the previous cell.
model, accuracy, roc_auc = create_base_model(X, y, 'LSTM')


In [None]:
def plot_results(accuracy, roc_auc):
    """Show accuracy and ROC AUC as two side-by-side line plots.

    Each argument is passed straight to ``Axes.plot`` with circle markers;
    the left panel shows ``accuracy`` and the right panel ``roc_auc``.
    """
    _, (accuracy_axis, auc_axis) = plt.subplots(1, 2, figsize=(12, 4))

    panels = (
        (accuracy_axis, accuracy, 'Accuracy'),
        (auc_axis, roc_auc, 'ROC AUC'),
    )
    for axis, values, y_label in panels:
        axis.plot(values, marker='o')
        axis.set_xlabel('Model')
        axis.set_ylabel(y_label)

    plt.show()

In [None]:
# Fit both baselines. Despite the names, each variable holds the full
# (model, accuracy, roc_auc) tuple returned by create_base_model, not just
# the fitted model.
rf_model = create_base_model(X, y, 'Random Forest')
lstm_model = create_base_model(X, y, 'LSTM')

In [None]:
#plot results
x = ['Random Forest', 'LSTM']

# rf_model and lstm_model each hold the (model, accuracy, roc_auc) tuple
# returned by create_base_model. Take one metric per model from those tuples;
# the bare `accuracy` / `roc_auc` names only hold the scalars of whichever
# model was evaluated last, so plotting them draws the same value for both.
accuracies = [rf_model[1], lstm_model[1]]
roc_aucs = [rf_model[2], lstm_model[2]]

plot_results(accuracies, roc_aucs)
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
axs[0].bar(x, accuracies)
axs[0].set_title('Accuracy')
axs[1].bar(x, roc_aucs)
axs[1].set_title('ROC AUC')
plt.show()

In [None]:
def plot_roc_curve(y_test, y_pred):
    """Plot the ROC curve for a binary classifier.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred : array-like
        Scores for the positive class (probabilities or decision values).
        Hard 0/1 labels are accepted but collapse the curve to one point.
    """
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()


# y_test and the predictions are locals of create_base_model and never reach
# notebook scope, so rebuild them here. Splitting with the same arguments
# (test_size=0.2, random_state=42) reproduces the held-out rows it used.
_, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# rf_model is the (model, accuracy, roc_auc) tuple; element 0 is the fitted forest.
rf_classifier = rf_model[0]
y_score = rf_classifier.predict_proba(X_test)[:, 1]
plot_roc_curve(y_test, y_score)