# Modeling


## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
# import wandb
# Display every column when rendering DataFrames (None disables column truncation)
pd.set_option("display.max_columns", None)

In [2]:
# wandb.init(project="ml-classification-regression")

In [3]:
# Load the processed feature table; the first CSV column (EmpID) becomes the index.
# Forward slashes are used on purpose: they work on every OS, whereas the previous
# backslash path contained the invalid escape sequences "\p" and "\H" (which Python
# warns about) and does not resolve as a directory path on POSIX systems.
df = pd.read_csv('../data/processed/HRDataset_p_v3_selected_features.csv', index_col=0)
df.head()

  df = pd.read_csv('../data\processed\HRDataset_p_v3_selected_features.csv', index_col=0)


Unnamed: 0_level_0,GenderID,PerfScoreID,FromDiversityJobFairID,Salary,State,MaritalDesc,CitizenDesc,HispanicLatino,RaceDesc,Department,ManagerName,RecruitmentSource,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,DaysLateLast30,Absences,Managerial,Age,HireYear,HireMonth,NumberOfColleagues
EmpID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10026,1,4,0,62506,ma,single,us citizen,no,white,production,michael albert,linkedin,4.6,5,0,0,1,0,36.0,2011,7,21
10084,1,3,0,104437,ma,married,us citizen,no,white,it/is,simon roup,indeed,4.96,3,6,0,17,0,41.0,2015,3,16
10196,0,3,0,64955,ma,married,us citizen,no,white,production,kissy sullivan,linkedin,3.02,3,0,0,3,0,23.0,2011,7,21
10088,0,3,0,64991,ma,married,us citizen,no,white,production,elijiah gray,indeed,4.84,5,0,0,15,0,30.0,2008,1,21
10069,0,3,0,50825,ma,divorced,us citizen,no,white,production,webster butler,google search,5.0,4,0,0,2,0,26.0,2011,7,20


In [4]:
# Target column the models are trained to predict
label = 'PerfScoreID'

# Columns handled as categories by the preprocessing step (order is significant:
# it determines the order of the encoded output columns)
categorical_features = [
    # identifiers / flags
    'GenderID', 'FromDiversityJobFairID',
    # demographic descriptors
    'State', 'MaritalDesc', 'CitizenDesc', 'HispanicLatino', 'RaceDesc',
    # organisational descriptors
    'Department', 'ManagerName', 'RecruitmentSource', 'Managerial',
    # hiring date parts, treated as categories rather than magnitudes
    'HireYear', 'HireMonth',
]

# Columns handled as continuous / count values by the preprocessing step
numeric_features = [
    'Salary', 'EngagementSurvey', 'EmpSatisfaction',
    'SpecialProjectsCount', 'DaysLateLast30', 'Absences',
    'Age', 'NumberOfColleagues',
]

In [5]:
# Sanity check of the loaded table's dimensions as (rows, columns)
df.shape

(311, 22)

### Split the Data

In [6]:
# Target vector: the column named by `label`
y = df[label]
# Feature matrix: every other column of the table
X = df.drop(columns=[label])

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Encode and Scale 

In [7]:
# Preprocessing applied in front of every model:
#   * numeric columns     -> mean imputation, then standardisation
#   * categorical columns -> most-frequent imputation, then one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            # Per the original author's note there are no missing numeric values;
            # the imputer is kept as a safeguard.
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # handle_unknown='ignore': a category that appears only in the test split
            # (likely for sparse columns such as ManagerName or State on a small
            # dataset) is encoded as all zeros instead of raising ValueError at
            # predict time, which is what the default handle_unknown='error' does.
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

## Classification
Train several classifiers on the same preprocessed features and compare their performance on the held-out test set.

In [11]:
# Classification pipelines keyed by model name. Every pipeline runs the same
# `preprocessor` object first and then the estimator under the 'classifier' step.
classifiers = {
    model_name: Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', estimator)
    ])
    for model_name, estimator in [
        ('RandomForestClassifier', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('LogisticRegression', LogisticRegression(max_iter=1000, random_state=42)),
        ('GradientBoostingClassifier', GradientBoostingClassifier(n_estimators=100, random_state=42)),
        ('SVC', SVC(random_state=42)),
    ]
}


In [None]:
# Fit each classification pipeline on the training split and report how it
# performs on the held-out test split.
for name, model in classifiers.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Fraction of test rows whose predicted label equals the true label
    accuracy = np.mean(y_pred == y_test)

    # Optional experiment tracking (disabled):
    # wandb.log({
    #     f"{name}/accuracy": accuracy,
    #     f"{name}/classification_report": classification_report(y_test, y_pred, output_dict=True)
    # })

    print(f"{name} Accuracy: {accuracy}")
    print(f"{name} Classification Report:")
    print(classification_report(y_test, y_pred))

## Regression
We can test regression algorithms because our class labels reflect an inherent numerical progression (1, 2, 3, 4).

In [13]:
# Regression pipelines keyed by model name. Every pipeline runs the same
# `preprocessor` object first and then the estimator under the 'regressor' step.
regressors = {
    model_name: Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', estimator)
    ])
    for model_name, estimator in [
        ('RandomForestRegressor', RandomForestRegressor(n_estimators=100, random_state=42)),
        ('LinearRegression', LinearRegression()),
        ('GradientBoostingRegressor', GradientBoostingRegressor(n_estimators=100, random_state=42)),
        ('SVR', SVR()),
    ]
}

In [None]:
# Fit each regression pipeline on the training split, report its error on the
# continuous predictions, then convert predictions to class labels so the
# results can be compared with the classifiers.
for name, model in regressors.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Optional experiment tracking (disabled):
    # wandb.log({
    #     f"{name}/mse": mean_squared_error(y_test, y_pred),
    #     f"{name}/predictions": wandb.Histogram(y_pred)
    # })

    # Convert continuous predictions to the NEAREST class label. Flooring (the
    # previous approach) systematically pushed predictions down a class (e.g. 2.98
    # was scored as class 2). Clipping keeps out-of-range predictions inside the
    # label range seen in training; the int cast makes the labels match y_test.
    # Note: np.rint rounds exact .5 values to the nearest even integer.
    y_pred_class = np.clip(np.rint(y_pred), y_train.min(), y_train.max()).astype(int)
    # wandb.log({
    #     f"{name}/accuracy": np.mean(y_pred_class == y_test),
    #     f"{name}/classification_report": classification_report(y_test, y_pred_class, output_dict=True)
    # })

    print(f"{name} Mean Squared Error:")
    print(mean_squared_error(y_test, y_pred))

    print(f"{name} Accuracy: {np.mean(y_pred_class == y_test)}")
    print(f"{name} Classification Report:")
    print(classification_report(y_test, y_pred_class))

In [None]:
# wandb.finish()