In [None]:
#Import the libraries

# Basic libraries
import pymysql
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from imblearn.combine import SMOTETomek


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



# Other
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings("ignore")

## 1. The Goal

The goal of this project is to predict whether an employee is going to leave a company or not. 
Various factors will be taken into consideration, such as performance scores, satisfaction scores etc.

## 2. Getting the data

In [None]:
data = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
data

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe().round(1)

In [None]:
# Rename every column from the CamelCase used in the CSV to snake_case.
# Note: 'DailyRate' maps to 'daily_rate' (it was previously mislabelled 'daily_travel').

data = data.rename(columns={
    'Age': 'age',
    'Attrition': 'attrition',
    'BusinessTravel': 'business_travel',
    'DailyRate': 'daily_rate',
    'Department': 'department',
    'DistanceFromHome': 'distance_from_home',
    'Education': 'education',
    'EducationField': 'education_field',
    'EmployeeCount': 'employee_count',
    'EmployeeNumber': 'employee_number',
    'EnvironmentSatisfaction': 'environment_satisfaction',
    'Gender': 'gender',
    'HourlyRate': 'hourly_rate',
    'JobInvolvement': 'job_involvement',
    'JobLevel': 'job_level',
    'JobRole': 'job_role',
    'JobSatisfaction': 'job_satisfaction',
    'MaritalStatus': 'marital_status',
    'MonthlyIncome': 'monthly_income',
    'MonthlyRate': 'monthly_rate',
    'NumCompaniesWorked': 'num_companies_worked',
    'Over18': 'over_18',
    'OverTime': 'over_time',
    'PercentSalaryHike': 'percent_salary_hike',
    'PerformanceRating': 'performance_rating',
    'RelationshipSatisfaction': 'relationship_satisfaction',
    'StandardHours': 'standard_hours',
    'StockOptionLevel': 'stock_option_level',
    'TotalWorkingYears': 'total_working_years',
    'TrainingTimesLastYear': 'training_times_last_year',
    'WorkLifeBalance': 'work_life_balance',
    'YearsAtCompany': 'years_at_company',
    'YearsInCurrentRole': 'years_in_current_role',
    'YearsSinceLastPromotion': 'years_since_last_promotion',
    'YearsWithCurrManager': 'years_with_curr_manager',
})
data.head(5)

In [None]:
# checking for null values

data.isna().sum()

There are no null values

In [None]:
# employee_number has no use in the model, so remove that column

data = data.drop(columns='employee_number')
data

In [None]:
#looking into performance vs attrition to get a feel for it

attrition = data[['attrition', 'performance_rating']]
attrition

In [None]:
# Split the columns by dtype: string (object) columns are the categoricals,
# everything numeric goes to data_num.
# The string alias "object" is used because the `np.object` alias was removed from
# NumPy (1.24+) and raises AttributeError there.

data_cat = data.select_dtypes(include = "object")
data_num = data.select_dtypes(include = np.number)

In [None]:
data_cat

In [None]:
data_num

In [None]:
# Min-max scale the numerical features and wrap the result back into a DataFrame
# that keeps the original column names.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split further down - confirm this is acceptable for the evaluation.

transformer = MinMaxScaler()
data_num_minmax = transformer.fit_transform(data_num)
data_num_norm = pd.DataFrame(data_num_minmax, columns=data_num.columns)
data_num_norm.head()

In [None]:
# encode the categorical feature

data_cat_dumm = pd.get_dummies(data_cat, drop_first = True)
data_cat_dumm.head()

In [None]:
data_cat_dumm = data_cat_dumm.rename(columns = {'attrition_Yes': 'attrition'})
data_cat_dumm

## EDA

In [None]:
# separate numerical discrete and numerical continous 

#data_num_disc = data_num['education', 'environment_satisfaction', 'job_involvement', 'job_satisfaction', 'num_companies_worked', '

In [None]:
# Visualise the distribution of every numerical column, one figure per column.
# sns.distplot is deprecated (the warning is hidden by the global warnings filter);
# histplot with a KDE overlay on a density scale is its replacement.

for column in data_num.columns:
    sns.histplot(data_num[column], kde=True, stat="density")
    plt.title(f"Distribution of {column}")
    plt.show()

In [None]:
# remove employee_count and standard_hours from the numerical frame
data_num = data_num.drop(columns=['employee_count', 'standard_hours'])
data_num

In [None]:
# Correlation matrix of the numerical features, to see how the variables relate

corr_matrix = data_num.corr()

# there are many features and every cell is annotated, so use a large canvas
fig, ax = plt.subplots(figsize=(20, 18))
sns_plot = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.show()


We see some highly correlated variables, such as total_working_years and job_level

**TODO**

- Decide how to handle the highly correlated features: drop some of these columns, or apply PCA?
- Still need to do a pairplot.
- State the hypothesis.
- Hyperparameter tuning.

In [None]:
# Combine the scaled numerical features with the encoded categorical features.
# data_num_norm was built before 'employee_count' and 'standard_hours' were dropped
# from data_num, so select only the columns still present in data_num; otherwise
# the columns removed during EDA would silently return to the feature matrix.

concatenated_data = pd.concat([data_num_norm[data_num.columns], data_cat_dumm], axis = 1)
concatenated_data.head()

In [None]:
# The target is the attrition flag; every other column is a feature.

y = concatenated_data['attrition']
X = concatenated_data.drop(columns='attrition')

## Modeling

In [None]:
# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [None]:
len(X_train)

In [None]:
y_train

In [None]:
# use logistic regression to train the model
classification = LogisticRegression(random_state = 0).fit(X_train, y_train)

In [None]:
# predictions with testing dataset

predictions = classification.predict(X_test)

Candidate models: logistic regression, decision trees, random forests, support vector machines

## Evaluate the Model

In [None]:
pd.Series(predictions).value_counts()

In [None]:
y_test.value_counts()

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. Passing the predictions first transposes the matrix.
cm = confusion_matrix(y_test, predictions)
cm

In [None]:
# Plot the confusion matrix as a heatmap.
# Rows are the actual classes and columns the predicted classes; the axes are
# labelled so the orientation is unambiguous when reading the cells.

cm = confusion_matrix(y_test, predictions)
ax = sns.heatmap(cm, annot = True,fmt = 'g')
ax.set_xlabel('Predicted attrition')
ax.set_ylabel('Actual attrition')
ax.set_title('Confusion matrix - logistic regression')
plt.show()

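There are a lot of correct predictions on the diagonal, but most of them belong to class 0 (employees who stayed). With attrition = 1 as the positive class, these are true negatives rather than true positives.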

In [None]:
# accuracy

classification.score(X_test, y_test)

In [None]:
# accuracy

print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, predictions)))

In [None]:
# Precision for class 0 (employees who stayed), derived from the confusion matrix
# instead of hard-coded counts so it stays correct if the split or the model changes.
# cm comes from confusion_matrix(y_test, predictions): rows = actual, columns = predicted,
# so column 0 holds everything that was predicted as class 0.

precision = cm[0, 0] / cm[:, 0].sum()
precision

In [None]:
# Recall for class 0 (employees who stayed), derived from the confusion matrix
# instead of hard-coded counts so it stays correct if the split or the model changes.
# cm comes from confusion_matrix(y_test, predictions): rows = actual, columns = predicted,
# so row 0 holds every observation whose actual class is 0.

recall = cm[0, 0] / cm[0, :].sum()
recall

In [None]:
# F1 score

F1_score = 2 * (precision * recall) / (precision + recall)
F1_score

In [None]:
# ROC curve of the baseline logistic regression, built from the predicted
# probability of class 1 (second column of predict_proba).

from sklearn import metrics
import matplotlib.pyplot as plt

y_pred_proba = classification.predict_proba(X_test)[:, 1]
# keep the thresholds in a named variable instead of `_`, which IPython uses
# for the last cell output
fpr, tpr, thresholds = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# show the AUC in the legend (it was computed but never displayed) and add the
# diagonal of a random classifier as a visual reference
plt.plot(fpr, tpr, label=f"Logistic regression (AUC = {auc:.2f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="grey", label="Chance")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("ROC curve")
plt.legend(loc="lower right")
plt.show()

In [None]:
# Toy example: four observations used to illustrate a confusion matrix
actual = [0, 1, 1, 1]     # the true labels
predicted = [0, 0, 1, 1]  # the labels a model produced for them
df = pd.DataFrame({"acutal": actual, "predicted": predicted})
df

In [None]:
cm = metrics.confusion_matrix(actual, predicted)

In [None]:
sns.heatmap(cm, annot = True)

In [None]:
# classification report

report = classification_report(y_test, predictions)

print("Classification Report:\n", report)

From this report, we can observe that:

- The model performs well in predicting class 0 (attrition = No, employees who stayed) with high precision, recall, and F1-score.<br>
- However, for class 1 (attrition = Yes, employees who left), the model has a lower precision, recall, and F1-score, indicating that it struggles to correctly predict this class.<br>
- The overall accuracy of the model is 0.89, suggesting that it correctly predicts 89% of the instances in the dataset.<br>
- The macro average F1-score is 0.70, indicating a reasonable balance between precision and recall across both classes.<br>
- The weighted average F1-score is 0.87, taking into account the class imbalance and providing a weighted evaluation metric that emphasizes the larger class (class 0).<br>
- It's important to consider the specific context of the problem and the significance of precision, recall, and F1-score based on the objectives and requirements of your analysis.

The accuracy score is very high, let's check for imbalanced data

## Imbalanced data

In [None]:
#visualise the balance of attrition 'yes' vs 'no'

attrition_counts = data['attrition'].value_counts()

In [None]:
# bar chart of how many employees fall into each attrition class
fig, ax = plt.subplots()
ax.bar(attrition_counts.index, attrition_counts.values)
ax.set_xlabel('Attrition')
ax.set_ylabel('Count')
ax.set_title('Class Distribution of Attrition')
plt.show()

We have a class imbalance - there is a much higher proportion of people who didn't leave

## SMOTE

Let's apply SMOTE to oversample the minority class

In [None]:
# Oversample the minority class on the TRAINING split only.
# Resampling the full X, y would build synthetic points from rows that are also in
# X_test, so the later evaluation on X_test would be overly optimistic.
# random_state makes the synthetic samples reproducible across runs.
smote = SMOTE(random_state = 42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

y_smote.value_counts()

In [None]:
# round 2 of logistic regression: train the model on the SMOTE-resampled data
classification_smote = LogisticRegression(random_state = 0).fit(X_smote, y_smote)

In [None]:
# predictions with testing dataset

predictions_smote = classification_smote.predict(X_test)

In [None]:
# classification report

report_smote = classification_report(y_test, predictions_smote)

print("Classification Report:\n", report_smote)

SMOTE has decreased the accuracy, so now I will apply Tomek links to perform undersampling

In [None]:
# SMOTE + Tomek links
# SMOTETomek does the SMOTE oversampling itself and then removes Tomek links, so it
# is fitted on the original training split rather than on data that has already been
# oversampled. Using the training split only also keeps the test rows out of the
# resampling step.

smt = SMOTETomek(random_state = 42)
X_tomek, y_tomek = smt.fit_resample(X_train, y_train)

In [None]:
# round 3 of logistic regression: train the model on the SMOTETomek-resampled data
classification_tomek = LogisticRegression(random_state = 0).fit(X_tomek, y_tomek)

In [None]:
predictions_tomek = classification_tomek.predict(X_test)

In [None]:
report_tomek = classification_report(y_test, predictions_tomek)

print("Classification Report with Tomek links:\n", report_tomek)

## I would now like to apply different models

In [None]:
from sklearn.metrics import classification_report

def apply_models(X, y, test_size=0.2, random_state=42):
    """Train several classifiers on a fresh split of (X, y) and report their metrics.

    The data passed in is split inside the function, so the results depend only on
    the arguments and not on any X_train / X_test variables that happen to exist in
    the notebook's global state.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators.
    y : binary target aligned with X.
    test_size : fraction of the data held out for evaluation.
    random_state : seed used for the split and for every model.

    Returns
    -------
    dict
        Maps each model name to a dict with the fitted 'model' and its
        'accuracy', 'precision', 'recall' and 'f1' on the held-out split.
    """
    # Split here so that X, y and test_size are actually honoured
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Initialize models
    models = {
        'Logistic Regression': LogisticRegression(random_state = random_state),
        'Decision Tree': DecisionTreeClassifier(random_state = random_state),
        'Random Forest': RandomForestClassifier(random_state = random_state),
        'Support Vector Machine': SVC(random_state = random_state)
    }

    # Train and evaluate each model on the same split
    results = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Store the fitted model together with its evaluation metrics
        results[model_name] = {
            'model': model,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

        print(f"{model_name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

        # Per-class breakdown for the same predictions
        report = classification_report(y_test, y_pred)
        print(f"\nClassification Report for {model_name}:\n{report}\n")

    return results

# Usage example:
results = apply_models(X, y)
