<a href="https://colab.research.google.com/github/HoangLeKhanhHuyen/huyen_space/blob/main/Employee%20Performance%20Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **FINAL PROJECT: EMPLOYEE PERFORMANCE ANALYSIS**


## **Import Necessary Libraries**

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy.stats import kurtosis, stats, zscore
import scipy.stats as stats
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the employee performance workbook into the DataFrame used by every later cell.
# NOTE(review): the absolute "/content/" path looks Colab-specific (the notebook
# carries an "Open In Colab" badge) — adjust it when running elsewhere.
df = pd.read_excel("/content/Employee_Performance.xlsx")

In [None]:
# Preview the first rows to sanity-check that the workbook loaded as expected.
df.head()

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,E1001000,32,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,...,4,10,2,2,10,7,0,8,No,3
1,E1001006,47,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,...,4,20,2,3,7,7,1,7,No,3
2,E1001007,40,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,...,3,20,2,3,18,13,1,12,No,4
3,E1001009,41,Male,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,...,2,23,2,2,21,6,12,6,No,3
4,E1001010,60,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,16,4,...,4,10,1,3,2,2,2,2,No,3


In [None]:
# List every column label available in the loaded frame.
df.columns

Index(['EmpNumber', 'Age', 'Gender', 'EducationBackground', 'MaritalStatus',
       'EmpDepartment', 'EmpJobRole', 'BusinessTravelFrequency',
       'DistanceFromHome', 'EmpEducationLevel', 'EmpEnvironmentSatisfaction',
       'EmpHourlyRate', 'EmpJobInvolvement', 'EmpJobLevel',
       'EmpJobSatisfaction', 'NumCompaniesWorked', 'OverTime',
       'EmpLastSalaryHikePercent', 'EmpRelationshipSatisfaction',
       'TotalWorkExperienceInYears', 'TrainingTimesLastYear',
       'EmpWorkLifeBalance', 'ExperienceYearsAtThisCompany',
       'ExperienceYearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'Attrition', 'PerformanceRating'],
      dtype='object')

## **1. DATA OVERVIEW & DESCRIPTIVE STATISTICS**

### **1.1 Overview of the Dataset**

In [None]:
# Dataset information: number of rows, plus non-null count and dtype per column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   EmpNumber                     1200 non-null   object
 1   Age                           1200 non-null   int64 
 2   Gender                        1200 non-null   object
 3   EducationBackground           1200 non-null   object
 4   MaritalStatus                 1200 non-null   object
 5   EmpDepartment                 1200 non-null   object
 6   EmpJobRole                    1200 non-null   object
 7   BusinessTravelFrequency       1200 non-null   object
 8   DistanceFromHome              1200 non-null   int64 
 9   EmpEducationLevel             1200 non-null   int64 
 10  EmpEnvironmentSatisfaction    1200 non-null   int64 
 11  EmpHourlyRate                 1200 non-null   int64 
 12  EmpJobInvolvement             1200 non-null   int64 
 13  EmpJobLevel       

### **1.2 Descriptive statistics**

In [None]:
# Summary statistics of the numeric columns, transposed so each variable is a row.
descriptive_stats = df.describe().transpose()

# Show every number with two decimal places.
formatted_stats = descriptive_stats.style.format("{:.2f}")

# Left-align the data cells and the header/index labels. The Styler methods
# are chained; each call returns the same Styler it was called on.
header_alignment = [dict(selector="th", props=[("text-align", "left")])]
styled_stats = (
    formatted_stats
    .set_properties(**{'text-align': 'left'})
    .set_table_styles(header_alignment)
)

# Render the styled table as the cell output.
display(styled_stats)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1200.0,36.92,9.09,18.0,30.0,36.0,43.0,60.0
DistanceFromHome,1200.0,9.17,8.18,1.0,2.0,7.0,14.0,29.0
EmpEducationLevel,1200.0,2.89,1.04,1.0,2.0,3.0,4.0,5.0
EmpEnvironmentSatisfaction,1200.0,2.72,1.09,1.0,2.0,3.0,4.0,4.0
EmpHourlyRate,1200.0,65.98,20.21,30.0,48.0,66.0,83.0,100.0
EmpJobInvolvement,1200.0,2.73,0.71,1.0,2.0,3.0,3.0,4.0
EmpJobLevel,1200.0,2.07,1.11,1.0,1.0,2.0,3.0,5.0
EmpJobSatisfaction,1200.0,2.73,1.1,1.0,2.0,3.0,4.0,4.0
NumCompaniesWorked,1200.0,2.67,2.47,0.0,1.0,2.0,4.0,9.0
EmpLastSalaryHikePercent,1200.0,15.22,3.63,11.0,12.0,14.0,18.0,25.0


In [None]:
# Drop the redundant 'EmpNumber' column.
# errors="ignore" makes the cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising KeyError.
df = df.drop(columns=["EmpNumber"], errors="ignore")

## **2. EXPLORATORY DATA ANALYSIS (EDA)**

### **2.1 Univariate Analysis**

#### *2.1.1 Categorical Variables (Distribution)*

In [None]:
# Categorical variables, listed in the order they should appear in the grid
categorical_variables = ["Gender", "OverTime", "Attrition","MaritalStatus","BusinessTravelFrequency", "EducationBackground", "EmpDepartment","EmpJobRole"]

# 3x3 grid of bar charts, one panel per variable (the ninth panel stays empty)
fig_categorical = make_subplots(rows=3, cols=3, subplot_titles=categorical_variables)

# Fill the grid left-to-right, top-to-bottom with category frequencies
for position, feature in enumerate(categorical_variables):
    grid_row = position // 3 + 1
    grid_col = position % 3 + 1

    # Frequency of each category, most common first
    counts = df[feature].value_counts()

    frequency_bar = go.Bar(
        x=counts.index,
        y=counts.values,
        name=feature,
        text=counts.values,
        textposition="inside",
        marker=dict(color="royalblue"),
    )
    fig_categorical.add_trace(frequency_bar, row=grid_row, col=grid_col)
    # The counts are printed inside the bars, so the y tick labels are hidden
    fig_categorical.update_yaxes(showticklabels=False, row=grid_row, col=grid_col)

# Rotate the x tick labels of the 'EducationBackground' (row 2, col 3) and
# 'EmpDepartment' (row 3, col 1) panels
for rotated_row, rotated_col in ((2, 3), (3, 1)):
    fig_categorical.update_xaxes(tickangle=90, row=rotated_row, col=rotated_col)

# Figure-level layout
fig_categorical.update_layout(
    title_text="Categorical Variable Distribution",
    title_font_size=24,
    title_font_color='black',
    template="plotly_white",
    height=1000,
    width=1200,
    showlegend=False,
)
fig_categorical.show()

#### *2.1.2 Numerical Variables (Distribution & Check outliers)*

In [None]:
# Treat every int64 column as a numerical variable. The selection is purely
# dtype-based, so integer-coded rating columns and the PerformanceRating
# target end up in this list as well.
numerical_variables = df.select_dtypes(include=["int64"]).columns.tolist()
numerical_variables

['Age',
 'DistanceFromHome',
 'EmpEducationLevel',
 'EmpEnvironmentSatisfaction',
 'EmpHourlyRate',
 'EmpJobInvolvement',
 'EmpJobLevel',
 'EmpJobSatisfaction',
 'NumCompaniesWorked',
 'EmpLastSalaryHikePercent',
 'EmpRelationshipSatisfaction',
 'TotalWorkExperienceInYears',
 'TrainingTimesLastYear',
 'EmpWorkLifeBalance',
 'ExperienceYearsAtThisCompany',
 'ExperienceYearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager',
 'PerformanceRating']

In [None]:
# Map each numerical column selected for plotting to its chart title.
# (Fixes the misspelled "enviroment" and "involement" titles.)
titles = {
    "PerformanceRating": "Performance Rating",
    "EmpEnvironmentSatisfaction": "Employee environment satisfaction",
    "EmpLastSalaryHikePercent": "Employee last salary hike percent",
    "EmpWorkLifeBalance": "Employee work life balance",
    "ExperienceYearsInCurrentRole": "Experience years in current role",
    "YearsWithCurrManager": "Years with current manager",
    "YearsSinceLastPromotion": "Years since last promotion",
    "ExperienceYearsAtThisCompany": "Experience years at this company",
    "EmpJobSatisfaction": "Employee job satisfaction",
    "EmpJobInvolvement": "Employee job involvement",
    "EmpRelationshipSatisfaction": "Employee relationship satisfaction",
    "EmpJobLevel": "Employee job level",
    "NumCompaniesWorked": "Number of companies worked",
    "TrainingTimesLastYear": "Training times last year",
    "EmpHourlyRate": "Employee hourly rate",
    "EmpEducationLevel": "Employee education level"}


# One figure per selected column: a probability-density histogram with a
# marginal box plot, so distribution shape and outliers are visible together.
for col, chart_title in titles.items():
    fig = px.histogram(df, x=col, nbins=30, marginal="box", histnorm="probability density",
                       title=chart_title, template="plotly_white",
                       height=450, width=600)

    fig.update_layout(title_font_size=24)
    # Same fill colour and thin light outline on every trace in the figure
    fig.update_traces(marker_color="royalblue", marker_line_width=0.25, marker_line_color="lightgrey")
    fig.show()


### **2.2 Bivariate & Multivariate Analysis**

#### *2.2.1 Overview of Performance Rating*

In [None]:
# Count how many employees received each performance rating.
df['PerformanceRating'].value_counts()

Unnamed: 0_level_0,count
PerformanceRating,Unnamed: 1_level_1
3,874
2,194
4,132


In [None]:
# Report the age range so the bin edges below can be checked against the data
for age_extreme in (df['Age'].min(), df['Age'].max()):
    print(age_extreme)

# Left-closed age bins; the last edge is 61 so that 60-year-olds land in "55+"
bins = [18, 25, 35, 45, 55, 61]
labels = ["<25", "25-34", "35-44", "45-54", "55+"]
df["AgeGroup"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)


def _rating_counts_by(group_column):
    """Count employees per (group_column, PerformanceRating) pair.

    Returns a table with one row per group and one column per rating;
    pairs that never occur are filled with 0.
    """
    pair_sizes = df.groupby([group_column, "PerformanceRating"]).size()
    return pair_sizes.unstack().fillna(0)


# Rating counts broken down by gender and by age group
gender_performance = _rating_counts_by("Gender")
age_performance = _rating_counts_by("AgeGroup")

for printed_item in (df["AgeGroup"], gender_performance, age_performance):
    print(printed_item)

18
60
0       25-34
1       45-54
2       35-44
3       35-44
4         55+
        ...  
1195    25-34
1196    35-44
1197    45-54
1198    25-34
1199      <25
Name: AgeGroup, Length: 1200, dtype: category
Categories (5, object): ['<25' < '25-34' < '35-44' < '45-54' < '55+']
PerformanceRating    2    3   4
Gender                         
Female              75  349  51
Male               119  525  81
PerformanceRating   2    3   4
AgeGroup                      
<25                12   54  11
25-34              69  336  47
35-44              66  300  48
45-54              36  151  18
55+                11   33   8


In [None]:
# Colour per rating. Only the most common rating gets value labels on its bars.
rating_colors = {3: "royalblue", 2: "#B3B3B3", 4: "lightgrey"}
most_frequent_rating = df["PerformanceRating"].value_counts().idxmax()

# One row, three panels: a pie chart followed by two stacked bar charts
fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=("Overall Performance Rating",
                    "Count by Gender",
                    "Count by Age Group"),
    specs=[[{"type": "pie"}, {"type": "bar"}, {"type": "bar"}]]
)

# Panel 1 (pie): overall share of each rating, ordered by rating value
performance_counts = df['PerformanceRating'].value_counts().sort_index()
colors = [rating_colors.get(rating, "lightgrey") for rating in performance_counts.index]
pie_labels = [f"Rating {rating}" for rating in performance_counts.index]

overall_pie = go.Pie(
    labels=pie_labels,
    values=performance_counts.values,
    textinfo="percent",
    marker=dict(colors=colors), showlegend= True,
)
fig.add_trace(overall_pie, row=1, col=1)

# Panels 2 and 3 (stacked bars): rating counts by gender, then by age group.
# Both panels are built the same way, so one nested loop handles them.
for panel_col, counts_table in ((2, gender_performance), (3, age_performance)):
    for rating in counts_table.columns:
        rating_counts = counts_table[rating]
        if rating == most_frequent_rating:
            bar_text = rating_counts
        else:
            bar_text = ""
        fig.add_trace(go.Bar(
            x=counts_table.index,
            y=rating_counts,
            name=f"Rating {rating}",
            marker=dict(color=rating_colors.get(rating, "lightgrey")),
            text=bar_text,
            textposition="inside", showlegend= False,
        ), row=1, col=panel_col)

titletext = "<span style='color:royalblue;'><b>72.8%</b></span> of employees received a Rating 3, mainly males and ages 25-44"

# The annotations list is merged element-wise into the three subplot titles,
# so each title gets the same position and font.
subplot_title_style = [
    dict(y=1.03, font=dict(size=14, color='dimgrey'), showarrow=False)
    for _ in range(3)
]

# Stack the bars and apply figure-level styling
fig.update_layout(
    title_text=titletext,
    template='plotly_white',
    barmode='stack', title_font_size=24, title_font_color ='black',
    legend=dict(traceorder='normal', font=dict(color="grey")),
    legend_title_text='', margin=dict(t=120),
    annotations=subplot_title_style,
    height=550, width=1200,
)

# Show plot
fig.show()


#### *2.2.2 Performance Rating by Department and Job Involvement*

In [None]:
# Sum of all performance ratings per department, largest total first
rating_by_department = df.groupby('EmpDepartment')['PerformanceRating']
total_rate_deparment = (
    rating_by_department
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
print(total_rate_deparment)

# Mean performance rating for every department/gender pair
rating_by_department_gender = df.groupby(['EmpDepartment', 'Gender'])['PerformanceRating']
avg_rating_gender = rating_by_department_gender.mean().reset_index()
print(avg_rating_gender)

            EmpDepartment  PerformanceRating
0             Development               1114
1                   Sales               1067
2  Research & Development               1002
3         Human Resources                158
4                 Finance                136
5            Data Science                 61
             EmpDepartment  Gender  PerformanceRating
0             Data Science  Female           3.000000
1             Data Science    Male           3.083333
2              Development  Female           3.098592
3              Development    Male           3.077626
4                  Finance  Female           2.681818
5                  Finance    Male           2.851852
6          Human Resources  Female           3.058824
7          Human Resources    Male           2.864865
8   Research & Development  Female           2.945736
9   Research & Development    Male           2.906542
10                   Sales  Female           2.840764
11                   Sales    Male   

In [None]:
# The three departments with the largest rating totals get their own pie slice
top_departments = total_rate_deparment.iloc[:3]

# Every remaining department is folded into a single "Others" slice
others_value = total_rate_deparment["PerformanceRating"].iloc[3:].sum()
others_row = pd.DataFrame({"EmpDepartment": ["Others"], "PerformanceRating": [others_value]})

# Pie input: the top three rows followed by the "Others" row
pie_data = pd.concat([top_departments, others_row])

# Insert line breaks into the long department names so the bar-chart tick
# labels wrap instead of overlapping
wrapped_department_names = {
    "Data Science": "Data<br>Science",
    "Human Resources": "Human<br>Resources",
    "Research & Development": "Research &<br>Development",
}
avg_rating_gender["EmpDepartment"] = avg_rating_gender["EmpDepartment"].replace(wrapped_department_names)

In [None]:
# Two panels side by side: a pie (domain-type cell) and a bar chart
fig = make_subplots(rows=1, cols=2,
    subplot_titles=["Top Total Rating Departments", "Average Performance Rating by Department & Gender"],
    specs=[[{"type": "domain"}, {"type": "bar"}]], horizontal_spacing=0.05)

# Chart 1 (Pie): share of the summed performance rating per department
color =["royalblue", "cornflowerblue", "silver", "lightgrey"]
fig.add_trace(go.Pie(labels=pie_data["EmpDepartment"], values=pie_data["PerformanceRating"],
    textinfo="label+percent", marker_colors = color, showlegend= False,
    domain=dict(x=[0, 0.4], y=[0.3, 0.7])
), row=1, col=1)

# Chart 2 (Stacked Bar): average performance rating by department & gender
# NOTE(review): the bars hold means, so the stacked height (male mean +
# female mean) has no direct interpretation — consider barmode="group".
genders = avg_rating_gender["Gender"].unique()
colors = {"Male": "royalblue", "Female": "silver"}

for gender in genders:
    gender_data = avg_rating_gender[avg_rating_gender["Gender"] == gender]

    fig.add_trace(go.Bar(
        x=gender_data["EmpDepartment"],
        y=gender_data["PerformanceRating"],
        name=gender,
        text=gender_data["PerformanceRating"].round(2),
        textposition='auto',
        marker=dict(color=colors[gender])
    ), row=1, col=2)

title = "<b><span style='color:royalblue;'>Development department has highest performance rating"
# Update the layout.
# The y-axis title font is set through the nested `title.font` attribute: the
# flat `titlefont` shortcut is deprecated and rejected by newer plotly
# releases. The blank title text and its font are set in one place so
# neither update can overwrite the other.
fig.update_layout(title_text= title, title_font_size=24.5, title_font_color ="black",
    template="plotly_white",
    height=550, width=1100, barmode="stack", bargap=0.3,
    yaxis=dict(title=dict(text=" ", font=dict(size=12, color="grey"))),
    margin=dict(l=5, t=185),
    # Merged element-wise into the two subplot-title annotations
    annotations=[
        dict(y=1.03, font=dict(size=15, color='dimgrey'), showarrow=False),
        dict(y=1.03, font=dict(size=15, color='dimgrey'), showarrow=False)],
)

# Add subtitle
fig.add_annotation(
    text="No clear trend of performance rating based on gender",
    xref="paper", yref="paper",
    x=0.0476, y=1.3,
    showarrow=False,
    font=dict(size=23, color="black"),
)


fig.update_xaxes(tickfont=dict(color="dimgrey"), row=1,col=2)
fig.update_yaxes(tickfont=dict(color="dimgrey"), row=1,col=2)
fig.show()


In [None]:
# Mean performance rating for every department / job-involvement pair
rating_by_department_involvement = df.groupby(['EmpDepartment', 'EmpJobInvolvement'])['PerformanceRating']
avg_rating_involve = rating_by_department_involvement.mean().reset_index()
print(avg_rating_involve)

             EmpDepartment  EmpJobInvolvement  PerformanceRating
0             Data Science                  2           3.333333
1             Data Science                  3           3.000000
2             Data Science                  4           3.000000
3              Development                  1           3.176471
4              Development                  2           3.130952
5              Development                  3           3.059633
6              Development                  4           3.095238
7                  Finance                  1           3.000000
8                  Finance                  2           2.833333
9                  Finance                  3           2.720000
10                 Finance                  4           2.800000
11         Human Resources                  1           3.500000
12         Human Resources                  2           2.928571
13         Human Resources                  3           2.843750
14         Human Resource

In [None]:
# Faceted line chart: one panel per department, mean rating vs. job involvement.
# Panels follow the order in which departments appear in the aggregate.
department_order = avg_rating_involve["EmpDepartment"].unique()
fig = px.line(
    avg_rating_involve,
    x="EmpJobInvolvement",
    y="PerformanceRating",
    facet_col="EmpDepartment",
    line_group="EmpDepartment",
    markers=True,
    title="General trend: as <b>job involvement rating increases, performance rating decreases<b>",
    category_orders={"EmpDepartment": department_order},
)


def _strip_facet_prefix(annotation):
    """Keep only the text after the last '=' in an annotation's label."""
    annotation.update(text=annotation.text.split("=")[-1])


# Update facet titles
fig.for_each_annotation(_strip_facet_prefix)

# Palette for the department lines, cycled if there are more traces than colours
colors = ["midnightblue", "slateblue", "royalblue", "cornflowerblue", "lightsteelblue", "turquoise"]
for trace_index, trace in enumerate(fig.data):
    shade = colors[trace_index % len(colors)]
    trace.line.color = shade
    trace.marker.color = shade

# Dotted vertical guide at each job-involvement value (1-4)
for involvement_value in (1, 2, 3, 4):
    fig.add_vline(x=involvement_value, line_width=1, line_dash="dot", line_color="lightgrey")

# Layout: spacing, sizes and axis titles
fig.update_layout(
    template="plotly_white",
    title_font_size=24,
    height=450,
    width=1200,
    margin=dict(r=30, t=130),
    yaxis_title="Avg. Performance Rating",
    showlegend=False,
)
fig.update_xaxes(title_text=None)
fig.update_yaxes(title_font_color="dimgrey")

# Subtitle placed just below the main title, in paper coordinates
subtitle_font = dict(size=16, color="dimgrey")
fig.add_annotation(
    text="EmpDepartment / Emp Job Involvement",
    xref="paper", yref="paper",
    x=-0.018, y=1.23,
    showarrow=False,
    font=subtitle_font,
)

fig.show()


#### *2.2.3 Employee Performance & Workplace Factors: Job Level, Work-Life Balance & Environment Satisfaction*

In [None]:
def _mean_rating_by(factor):
    """Mean PerformanceRating per level of `factor`, rounded to 2 decimals, highest first."""
    mean_by_level = df.groupby(factor)['PerformanceRating'].mean()
    return mean_by_level.round(2).sort_values(ascending=False).reset_index()


# Average performance rating by Emp Job Level
ave_rate_joblev = _mean_rating_by('EmpJobLevel')
print(ave_rate_joblev)

# Average performance rating by Emp Work Life Balance
avg_rating_balance = _mean_rating_by('EmpWorkLifeBalance')
print(avg_rating_balance)

# Average performance rating by Emp Environment Satisfaction
avg_rating_satisfaction = _mean_rating_by('EmpEnvironmentSatisfaction')
print(avg_rating_satisfaction)

   EmpJobLevel  PerformanceRating
0            1               2.99
1            2               2.95
2            3               2.89
3            4               2.89
4            5               2.88
   EmpWorkLifeBalance  PerformanceRating
0                   4               3.12
1                   3               2.95
2                   2               2.92
3                   1               2.75
   EmpEnvironmentSatisfaction  PerformanceRating
0                           3               3.14
1                           4               3.13
2                           1               2.67
3                           2               2.65


In [None]:
# Create subplots: one bar chart per workplace factor
fig = make_subplots(rows=1, cols=3,
                    subplot_titles=("Emp Job Level", "Emp Work Life Balance", "Emp Environment Satisfaction"))

# Colors (one per panel)
colors = ['mediumvioletred', 'midnightblue', 'royalblue']

# Chart 1: Job Level
fig.add_trace(go.Bar(x=ave_rate_joblev["EmpJobLevel"], y=ave_rate_joblev["PerformanceRating"], marker_color=colors[0],
                     text=ave_rate_joblev["PerformanceRating"], textposition='inside', name="Job Level"), row=1, col=1)
# Show a tick for every job level, in the order of the (sorted) aggregate
fig.update_xaxes(tickvals=ave_rate_joblev["EmpJobLevel"].unique(), ticktext=ave_rate_joblev["EmpJobLevel"].unique(),
    categoryorder="array",categoryarray=ave_rate_joblev["EmpJobLevel"].unique(),
    row=1, col=1)

# Chart 2: Work Life Balance
fig.add_trace(go.Bar(x=avg_rating_balance["EmpWorkLifeBalance"], y=avg_rating_balance["PerformanceRating"], marker_color=colors[1],
                     text=avg_rating_balance["PerformanceRating"], textposition='inside', name="Work Life Balance"), row=1, col=2)

# Chart 3: Environment Satisfaction
fig.add_trace(go.Bar(x=avg_rating_satisfaction["EmpEnvironmentSatisfaction"], y=avg_rating_satisfaction["PerformanceRating"], marker_color=colors[2],
                     text=avg_rating_satisfaction["PerformanceRating"], textposition='inside', name="Environment Satisfaction"), row=1, col=3)

# Update layout.
# The annotations list is merged element-wise into the existing subplot-title
# annotations, so it needs one entry per panel. With only two entries the
# third title ("Emp Environment Satisfaction") kept its default position/font.
fig.update_layout(template="plotly_white", margin=dict(l=90, t=160), showlegend=False, height=500, width=1100,
                     annotations=[
                        dict(y=1.03, font=dict(size=15, color='dimgrey'), showarrow=False),
                        dict(y=1.03, font=dict(size=15, color='dimgrey'), showarrow=False),
                        dict(y=1.03, font=dict(size=15, color='dimgrey'), showarrow=False)],
)

# Add 2 titles (stacked headline lines, positioned in paper coordinates)
fig.add_annotation(
    text="Higher job levels show lower performance, possibly due to work pressure/ low motivation,",
    xref="paper", yref="paper",
    x=0.005, y=1.45,
    showarrow=False,
    font=dict(size=19, color="black"),
)
fig.add_annotation(
    text="while <b><span style='color:royalblue;'>better work-life balance & environment satisfaction</span></b> show <span style='color:royalblue;'><b>higher performance",
    xref="paper", yref="paper",
    x=0.005, y=1.35,
    showarrow=False,
    font=dict(size=19, color="black"),
)

# Y-axis styling: only the first panel keeps its tick labels and axis title
fig.update_yaxes(title_text="Avg. Performance Rating", showticklabels=True, title_font_color="dimgrey", row=1, col=1)
fig.update_yaxes(showticklabels=False, row=1, col=2)
fig.update_yaxes(showticklabels=False, row=1, col=3)


fig.show()

In [None]:
# Mean performance rating for every department / job-level pair
rating_by_department_level = df.groupby(['EmpDepartment', 'EmpJobLevel'])['PerformanceRating']
avg_rating_departlevel = rating_by_department_level.mean().reset_index()
print(avg_rating_departlevel)

             EmpDepartment  EmpJobLevel  PerformanceRating
0             Data Science            1           3.125000
1             Data Science            2           3.142857
2             Data Science            3           3.000000
3             Data Science            4           3.000000
4             Data Science            5           2.500000
5              Development            1           3.111765
6              Development            2           3.116505
7              Development            3           3.021739
8              Development            4           3.041667
9              Development            5           2.888889
10                 Finance            1           2.750000
11                 Finance            2           2.894737
12                 Finance            3           2.750000
13                 Finance            4           2.600000
14                 Finance            5           2.000000
15         Human Resources            1           2.8076

In [None]:
# Faceted line chart: one panel per department, mean rating vs. job level.
# The facet order comes from the frame being plotted; this cell previously
# read it from `avg_rating_involve`, an unrelated aggregate built in another
# cell, which made it depend on that cell having been run.
fig = px.line(avg_rating_departlevel,
              x="EmpJobLevel", y="PerformanceRating",
              facet_col="EmpDepartment", line_group="EmpDepartment",
               markers=True,  title="General trend: <b>Higher job levels lead to lower performance rating<b>",
                category_orders={"EmpDepartment": avg_rating_departlevel["EmpDepartment"].unique()}
)

# Update facet titles: keep only the text after "=" in each facet label
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

# Define custom colors for lines
colors = ["midnightblue", "slateblue", "royalblue", "cornflowerblue", "lightsteelblue", "turquoise"]

# Apply custom colors to each department (cycled if there are more traces than colours)
for i, trace in enumerate(fig.data):
    trace.line.color = colors[i % len(colors)]
    trace.marker.color = colors[i % len(colors)]

# Add a dotted vertical guide at every job level present in the data.
# The previous hard-coded range(1, 5) was copied from the job-involvement
# chart and left the highest job level without a guide line.
for level in sorted(avg_rating_departlevel["EmpJobLevel"].unique()):
    fig.add_vline(x=int(level), line_width=1, line_dash="dot", line_color="lightgrey")

# Update layout for better spacing
fig.update_layout(template="plotly_white", title_font_size=24, height=450, width=1200, margin=dict(r=30, t=130),
                  yaxis_title="Avg. Performance Rating", showlegend=False)
fig.update_xaxes(title_text=None)
fig.update_yaxes(title_font_color="dimgrey")

# Add subtitle (adjust position slightly down & aligned with title)
fig.add_annotation(
    text="EmpDepartment / Emp Job Level",
    xref="paper", yref="paper",
    x=-0.018, y=1.23,
    showarrow=False,
    font=dict(size=16, color="dimgrey")
)

fig.show()


#### *2.2.4 Employee Performance & Career Growth, Workload: Promotions, Salary Hikes & Overtime*

In [None]:
# Mean performance rating for each value of years since the last promotion
rating_by_promotion_gap = df.groupby(['YearsSinceLastPromotion'])['PerformanceRating']
avg_rating_promote = rating_by_promotion_gap.mean().reset_index()
print(avg_rating_promote)

# Mean performance rating for each salary-hike percentage, split by gender
rating_by_hike_gender = df.groupby(['EmpLastSalaryHikePercent','Gender'])['PerformanceRating']
avg_rating_salarysex = rating_by_hike_gender.mean().reset_index()
print(avg_rating_salarysex)

    YearsSinceLastPromotion  PerformanceRating
0                         0           3.123667
1                         1           2.898990
2                         2           2.763780
3                         3           2.777778
4                         4           2.773585
5                         5           2.828571
6                         6           2.833333
7                         7           2.854839
8                         8           2.545455
9                         9           2.687500
10                       10           2.600000
11                       11           2.826087
12                       12           2.666667
13                       13           3.125000
14                       14           3.000000
15                       15           2.909091
    EmpLastSalaryHikePercent  Gender  PerformanceRating
0                         11  Female           2.828571
1                         11    Male           2.848485
2                         12  Fem

In [None]:
# Create the figure: two panels side by side
fig = make_subplots(rows=1, cols=2,
                    subplot_titles=("Years Since Last Promotion", "Emp Last Salary Hike (%)"))


# Chart 1: Years since last promotion.
# row/col are passed explicitly, matching how chart 2 targets its panel,
# rather than relying on the default axes.
fig.add_trace(go.Scatter(x=avg_rating_promote["YearsSinceLastPromotion"], y=avg_rating_promote["PerformanceRating"],
    mode="lines+markers", line=dict(color="royalblue", width=2.5), marker=dict(size=5), showlegend=False,
), row=1, col=1)

# Identify important points (rows with the lowest and highest mean rating)
min_point = avg_rating_promote.loc[avg_rating_promote["PerformanceRating"].idxmin()]
max_point = avg_rating_promote.loc[avg_rating_promote["PerformanceRating"].idxmax()]
important_years = [min_point["YearsSinceLastPromotion"], max_point["YearsSinceLastPromotion"]]
important_values = [min_point["PerformanceRating"], max_point["PerformanceRating"]]

# Add larger markers for important points
fig.add_trace(go.Scatter(
    x=important_years,
    y=important_values,
    mode="markers", showlegend=False,
    marker=dict(size=12, color="crimson"),
), row=1, col=1)

# Adjust the x and y ticks of chart 1: one tick per observed year, fixed 0-4 rating scale
fig.update_xaxes(tickvals=avg_rating_promote["YearsSinceLastPromotion"].unique(), ticktext=avg_rating_promote["YearsSinceLastPromotion"].unique(),
    categoryorder="array",categoryarray=avg_rating_promote["YearsSinceLastPromotion"].unique(), title_text="Years",
    row=1, col=1)
fig.update_yaxes(range=[0,4], tickvals=[0,1,2,3,4], ticktext=[0,1,2,3,4], title_text="Ave. Performance Rating",
    row=1, col=1)

# Chart 2: Emp Last Salary Hike Percent, one filled line per gender
genders = avg_rating_salarysex["Gender"].unique()
colors = {"Male": "royalblue", "Female": "silver"}

for gender in genders:
    gender_data = avg_rating_salarysex[avg_rating_salarysex["Gender"] == gender]

    fig.add_trace(go.Scatter(
        x=gender_data["EmpLastSalaryHikePercent"],  # X-axis
        y=gender_data["PerformanceRating"],  # Y-axis
        name=gender,
        fill="tozeroy",
        mode="lines",
        line=dict(color=colors[gender]),
        text=gender_data["PerformanceRating"].round(2),
        hoverinfo="text+x+y"
    ),row=1, col=2)

# Adjust the x and y ticks of chart 2: one tick per observed percentage, same 0-4 scale
fig.update_xaxes(tickvals=avg_rating_salarysex["EmpLastSalaryHikePercent"].unique(), ticktext=avg_rating_salarysex["EmpLastSalaryHikePercent"].unique(),
    categoryorder="array",categoryarray=avg_rating_salarysex["EmpLastSalaryHikePercent"].unique(), title_text="Percent",
    row=1, col=2)
fig.update_yaxes(range=[0,4], tickvals=[0,1,2,3,4], ticktext=[0,1,2,3,4],
    row=1, col=2)

# Update layout; the two annotation dicts are merged into the two subplot titles
fig.update_layout(template="plotly_white", width=1200, height=500,
    margin=dict(l=90, t=150),
    annotations=[
        dict(y=1.03, font=dict(size=15, color='dimgrey'), showarrow=False),
        dict(y=1.03, font=dict(size=15, color='dimgrey'), showarrow=False)])

# Add 2 titles (stacked headline lines, positioned in paper coordinates)
fig.add_annotation(
    text="<b>Higher salary hikes (20-25%) link to better performance</b>, or vice versa?",
    xref="paper", yref="paper",
    x=0.005, y=1.45,
    showarrow=False,
    font=dict(size=20, color="black"),
)
fig.add_annotation(
    text="Recently promoted employees show higher motivation and performance",
    xref="paper", yref="paper",
    x=0.005, y=1.35,
    showarrow=False,
    font=dict(size=20, color="black"),
)

fig.show()


In [None]:
# Mean performance rating for every department / overtime pair
rating_by_department_overtime = df.groupby(['EmpDepartment', 'OverTime'])['PerformanceRating']
avg_rating_overtime = rating_by_department_overtime.mean().reset_index()
print(avg_rating_overtime)

             EmpDepartment OverTime  PerformanceRating
0             Data Science       No           3.000000
1             Data Science      Yes           3.333333
2              Development       No           3.107884
3              Development      Yes           3.041667
4                  Finance       No           2.810811
5                  Finance      Yes           2.666667
6          Human Resources       No           2.850000
7          Human Resources      Yes           3.142857
8   Research & Development       No           2.894737
9   Research & Development      Yes           2.989583
10                   Sales       No           2.830189
11                   Sales      Yes           2.935185


In [None]:
# Start from an empty figure; one bar trace is added per overtime value
fig = go.Figure()

# Bar colour for each overtime value
colors = {"No": "silver", "Yes": "royalblue"}

# sort=False keeps the groups in first-appearance order
for overtime, subset in avg_rating_overtime.groupby("OverTime", sort=False):
    department_bars = go.Bar(
        x=subset["EmpDepartment"],
        y=subset["PerformanceRating"],
        name=overtime,
        text=subset["PerformanceRating"].round(4),
        textposition="auto",
        marker_color=colors[overtime],
    )
    fig.add_trace(department_bars)

# Side-by-side bars per department, fixed 0-4 rating ticks
rating_ticks = [0, 1, 2, 3, 4]
fig.update_layout(
    template="plotly_white",
    barmode="group",
    title="<b>Overtime boosts performance</b>, except in Finance & Development",
    yaxis_title="Avg Performance Rating",
    title_font_size=24,
    title_font_color="black",
    yaxis=dict(tickvals=rating_ticks, ticktext=rating_ticks),
    height=500,
    width=1000,
)

# Subtitle placed just below the main title, in paper coordinates
fig.add_annotation(
    text="EmpDepartment / Overtime",
    xref="paper", yref="paper",
    x=-0.037, y=1.09,
    showarrow=False,
    font=dict(size=17, color="dimgrey"),
)

fig.show()

### **2.3 Correlation heatmap (Numerical variables)**

In [None]:
# Pairwise correlations between the numerical variables, rounded for display
numeric_frame = df[numerical_variables]
corr_matrix = numeric_frame.corr().round(2)

# Annotated heatmap of the correlation matrix
fig = px.imshow(
    corr_matrix,
    text_auto=True,
    title="Correlation Heatmap",
    aspect="auto",
    template="plotly_white",
    color_continuous_scale="YlGnBu",
)

fig.update_layout(title_font_size=24, height=750, width=1200)
fig.show()

### **2.4 Skewness and Kurtosis (Numerical Variables)**

In [None]:
# Skewness and kurtosis of every numerical variable, one row per variable
shape_statistics = ["skew", "kurtosis"]
skewness_kutosis = df[numerical_variables].agg(shape_statistics).T

# Highlight the rows singled out as highly skewed
def highlight_selected_rows(s):
    """Return one CSS style string per cell of row `s`.

    Rows named "ExperienceYearsAtThisCompany" or "YearsSinceLastPromotion"
    get a blue background with black text; every other row gets empty
    strings (no styling).
    """
    highlighted_rows = ("ExperienceYearsAtThisCompany", "YearsSinceLastPromotion")
    if s.name in highlighted_rows:
        cell_style = 'background-color: #7B9EE6; color: black'
    else:
        cell_style = ''
    return [cell_style for _ in range(len(s))]

# Left-align both the header cells (th) and the value cells (td)
left_aligned = [
    {'selector': cell_tag, 'props': [('text-align', 'left')]}
    for cell_tag in ('th', 'td')
]

# Highlight the selected rows, then apply the alignment rules
styled_df = (
    skewness_kutosis.style
    .apply(highlight_selected_rows, axis=1)
    .set_table_styles(left_aligned)
)

display(styled_df)

Unnamed: 0,skew,kurtosis
Age,0.384145,-0.431
DistanceFromHome,0.962956,-0.242017
EmpEducationLevel,-0.250974,-0.635594
EmpEnvironmentSatisfaction,-0.307665,-1.205577
EmpHourlyRate,-0.035165,-1.186891
EmpJobInvolvement,-0.557846,0.36867
EmpJobLevel,1.024053,0.386338
EmpJobSatisfaction,-0.324276,-1.223147
NumCompaniesWorked,1.048635,0.068863
EmpLastSalaryHikePercent,0.808654,-0.299741


## **3. Data Preprocessing**

### **3.1 Check missing values**

In [None]:
# Number of missing values in each column of the dataset
df.isna().sum()

Unnamed: 0,0
Age,0
Gender,0
EducationBackground,0
MaritalStatus,0
EmpDepartment,0
EmpJobRole,0
BusinessTravelFrequency,0
DistanceFromHome,0
EmpEducationLevel,0
EmpEnvironmentSatisfaction,0


### **3.2 Handle outliers**

#### *Detect outliers*

In [None]:
# Count, per numerical column, the values lying more than 1.5 * IQR
# below the first quartile or above the third quartile
outliers_iqr = {}
numeric_frame = df[numerical_variables]
for col in numeric_frame.columns:
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    is_outlier = (values < q1 - fence) | (values > q3 + fence)
    outliers_iqr[col] = values[is_outlier].count()

# Display outliers count per column
outliers_df = pd.DataFrame({'IQR Outliers': outliers_iqr})
print("Outlier counts using IQR:\n", outliers_df)

Outlier counts using IQR:
                               IQR Outliers
Age                                      0
DistanceFromHome                         0
EmpEducationLevel                        0
EmpEnvironmentSatisfaction               0
EmpHourlyRate                            0
EmpJobInvolvement                        0
EmpJobLevel                              0
EmpJobSatisfaction                       0
NumCompaniesWorked                      39
EmpLastSalaryHikePercent                 0
EmpRelationshipSatisfaction              0
TotalWorkExperienceInYears              51
TrainingTimesLastYear                  188
EmpWorkLifeBalance                       0
ExperienceYearsAtThisCompany            56
ExperienceYearsInCurrentRole            16
YearsSinceLastPromotion                 88
YearsWithCurrManager                    11
PerformanceRating                      326


#### *Handle outliers*

In [None]:
# Numerical columns whose outliers are replaced.
# Deliberately excluded: PerformanceRating (the target) and
# ExperienceYearsAtThisCompany / YearsSinceLastPromotion (skewed data; a log transformation is applied later).
outlier_cols = [
    "NumCompaniesWorked",
    "TotalWorkExperienceInYears",
    "TrainingTimesLastYear",
    "ExperienceYearsInCurrentRole",
    "YearsWithCurrManager",
]

# Replace every value outside the 1.5 * IQR fences with the column median
for col in outlier_cols:
    q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    fence = 1.5 * (q3 - q1)
    median_value = df[col].median()
    # The median always lies inside the fences, so one combined mask selects
    # exactly the rows that two successive upper/lower replacements would.
    out_of_range = (df[col] > q3 + fence) | (df[col] < q1 - fence)
    df.loc[out_of_range, col] = median_value



### **3.3 Encode Categorical Variables**

In [None]:
# Columns stored with object dtype are treated as the categorical variables
categorical_variable = df.select_dtypes(include="object").columns

# Replace each category with an integer code; the single encoder is refitted for every column
enc = LabelEncoder()
for cat_col in categorical_variable:
    encoded_codes = enc.fit_transform(df[cat_col])
    df[cat_col] = encoded_codes

df.head()

Unnamed: 0,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,...,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating,AgeGroup
0,32,1,2,2,5,13,2,10,3,4,...,10,2,2,10,7,0,8,0,3,25-34
1,47,1,2,2,5,13,2,14,4,4,...,20,2,3,7,7,1,7,0,3,45-54
2,40,1,1,1,5,13,1,5,4,4,...,20,2,3,18,13,1,12,0,4,35-44
3,41,1,0,0,3,8,2,10,4,2,...,23,2,2,21,6,12,6,0,3,35-44
4,60,1,2,2,5,13,2,16,4,1,...,10,1,3,2,2,2,2,0,3,55+


### **3.4 Feature Transformation**

#### *3.4.1 Log transform "YearsSinceLastPromotion" column*

In [None]:
# log(1 + x) transform of "YearsSinceLastPromotion", stored as a new column
df["YearsSinceLastPromotion_logtransform"] = df["YearsSinceLastPromotion"].pipe(np.log1p)

# Helper producing the points of a normal Q-Q plot
def qq_plot_data(data):
    """Return Q-Q plot coordinates of ``data`` against a normal distribution.

    The result is a DataFrame with two columns: "Theoretical Quantiles"
    and "Ordered Values", both taken from ``scipy.stats.probplot``.
    The fit parameters that ``probplot`` also returns are discarded.
    """
    quantile_pairs = stats.probplot(data, dist="norm")[0]
    theoretical, ordered = quantile_pairs
    return pd.DataFrame({"Theoretical Quantiles": theoretical, "Ordered Values": ordered})

# Q-Q coordinates of the log-transformed column
qq_log = qq_plot_data(df["YearsSinceLastPromotion_logtransform"])

# Scatter of ordered values against theoretical normal quantiles with an OLS trendline
fig_log = px.scatter(
    qq_log,
    x="Theoretical Quantiles",
    y="Ordered Values",
    title="Q-Q Plot: Years Since Last Promotion",
    trendline="ols",
    color_discrete_sequence=["royalblue"],
).update_layout(template="plotly_white", title_font_size=22, height=500, width=600)

fig_log.show()


#### *3.4.2 Log transform "ExperienceYearsAtThisCompany" column*

In [None]:
# Apply Log Transformation: log(1 + x) of "ExperienceYearsAtThisCompany", stored as a new column
df["ExperienceYearsAtThisCompany_logtransform"] = np.log1p(df["ExperienceYearsAtThisCompany"])

# Generate Q-Q Data
# qq_plot_data is the helper defined in section 3.4.1; it used to be re-defined
# verbatim in this cell, which silently shadowed the first definition.
qq_log = qq_plot_data(df["ExperienceYearsAtThisCompany_logtransform"])

# Create Q-Q Plots using Plotly (ordered values vs. theoretical normal quantiles, OLS trendline)
fig_log = px.scatter(qq_log, x="Theoretical Quantiles", y="Ordered Values",
                     title="Q-Q Plot: Experience Years At This Company", trendline="ols")
fig_log.update_layout(template="plotly_white", title_font_size=22, height=500, width=600)

fig_log.show()


### **3.5 Feature Selection**

In [None]:
# Remove the two raw columns that now have log-transformed versions, and the AgeGroup column
redundant_columns = ["YearsSinceLastPromotion", "ExperienceYearsAtThisCompany", "AgeGroup"]
df.drop(columns=redundant_columns, inplace=True)

In [None]:
# Correlation matrix of the encoded frame, rounded to 2 decimals
cor_matrix = df.corr().round(2)

# Keep the columns whose absolute (rounded) correlation with "PerformanceRating" is >= 0.1
target_corr = cor_matrix['PerformanceRating']
is_correlated = target_corr.abs().ge(0.1)
correlated_cols = cor_matrix.index[is_correlated].tolist()
print(correlated_cols)

# Positional index of every selected column inside df
col_indices = list(map(df.columns.get_loc, correlated_cols))
print(col_indices)

['EmpDepartment', 'EmpJobRole', 'EmpEnvironmentSatisfaction', 'EmpLastSalaryHikePercent', 'EmpWorkLifeBalance', 'ExperienceYearsInCurrentRole', 'YearsWithCurrManager', 'PerformanceRating', 'YearsSinceLastPromotion_logtransform', 'ExperienceYearsAtThisCompany_logtransform']
[4, 5, 9, 16, 20, 21, 22, 24, 25, 26]


In [None]:
# Correlation of every column with the target, selected by name:
# a positional index (iloc[:, -3]) would silently show a different column
# as soon as a column is added to or removed from df.
cor_matrix["PerformanceRating"]

Unnamed: 0,PerformanceRating
Age,-0.04
Gender,-0.0
EducationBackground,0.01
MaritalStatus,0.02
EmpDepartment,-0.16
EmpJobRole,-0.1
BusinessTravelFrequency,-0.03
DistanceFromHome,-0.05
EmpEducationLevel,0.02
EmpEnvironmentSatisfaction,0.4


### *Top 3 Important Factors affecting employee performance*

In [None]:
# Rank the features by absolute correlation with the target (target row removed)
# and keep the three strongest
feature_corr = cor_matrix['PerformanceRating'].drop('PerformanceRating')
ranked_corr = feature_corr.abs().sort_values(ascending=False)
top_3_correlated = ranked_corr.iloc[:3]

# Names of the three selected features
top_3_features = list(top_3_correlated.index)
print("Top 3 Important Factors affecting employee performance:", top_3_features)

Top 3 Important Factors affecting employee performance: ['EmpEnvironmentSatisfaction', 'EmpLastSalaryHikePercent', 'YearsSinceLastPromotion_logtransform']


## **4. MACHINE LEARNING MODEL CREATION & EVALUATION**

#### *Train Test Split*

In [None]:
# Split Data
# Features are the columns selected from the correlation matrix (correlated_cols),
# except the target itself. They are picked by name rather than by hard-coded
# positions, so the feature set stays correct if the column order of df changes.
target_col = "PerformanceRating"
feature_cols = [col for col in correlated_cols if col != target_col]
X = df[feature_cols]
y = df[target_col]

# Train-test split: 20% of the rows held out, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### **4.1 Support Vector Machine (SVM)**

In [None]:
# SVM pipeline: standardise the features, then a class-weighted RBF-kernel SVC
svm_steps = [
    ('scaler', StandardScaler()),
    ('model', SVC(kernel='rbf', C=100, class_weight="balanced", random_state=10)),
]
svm_pipeline = imbpipeline(steps=svm_steps)

# Fit on the training split and predict the held-out split
y_pred_svm = svm_pipeline.fit(X_train, y_train).predict(X_test)

# Report accuracy, per-class precision/recall and the confusion matrix
svm_accuracy = round(accuracy_score(y_test, y_pred_svm), 4)
print("SVM Accuracy:", svm_accuracy)
print(classification_report(y_test, y_pred_svm))
svm_cm = confusion_matrix(y_test, y_pred_svm)
print(svm_cm)


SVM Accuracy: 0.8708
              precision    recall  f1-score   support

           2       0.76      0.86      0.81        29
           3       0.93      0.90      0.91       184
           4       0.63      0.70      0.67        27

    accuracy                           0.87       240
   macro avg       0.77      0.82      0.80       240
weighted avg       0.88      0.87      0.87       240

[[ 25   4   0]
 [  8 165  11]
 [  0   8  19]]


In [None]:
# Rating classes used to label the confusion matrix axes
class_labels = [2, 3, 4]
label_names = list(map(str, class_labels))

# Wrap the raw matrix in a DataFrame so rows/columns carry the class labels
svm_cm_df = pd.DataFrame(svm_cm, index=label_names, columns=label_names)

# Heatmap of the confusion matrix (rows: actual, columns: predicted)
fig = px.imshow(
    svm_cm_df,
    text_auto=True,
    color_continuous_scale='Blues',
    labels={'x': 'Predicted', 'y': 'Actual'},
    title="<b>SVM Confusion Matrix",
)
fig.update_layout(height=500, width=500)
fig.show()



#### **4.2 Random Forest (RF)**

In [None]:
# Random forest pipeline: standardise the features, then a class-weighted forest of 200 trees
rf_model = RandomForestClassifier(
    n_estimators=200,
    min_samples_leaf=1,
    min_samples_split=2,
    criterion='gini',
    random_state=33,
    class_weight="balanced",
    n_jobs=-1,
)
rf_pipeline = imbpipeline(steps=[('scaler', StandardScaler()), ('model', rf_model)])

# Fit on the training split and predict the held-out split
y_pred_rf = rf_pipeline.fit(X_train, y_train).predict(X_test)

# Report accuracy, per-class precision/recall and the confusion matrix
rf_accuracy = round(accuracy_score(y_test, y_pred_rf), 4)
print("Random Forest Accuracy:", rf_accuracy)
print(classification_report(y_test, y_pred_rf))
rf_cm = confusion_matrix(y_test, y_pred_rf)
print(rf_cm)



Random Forest Accuracy: 0.9542
              precision    recall  f1-score   support

           2       0.93      0.93      0.93        29
           3       0.95      0.99      0.97       184
           4       1.00      0.74      0.85        27

    accuracy                           0.95       240
   macro avg       0.96      0.89      0.92       240
weighted avg       0.96      0.95      0.95       240

[[ 27   2   0]
 [  2 182   0]
 [  0   7  20]]


In [None]:
# Rating classes used to label the confusion matrix axes
class_labels = [2, 3, 4]
label_names = list(map(str, class_labels))

# Wrap the raw matrix in a DataFrame so rows/columns carry the class labels
rf_cm_df = pd.DataFrame(rf_cm, index=label_names, columns=label_names)

# Heatmap of the confusion matrix (rows: actual, columns: predicted)
fig = px.imshow(
    rf_cm_df,
    text_auto=True,
    color_continuous_scale='Blues',
    labels={'x': 'Predicted', 'y': 'Actual'},
    title="<b>Random Forest Confusion Matrix",
)
fig.update_layout(height=500, width=500)
fig.show()


#### **4.3 Balanced Random Forest (BRF)**

In [None]:
# Balanced random forest pipeline: standardise the features, then a forest of
# 200 depth-limited trees with the "not majority" sampling strategy
brf_model = BalancedRandomForestClassifier(
    sampling_strategy="not majority",
    n_estimators=200,
    max_depth=5,
    replacement=True,
    bootstrap=False,
    random_state=10,
)
brf_pipeline = imbpipeline(steps=[('scaler', StandardScaler()), ('model', brf_model)])

# Fit on the training split and predict the held-out split
y_pred_brf = brf_pipeline.fit(X_train, y_train).predict(X_test)

# Report accuracy, per-class precision/recall and the confusion matrix
brf_accuracy = round(accuracy_score(y_test, y_pred_brf), 4)
print("BRF Accuracy:", brf_accuracy)
print(classification_report(y_test, y_pred_brf))
brf_cm = confusion_matrix(y_test, y_pred_brf)
print(brf_cm)

BRF Accuracy: 0.9542
              precision    recall  f1-score   support

           2       0.96      0.90      0.93        29
           3       0.95      0.99      0.97       184
           4       1.00      0.74      0.85        27

    accuracy                           0.95       240
   macro avg       0.97      0.88      0.92       240
weighted avg       0.96      0.95      0.95       240

[[ 26   3   0]
 [  1 183   0]
 [  0   7  20]]


In [None]:
# Rating classes used to label the confusion matrix axes
class_labels = [2, 3, 4]
label_names = list(map(str, class_labels))

# Wrap the raw matrix in a DataFrame so rows/columns carry the class labels
brf_cm_df = pd.DataFrame(brf_cm, index=label_names, columns=label_names)

# Heatmap of the confusion matrix (rows: actual, columns: predicted)
fig = px.imshow(
    brf_cm_df,
    text_auto=True,
    color_continuous_scale='Blues',
    labels={'x': 'Predicted', 'y': 'Actual'},
    title="<b>Balanced Random Forest Confusion Matrix",
)
fig.update_layout(height=500, width=500)
fig.show()



#### **4.4 Artificial Neural Network [MLP Classifier]**

In [None]:
# ANN pipeline: SMOTE oversampling, feature standardisation, then an MLP
# with two hidden layers of 100 units each
ann_model = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    batch_size=50,
    learning_rate_init=0.01,
    max_iter=2000,
    random_state=10,
)
ann_steps = [
    ('smote', SMOTE(random_state=10)),
    ('scaler', StandardScaler()),
    ('model', ann_model),
]
ann_pipeline = imbpipeline(steps=ann_steps)

# Fit on the training split and predict the held-out split
y_pred_ann = ann_pipeline.fit(X_train, y_train).predict(X_test)

# Report accuracy, per-class precision/recall and the confusion matrix
ann_accuracy = round(accuracy_score(y_test, y_pred_ann), 4)
print("ANN Accuracy:", ann_accuracy)
print(classification_report(y_test, y_pred_ann))
ann_cm = confusion_matrix(y_test, y_pred_ann)
print(ann_cm)



ANN Accuracy: 0.8958
              precision    recall  f1-score   support

           2       0.85      0.79      0.82        29
           3       0.94      0.94      0.94       184
           4       0.66      0.70      0.68        27

    accuracy                           0.90       240
   macro avg       0.82      0.81      0.81       240
weighted avg       0.90      0.90      0.90       240

[[ 23   3   3]
 [  4 173   7]
 [  0   8  19]]


In [None]:
# Rating classes used to label the confusion matrix axes
class_labels = [2, 3, 4]
label_names = list(map(str, class_labels))

# Wrap the raw matrix in a DataFrame so rows/columns carry the class labels
ann_cm_df = pd.DataFrame(ann_cm, index=label_names, columns=label_names)

# Heatmap of the confusion matrix (rows: actual, columns: predicted)
fig = px.imshow(
    ann_cm_df,
    text_auto=True,
    color_continuous_scale='Blues',
    labels={'x': 'Predicted', 'y': 'Actual'},
    title="<b>ANN Confusion Matrix",
)
fig.update_layout(height=500, width=500)
fig.show()



### **4.5 Comparing accuracy scores**

In [None]:
# Compute the accuracy of each model from its test-set predictions.
# The scores were previously typed in by hand from earlier cell outputs, so the
# chart went stale whenever a model, seed or split changed.
model_predictions = {
    'SVM': y_pred_svm,
    'RandomForest': y_pred_rf,
    'BalancedRandomForest': y_pred_brf,
    'ANN': y_pred_ann,
}
models = list(model_predictions)
accuracy = [round(accuracy_score(y_test, y_pred), 4) for y_pred in model_predictions.values()]

# Convert accuracy to percentage
accuracy_pct = [val * 100 for val in accuracy]

# Visualizing accuracy score: bar heights are percentages, matching the
# "Accuracy (%)" axis title and the text labels inside the bars
fig = go.Figure()
fig.add_trace(go.Bar(x=models, y=accuracy_pct,
    marker_color='royalblue',
    text=[f"{val:.2f}%" for val in accuracy_pct],
    textposition='inside'))

# Update layout (y tick labels are hidden; the values are shown inside the bars)
fig.update_layout(template='plotly_white',
    title_text="<b>Accuracy comparison of the four models", title_font_size=24,
    height=500, width=700, showlegend=False,
    yaxis=dict(showticklabels=False),
    yaxis_title="Accuracy (%)")

fig.show()
