In [1]:
# creating a predictive model for employee churn using advanced Python techniques.
# steps we will follow:
# loading the dataset
# data preprocessing
# feature engineering
# model selection
# training
# evaluation.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
# load Data
from google.colab import files
uploaded = files.upload()

Saving 0_HR_Data_csv.csv to 0_HR_Data_csv.csv


In [7]:
# convert to dataframe
data = pd.read_csv("0_HR_Data_csv.csv")
#load to dataframe
df = pd.DataFrame(data)

In [9]:
# Encode categorical variables
label_encoder = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

In [10]:
# Split features and target variable
X = df.drop(columns=['Attrition'])
y = df['Attrition']


In [11]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Scale numerical features
scaler = StandardScaler()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [13]:
# Initialize and train Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)


In [14]:
# Predictions
y_pred = rf_classifier.predict(X_test)


In [16]:
# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Accuracy: 1.0

Confusion Matrix:
 [[253   0]
 [  0  41]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       253
           1       1.00      1.00      1.00        41

    accuracy                           1.00       294
   macro avg       1.00      1.00      1.00       294
weighted avg       1.00      1.00      1.00       294



In [17]:
# achieving 100% accuracy might also indicate potential issues such as overfitting,
# especially if the dataset is relatively small or if there are data leakage problems.
# It's essential to further investigate and validate the model's performance, possibly using cross-validation
# or testing it on a separate dataset.

In [38]:
# in this case the data , being that it has already been cleaned and set up in the way that suits this kind of model ,
# in real life this data set is not found