In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
df=pd.read_csv("employee_attrition.csv")
print(df.head())

   Age Department  Years_At_Company  Salary Overtime  JobSatisfaction  \
0   29      Sales                12      38       No                3   
1   23    Finance                12      98       No                1   
2   39       Tech                19      46       No                1   
3   37    Finance                14      46       No                5   
4   36       Tech                16      90       No                4   

  Attrition  
0        No  
1       Yes  
2        No  
3       Yes  
4       Yes  


In [3]:
print(df.isnull().sum())

Age                 0
Department          0
Years_At_Company    0
Salary              0
Overtime            0
JobSatisfaction     0
Attrition           0
dtype: int64


In [4]:
#le=LabelEncoder()
#df['Department'] = le.fit_transform(df['Department'])
df['Overtime'] = df['Overtime'].map({'No': 0, 'Yes': 1})

print(df.head())

   Age Department  Years_At_Company  Salary  Overtime  JobSatisfaction  \
0   29      Sales                12      38         0                3   
1   23    Finance                12      98         0                1   
2   39       Tech                19      46         0                1   
3   37    Finance                14      46         0                5   
4   36       Tech                16      90         0                4   

  Attrition  
0        No  
1       Yes  
2        No  
3       Yes  
4       Yes  


In [5]:
#one hot codind for department columns
df_encoded=pd.get_dummies(df,columns=['Department'],drop_first=True,dtype=int)
print(df_encoded.head())

   Age  Years_At_Company  Salary  Overtime  JobSatisfaction Attrition  \
0   29                12      38         0                3        No   
1   23                12      98         0                1       Yes   
2   39                19      46         0                1        No   
3   37                14      46         0                5       Yes   
4   36                16      90         0                4       Yes   

   Department_HR  Department_Sales  Department_Tech  
0              0                 1                0  
1              0                 0                0  
2              0                 0                1  
3              0                 0                0  
4              0                 0                1  


In [6]:
#Split the dataset into features (X) and target (y)
X=df_encoded.drop(columns=['Overtime','Attrition'])
y=df_encoded['Attrition']	
print(X.head())
print(X.shape)
print(y.head())
print(y.shape)

   Age  Years_At_Company  Salary  JobSatisfaction  Department_HR  \
0   29                12      38                3              0   
1   23                12      98                1              0   
2   39                19      46                1              0   
3   37                14      46                5              0   
4   36                16      90                4              0   

   Department_Sales  Department_Tech  
0                 1                0  
1                 0                0  
2                 0                1  
3                 0                0  
4                 0                1  
(50, 7)
0     No
1    Yes
2     No
3    Yes
4    Yes
Name: Attrition, dtype: object
(50,)


In [7]:
#Perform train-test split (70% train, 30% test).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(35, 7)
(15, 7)
(35,)
(15,)


In [10]:
#Step 3: Model Building
#Step 3: Model Building
#	7. Train a Random Forest Classifier on the training set.
#8.	8. Print the number of trees and features used in training.
from sklearn.ensemble import RandomForestClassifier

In [11]:
model=RandomForestClassifier(random_state=42)
model.fit(X_train,y_train)

In [12]:
print("number of trees used in the model:",model.n_estimators)
print("number of  features  used in the model:",model.n_features_in_)

number of trees used in the model: 100
number of  features  used in the model: 7


In [23]:
#Step 4: Model Evaluation:

#Predict on the test set.

y_pred=model.predict(X_test)
print("first 10 predictions on the test set:")
print(y_pred[: 10])

#Evaluate performance using accuracy, confusion matrix, and classification report.
print("Evaluation metrics:")

#accuracy score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accu=accuracy_score(y_test,y_pred)
print(f"Accuracy:{accu:.2f}")

#confusion matrix
conf_matrix= confusion_matrix(y_test,y_pred)
print("\nconfusion matrix:",conf_matrix)

#classification report
class_report=classification_report(y_test,y_pred)
print("\nclassification report:",class_report)

first 10 predictions on the test set:
['Yes' 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'No' 'Yes']
Evaluation metrics:
Accuracy:0.53

confusion matrix: [[3 4]
 [3 5]]

classification report:               precision    recall  f1-score   support

          No       0.50      0.43      0.46         7
         Yes       0.56      0.62      0.59         8

    accuracy                           0.53        15
   macro avg       0.53      0.53      0.52        15
weighted avg       0.53      0.53      0.53        15



In [24]:
#Step 5: Model Tuning
#11.	11. Experiment with n_estimators, max_depth, and criterion. Compare results.
#12.	12. Check feature importances and identify the most important predictors of attrition.

# 3. Experiment with Different Models
models_to_test = {
    "Model 1: Default": {},
    "Model 2: High Estimators": {"n_estimators": 200, "random_state": 42},
    "Model 3: Deeper Trees": {"max_depth": 10, "random_state": 42},
    "Model 4: Entropy Criterion": {"criterion": "entropy", "random_state": 42},
    "Model 5: Combination": {"n_estimators": 150, "max_depth": 8, "criterion": "entropy", "random_state": 42}
}

for name, params in models_to_test.items():
    print(f"--- Running {name} ---")
    
    # Initialize and train the model with specific hyperparameters
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate and print results
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    print(f"Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)
    print("\n" + "="*50 + "\n")

--- Running Model 1: Default ---
Accuracy: 0.60
Confusion Matrix:
[[3 4]
 [2 6]]
Classification Report:
              precision    recall  f1-score   support

          No       0.60      0.43      0.50         7
         Yes       0.60      0.75      0.67         8

    accuracy                           0.60        15
   macro avg       0.60      0.59      0.58        15
weighted avg       0.60      0.60      0.59        15



--- Running Model 2: High Estimators ---
Accuracy: 0.53
Confusion Matrix:
[[3 4]
 [3 5]]
Classification Report:
              precision    recall  f1-score   support

          No       0.50      0.43      0.46         7
         Yes       0.56      0.62      0.59         8

    accuracy                           0.53        15
   macro avg       0.53      0.53      0.52        15
weighted avg       0.53      0.53      0.53        15



--- Running Model 3: Deeper Trees ---
Accuracy: 0.53
Confusion Matrix:
[[3 4]
 [3 5]]
Classification Report:
              pre

In [25]:
# 4. Get Feature Importances
importances = model.feature_importances_


feature_importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': importances
}).sort_values('importance', ascending=False).reset_index(drop=True)


print("Feature Importances:")
print(feature_importance_df)

Feature Importances:
            feature  importance
0  Years_At_Company    0.263652
1            Salary    0.239936
2               Age    0.214660
3   JobSatisfaction    0.174765
4     Department_HR    0.037523
5   Department_Tech    0.035920
6  Department_Sales    0.033543


In [None]:
#Which features influence employee attrition the most?
by the above analysis  Years_At_Company ,Salary , Age are the features influence employee attrition the most