<a href="https://colab.research.google.com/github/Navdeep27/AI-ML/blob/main/employee_attrition_knn_classification_with_getdummies_preprocessor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# step 1: import the modules and frameworks

In [137]:
# KNN Classification
from pandas import read_csv
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# step 2: acquire the data

In [138]:
# Location of the training CSV (raw file served from GitHub).
filename = 'https://raw.githubusercontent.com/Navdeep27/AI-ML/refs/heads/main/employee_attrition_knn_classification_train.csv'

# Read the CSV into a DataFrame and print the whole frame for a first look.
df = pd.read_csv(filename)
print(df)

   Age                JobRole  MonthlyIncome  JobSatisfaction  YearsAtCompany  \
0   29        Sales Executive           4800                3               4   
1   35     Research Scientist           6000                4               8   
2   40  Laboratory Technician           3400                2               6   
3   28        Sales Executive           4300                3               3   
4   45                Manager          11000                4              15   
5   25     Research Scientist           3500                1               2   
6   50                Manager          12000                4              20   
7   30        Sales Executive           5000                2               5   
8   37  Laboratory Technician           3100                2               9   
9   26     Research Scientist           4500                3               2   

   Attrition  
0          1  
1          0  
2          0  
3          1  
4          0  
5          1  
6  

# step 3: preprocess the data

In [139]:
# One-hot encode the categorical JobRole column into indicator columns;
# only the listed columns are encoded, the rest are carried over as-is.
categorical_columns = ['JobRole']
preprocess_df = pd.get_dummies(data=df, columns=categorical_columns)
print(preprocess_df)

   Age  MonthlyIncome  JobSatisfaction  YearsAtCompany  Attrition  \
0   29           4800                3               4          1   
1   35           6000                4               8          0   
2   40           3400                2               6          0   
3   28           4300                3               3          1   
4   45          11000                4              15          0   
5   25           3500                1               2          1   
6   50          12000                4              20          0   
7   30           5000                2               5          0   
8   37           3100                2               9          0   
9   26           4500                3               2          1   

   JobRole_Laboratory Technician  JobRole_Manager  JobRole_Research Scientist  \
0                          False            False                       False   
1                          False            False                        True 

# step 4: inspect the data and segregate the input and output

In [140]:
# Target (y): the Attrition label column.
outputy = preprocess_df['Attrition']

# Features (X): every column except the Attrition label.
inputx = preprocess_df.drop(columns=['Attrition'])

print(inputx)
print(outputy)

   Age  MonthlyIncome  JobSatisfaction  YearsAtCompany  \
0   29           4800                3               4   
1   35           6000                4               8   
2   40           3400                2               6   
3   28           4300                3               3   
4   45          11000                4              15   
5   25           3500                1               2   
6   50          12000                4              20   
7   30           5000                2               5   
8   37           3100                2               9   
9   26           4500                3               2   

   JobRole_Laboratory Technician  JobRole_Manager  JobRole_Research Scientist  \
0                          False            False                       False   
1                          False            False                        True   
2                           True            False                       False   
3                          False     

# step 5: split the data for training and testing in ratio 0.7/0.3 and standardize the features

In [141]:
# Hold out 30% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    inputx,
    outputy,
    test_size=0.3,
    random_state=41,
)

# Standardize the features: learn the scaling statistics from the training rows
# only, then apply that same transformation to both the training and test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(X_train)
print(X_test)

[[ 0.42066707 -0.88435671 -1.32287566  0.55337157  1.58113883 -0.40824829
  -0.63245553 -0.63245553]
 [ 0.11070186  0.28138623  1.32287566  0.31127151 -0.63245553 -0.40824829
   1.58113883 -0.63245553]
 [-1.28414158 -0.32158426  0.         -1.14132887 -0.63245553 -0.40824829
   1.58113883 -0.63245553]
 [ 0.88561489 -0.76376262 -1.32287566 -0.17292862  1.58113883 -0.40824829
  -0.63245553 -0.63245553]
 [ 1.66052791  2.29128785  1.32287566  2.00597195 -0.63245553  2.44948974
  -0.63245553 -0.63245553]
 [-0.97417637 -0.40198032  0.         -0.8992288  -0.63245553 -0.40824829
  -0.63245553  1.58113883]
 [-0.81919377 -0.20099016  0.         -0.65712874 -0.63245553 -0.40824829
  -0.63245553  1.58113883]]
[[-1.43912419 -0.72356458 -2.64575131 -1.14132887 -0.63245553 -0.40824829
   1.58113883 -0.63245553]
 [ 2.43544094  2.69326817  1.32287566  3.21647226 -0.63245553  2.44948974
  -0.63245553 -0.63245553]
 [-0.66421116 -0.1205941  -1.32287566 -0.41502868 -0.63245553 -0.40824829
  -0.63245553  1

# step 6 : select the model

In [147]:
# Set k explicitly to 3. scikit-learn's default is n_neighbors=5, which does not
# match the parameters recorded in this notebook's saved output (n_neighbors=3),
# so a fresh Restart & Run All would otherwise not reproduce the saved results.
thismodel = KNeighborsClassifier(n_neighbors=3)

# step 7 : train the model

In [148]:
# Show the hyper-parameters the estimator is configured with.
print(
    "\nThe parameters of the model are\n\n",
    thismodel.get_params(),
)

# Train on the training rows. fit() returns the estimator itself, so printing
# the model afterwards shows the same representation.
thismodel.fit(X_train, y_train)
print(thismodel)


The parameters of the model are

 {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}
KNeighborsClassifier(n_neighbors=3)


# step 8: predict the test data

In [149]:
# Classify the held-out test rows with the trained model.
y_pred = thismodel.predict(X=X_test)

# step 9: print the results

In [150]:
# Translate each numeric prediction into a label: 0 -> "No", anything else -> "Yes".
reslist = ["No" if label == 0 else "Yes" for label in y_pred]
print(reslist)

['Yes', 'No', 'Yes']


# step 10: evaluate the model

In [151]:
# Compare the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.6666666666666666
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3

