<a href="https://colab.research.google.com/github/Navdeep27/AI-ML/blob/main/employee_attrition_knn_classification_with_one_hot_encoder_preprocessor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# step 1: import the modules and frameworks

In [None]:
# KNN Classification
from pandas import read_csv
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# step 2: acquire the data

In [None]:
# Location of the raw training CSV hosted on GitHub.
filename = 'https://raw.githubusercontent.com/Navdeep27/AI-ML/refs/heads/main/employee_attrition_knn_classification_train.csv'

# Read the CSV into a DataFrame via the pandas namespace and print it in full.
df = pd.read_csv(filename)
print(df)

   Age                JobRole  MonthlyIncome  JobSatisfaction  YearsAtCompany  \
0   29        Sales Executive           4800                3               4   
1   35     Research Scientist           6000                4               8   
2   40  Laboratory Technician           3400                2               6   
3   28        Sales Executive           4300                3               3   
4   45                Manager          11000                4              15   
5   25     Research Scientist           3500                1               2   
6   50                Manager          12000                4              20   
7   30        Sales Executive           5000                2               5   
8   37  Laboratory Technician           3100                2               9   
9   26     Research Scientist           4500                3               2   

   Attrition  
0          1  
1          0  
2          0  
3          1  
4          0  
5          1  
6  

# step 3: preprocess the data

In [None]:
# Identify categorical features (columns holding string values).
# is_string_dtype matches the classic 'object' dtype as well as pandas' dedicated
# string dtypes, so string columns are still detected when pandas does not infer
# them as 'object' (a plain `dtype == 'object'` test would then return []).
categorical_features = [col for col in df.columns
                        if pd.api.types.is_string_dtype(df[col].dtype)]
print(categorical_features)

# Build a ColumnTransformer that one-hot encodes the categorical columns and
# passes every other column (including the Attrition target) through unchanged.
# sparse_threshold=0 makes the transformer always return a dense array; with the
# default threshold a sufficiently sparse one-hot result comes back as a SciPy
# sparse matrix, which the DataFrame construction below cannot consume.
encoder = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), categorical_features)],
                            remainder='passthrough',
                            sparse_threshold=0)

print(encoder)
# Fit the transformer on the DataFrame and transform it in one step
encoded_data = encoder.fit_transform(df)
print(encoded_data)

# Convert the encoded array back to a pandas DataFrame. Column names come from the
# transformer: 'encoder__<column>_<category>' for one-hot columns and
# 'remainder__<column>' for passed-through columns.
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(df.columns))
print(encoded_df)

['JobRole']
ColumnTransformer(remainder='passthrough',
                  transformers=[('encoder', OneHotEncoder(), ['JobRole'])])
[[0.0e+00 0.0e+00 0.0e+00 1.0e+00 2.9e+01 4.8e+03 3.0e+00 4.0e+00 1.0e+00]
 [0.0e+00 0.0e+00 1.0e+00 0.0e+00 3.5e+01 6.0e+03 4.0e+00 8.0e+00 0.0e+00]
 [1.0e+00 0.0e+00 0.0e+00 0.0e+00 4.0e+01 3.4e+03 2.0e+00 6.0e+00 0.0e+00]
 [0.0e+00 0.0e+00 0.0e+00 1.0e+00 2.8e+01 4.3e+03 3.0e+00 3.0e+00 1.0e+00]
 [0.0e+00 1.0e+00 0.0e+00 0.0e+00 4.5e+01 1.1e+04 4.0e+00 1.5e+01 0.0e+00]
 [0.0e+00 0.0e+00 1.0e+00 0.0e+00 2.5e+01 3.5e+03 1.0e+00 2.0e+00 1.0e+00]
 [0.0e+00 1.0e+00 0.0e+00 0.0e+00 5.0e+01 1.2e+04 4.0e+00 2.0e+01 0.0e+00]
 [0.0e+00 0.0e+00 0.0e+00 1.0e+00 3.0e+01 5.0e+03 2.0e+00 5.0e+00 0.0e+00]
 [1.0e+00 0.0e+00 0.0e+00 0.0e+00 3.7e+01 3.1e+03 2.0e+00 9.0e+00 0.0e+00]
 [0.0e+00 0.0e+00 1.0e+00 0.0e+00 2.6e+01 4.5e+03 3.0e+00 2.0e+00 1.0e+00]]
   encoder__JobRole_Laboratory Technician  encoder__JobRole_Manager  \
0                                     0.0      

# step 4: inspect the data and segregate the input and output

In [None]:
# Separate the model inputs from the target column.
# Passed-through columns carry the 'remainder__' prefix added by the
# ColumnTransformer, hence the target's name 'remainder__Attrition'.
inputx = encoded_df.drop(columns='remainder__Attrition')

# The encoded frame is built from a float array, so the 0/1 labels arrive as
# 0.0/1.0. Cast them back to integers so the classifier and the evaluation
# report work with proper class labels (0 / 1) rather than floats.
outputy = encoded_df['remainder__Attrition'].astype(int)

print(inputx)
print(outputy)

   encoder__JobRole_Laboratory Technician  encoder__JobRole_Manager  \
0                                     0.0                       0.0   
1                                     0.0                       0.0   
2                                     1.0                       0.0   
3                                     0.0                       0.0   
4                                     0.0                       1.0   
5                                     0.0                       0.0   
6                                     0.0                       1.0   
7                                     0.0                       0.0   
8                                     1.0                       0.0   
9                                     0.0                       0.0   

   encoder__JobRole_Research Scientist  encoder__JobRole_Sales Executive  \
0                                  0.0                               1.0   
1                                  1.0                            

# step 5: split the data for training and testing in a 0.7/0.3 ratio and standardize the features

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    inputx,
    outputy,
    random_state=41,
    test_size=0.3,
)

# Standardize the features: learn the scaling statistics from the training rows only,
# then apply that same fitted scaler to both partitions.
# (RobustScaler is imported as well and could be swapped in here instead.)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(X_train)
print(X_test)

[[ 1.58113883 -0.40824829 -0.63245553 -0.63245553  0.42066707 -0.88435671
  -1.32287566  0.55337157]
 [-0.63245553 -0.40824829  1.58113883 -0.63245553  0.11070186  0.28138623
   1.32287566  0.31127151]
 [-0.63245553 -0.40824829  1.58113883 -0.63245553 -1.28414158 -0.32158426
   0.         -1.14132887]
 [ 1.58113883 -0.40824829 -0.63245553 -0.63245553  0.88561489 -0.76376262
  -1.32287566 -0.17292862]
 [-0.63245553  2.44948974 -0.63245553 -0.63245553  1.66052791  2.29128785
   1.32287566  2.00597195]
 [-0.63245553 -0.40824829 -0.63245553  1.58113883 -0.97417637 -0.40198032
   0.         -0.8992288 ]
 [-0.63245553 -0.40824829 -0.63245553  1.58113883 -0.81919377 -0.20099016
   0.         -0.65712874]]
[[-0.63245553 -0.40824829  1.58113883 -0.63245553 -1.43912419 -0.72356458
  -2.64575131 -1.14132887]
 [-0.63245553  2.44948974 -0.63245553 -0.63245553  2.43544094  2.69326817
   1.32287566  3.21647226]
 [-0.63245553 -0.40824829 -0.63245553  1.58113883 -0.66421116 -0.1205941
  -1.32287566 -0.

# step 6 : select the model

In [None]:
# Number of nearest neighbours that vote on each prediction.
N_NEIGHBORS = 3

# K-nearest-neighbours classifier; all other hyperparameters keep their defaults.
thismodel = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

# step 7 : train the model

In [None]:
# Show the hyperparameters the classifier was configured with.
model_params = thismodel.get_params()
print("\nThe parameters of the model are\n\n", model_params)

# Fit on the scaled training features and their labels; fit() returns the
# estimator itself, which is printed as confirmation.
fitted_model = thismodel.fit(X_train, y_train)
print(fitted_model)


The parameters of the model are

 {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}
KNeighborsClassifier(n_neighbors=3)


# step 8: predict the test data

In [None]:
# Predict class labels for the held-out test rows using the fitted KNN model.
# X_test is the scaled test feature array produced in the split/scale step;
# y_pred is reused below for the Yes/No printout and the evaluation metrics.
y_pred = thismodel.predict(X_test)

# step 9: print the results

In [None]:
# Translate each predicted label into a readable answer:
# 0 means the employee stays ("No" attrition), anything else means "Yes".
reslist = ["No" if label == 0 else "Yes" for label in y_pred]
print(reslist)

['Yes', 'No', 'Yes']


# step 10: evaluate the model

In [None]:
# Overall fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall, F1 and support for the same predictions.
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.6666666666666666
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.50      0.67         2
         1.0       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3

