In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

In [10]:
import pandas as pd
# Column labels applied to the CSV, in file order; the last one is the target.
cols = [
    "Age",
    "Gender",
    "Education Level",
    "Experience Years",
    "Previous Companies",
    "Distance From Company",
    "Interview Score",
    "Skill Score",
    "Personality Score",
    "Recruitment Strategy",
    "Hiring Decision",
]
file_path = 'Recruit.csv'

# NOTE(review): `names=` without `header=` makes pandas treat the first line of
# the file as data, so this assumes Recruit.csv has no header row -- confirm.
df = pd.read_csv(filepath_or_buffer=file_path, names=cols)
print(df.head())


   Age  Gender  Education Level  Experience Years  Previous Companies  \
0   26       1                2                 0                   3   
1   39       1                4                12                   3   
2   48       0                2                 3                   2   
3   34       1                2                 5                   2   
4   30       0                1                 6                   1   

   Distance From Company  Interview Score  Skill Score  Personality Score  \
0              26.783828               48           78                 91   
1              25.862694               35           68                 80   
2               9.920805               20           67                 13   
3               6.407751               36           27                 70   
4              43.105343               23           52                 85   

   Recruitment Strategy  Hiring Decision  
0                     1                1  
1           

In [11]:
# Inspect the class balance of the training portion of each of the 10 folds.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(kf.split(df)):
    # Positional row selection for this fold.
    train, test = df.iloc[train_index], df.iloc[test_index]

    # Label frequencies in the training fold; a missing class counts as 0.
    train_counts = train['Hiring Decision'].value_counts()
    hired_train, not_hired_train = (train_counts.get(label, 0) for label in (1, 0))

    print(f"Fold {fold+1}:")
    print(f"Training data: {train.shape}, Hired (1): {hired_train}, Not Hired (0): {not_hired_train}")
    print(f"Test data: {test.shape}")


Fold 1:
Training data: (1350, 11), Hired (1): 427, Not Hired (0): 923
Test data: (150, 11)
Fold 2:
Training data: (1350, 11), Hired (1): 418, Not Hired (0): 932
Test data: (150, 11)
Fold 3:
Training data: (1350, 11), Hired (1): 421, Not Hired (0): 929
Test data: (150, 11)
Fold 4:
Training data: (1350, 11), Hired (1): 416, Not Hired (0): 934
Test data: (150, 11)
Fold 5:
Training data: (1350, 11), Hired (1): 415, Not Hired (0): 935
Test data: (150, 11)
Fold 6:
Training data: (1350, 11), Hired (1): 425, Not Hired (0): 925
Test data: (150, 11)
Fold 7:
Training data: (1350, 11), Hired (1): 423, Not Hired (0): 927
Test data: (150, 11)
Fold 8:
Training data: (1350, 11), Hired (1): 416, Not Hired (0): 934
Test data: (150, 11)
Fold 9:
Training data: (1350, 11), Hired (1): 415, Not Hired (0): 935
Test data: (150, 11)
Fold 10:
Training data: (1350, 11), Hired (1): 409, Not Hired (0): 941
Test data: (150, 11)


In [12]:
def scale(dataframe, oversample=False, scaler=None, fit=True):
    """Standardise the feature columns of ``dataframe`` and optionally oversample.

    The last column of ``dataframe`` is taken as the label; every other column
    is taken as a feature.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose last column holds the labels.
    oversample : bool, default False
        When True, ``SMOTE(random_state=42)`` resamples the scaled features and
        labels. Intended for training data only.
    scaler : object exposing ``fit_transform`` / ``transform``, optional
        Scaler to use. When None (the default) a new ``StandardScaler`` is
        created, which reproduces the previous behaviour of this function.
        To scale a held-out fold with the statistics of the training fold,
        pass the same instance twice: first with the training frame
        (``fit=True``), then with the held-out frame (``fit=False``).
    fit : bool, default True
        If True the scaler is fitted on ``dataframe`` before transforming it;
        if False the scaler is only applied and must already be fitted.

    Returns
    -------
    data : numpy.ndarray
        Scaled features with the labels appended as the last column.
    x : numpy.ndarray
        Scaled (and, if requested, oversampled) features.
    y : numpy.ndarray
        Labels matching the rows of ``x``.

    Raises
    ------
    ValueError
        If ``fit`` is False and no ``scaler`` was supplied.
    """
    x = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted scaler")
        scaler = StandardScaler()

    # Fitting on a held-out fold would standardise it with its own mean and
    # variance instead of the training statistics, so callers can opt out.
    x = scaler.fit_transform(x) if fit else scaler.transform(x)

    if oversample:
        smote = SMOTE(random_state=42)
        x, y = smote.fit_resample(x, y)

    # Labels become a single trailing column next to the feature matrix.
    data = np.hstack((x, np.reshape(y, (-1, 1))))

    return data, x, y

In [13]:
# Show how SMOTE changes the class balance of each training fold.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(kf.split(df)):
    train, test = df.iloc[train_index], df.iloc[test_index]

    # Training fold: scaled and oversampled. Test fold: scaled only.
    # NOTE(review): each scale() call fits its own StandardScaler, so the test
    # fold is standardised with its own statistics here. Only the label counts
    # are used below, so the printed numbers are unaffected.
    train_data, x_train, y_train = scale(train, oversample=True)
    test_data, x_test, y_test = scale(test, oversample=False)

    # `train.shape` is the pre-SMOTE shape; the counts are post-SMOTE.
    print(f"Fold {fold + 1}:")
    print(f"Training data: {train.shape}, Hired (1): {np.sum(y_train == 1)}, Not Hired (0): {np.sum(y_train == 0)}")
    print(f"Test data: {test.shape}, Hired (1): {np.sum(y_test == 1)}, Not Hired (0): {np.sum(y_test == 0)}")
    print("------------------------------------------------------")

Fold 1:
Training data: (1350, 11), Hired (1): 923, Not Hired (0): 923
Test data: (150, 11), Hired (1): 38, Not Hired (0): 112
------------------------------------------------------
Fold 2:
Training data: (1350, 11), Hired (1): 932, Not Hired (0): 932
Test data: (150, 11), Hired (1): 47, Not Hired (0): 103
------------------------------------------------------
Fold 3:
Training data: (1350, 11), Hired (1): 929, Not Hired (0): 929
Test data: (150, 11), Hired (1): 44, Not Hired (0): 106
------------------------------------------------------
Fold 4:
Training data: (1350, 11), Hired (1): 934, Not Hired (0): 934
Test data: (150, 11), Hired (1): 49, Not Hired (0): 101
------------------------------------------------------
Fold 5:
Training data: (1350, 11), Hired (1): 935, Not Hired (0): 935
Test data: (150, 11), Hired (1): 50, Not Hired (0): 100
------------------------------------------------------
Fold 6:
Training data: (1350, 11), Hired (1): 925, Not Hired (0): 925
Test data: (150, 11), Hir

In [None]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier  
import pandas as pd
# 10-fold cross-validation of a LightGBM classifier. Scaling and SMOTE are
# fitted on the training fold only; the test fold is just transformed.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(kf.split(df)):
    train, test = df.iloc[train_index], df.iloc[test_index]

    # Labels come from the 'Hiring Decision' column of each fold.
    y_train = train['Hiring Decision'].to_numpy()
    y_test = test['Hiring Decision'].to_numpy()

    # Every column except the last is a feature. The scaler learns its
    # statistics from the training fold and is reused unchanged on the test fold.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(train.iloc[:, :-1].to_numpy())
    x_test = scaler.transform(test.iloc[:, :-1].to_numpy())

    # Balance the classes in the training fold only.
    smote = SMOTE(random_state=42)
    x_train, y_train = smote.fit_resample(x_train, y_train)

    lgb_model = LGBMClassifier(random_state=42)
    lgb_model.fit(x_train, y_train)
    y_pred_lgb = lgb_model.predict(x_test)

    print(f"Fold {fold + 1}:")
    print("Classification Report:\n", classification_report(y_test, y_pred_lgb))
    print("------------------------------------------------------")


Fold 1:
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96       112
           1       0.87      0.87      0.87        38

    accuracy                           0.93       150
   macro avg       0.91      0.91      0.91       150
weighted avg       0.93      0.93      0.93       150

------------------------------------------------------
Fold 2:
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94       103
           1       0.87      0.87      0.87        47

    accuracy                           0.92       150
   macro avg       0.91      0.91      0.91       150
weighted avg       0.92      0.92      0.92       150

------------------------------------------------------
Fold 3:
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.92       106
           1       0.84      0.