In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
test_df = pd.read_csv(r"C:\Users\hp\Downloads\test_2umaH9m.csv")
train_df = pd.read_csv(r"C:\Users\hp\Downloads\train_LZdllcl.csv")
submission_df = pd.read_csv(r"C:\Users\hp\Downloads\sample_submission_M0L0uXE.csv")

In [3]:
# Display initial information on train and test datasets
train_info = train_df.info(), train_df.describe(), train_df.head()
test_info = test_df.info(), test_df.describe(), test_df.head()

train_info, test_info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52400 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  int64  
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  KPIs_met >80%         54808 non-null  int64  
 11  awards_won?           54808 non-null  int64  
 12  avg_training_score    54808 non-null  int64  
 13  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.9+ MB
<class 'panda

((None,
          employee_id  no_of_trainings           age  previous_year_rating  \
  count  54808.000000     54808.000000  54808.000000          50684.000000   
  mean   39195.830627         1.253011     34.803915              3.329256   
  std    22586.581449         0.609264      7.660169              1.259993   
  min        1.000000         1.000000     20.000000              1.000000   
  25%    19669.750000         1.000000     29.000000              3.000000   
  50%    39225.500000         1.000000     33.000000              3.000000   
  75%    58730.500000         1.000000     39.000000              4.000000   
  max    78298.000000        10.000000     60.000000              5.000000   
  
         length_of_service  KPIs_met >80%   awards_won?  avg_training_score  \
  count       54808.000000   54808.000000  54808.000000        54808.000000   
  mean            5.865512       0.351974      0.023172           63.386750   
  std             4.265094       0.477590      0.1

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np



In [5]:
# Separate features and target in training data
X_train = train_df.drop(['employee_id', 'is_promoted'], axis=1)
y_train = train_df['is_promoted']

In [6]:
# Drop 'employee_id' column in test data as well
X_test = test_df.drop(['employee_id'], axis=1)

In [7]:
# Define preprocessing functions
def preprocess_data(df, is_train=True):
    # Handling missing values
    imputer_mode = SimpleImputer(strategy='most_frequent')
    imputer_mean = SimpleImputer(strategy='mean')
    
    # Fill categorical 'education' and numeric 'previous_year_rating'
    df['education'] = imputer_mode.fit_transform(df[['education']])
    df['previous_year_rating'] = imputer_mean.fit_transform(df[['previous_year_rating']])
    
    # Encoding categorical variables
    for col in ['department', 'region', 'education', 'gender', 'recruitment_channel']:
        df[col] = LabelEncoder().fit_transform(df[col])
        
    return df

In [8]:
# Apply preprocessing to train and test datasets
X_train_processed = preprocess_data(X_train)
X_test_processed = preprocess_data(X_test, is_train=False)

# Scale numeric features
scaler = StandardScaler()
numeric_cols = ['age', 'length_of_service', 'avg_training_score', 'no_of_trainings', 'previous_year_rating']
X_train_processed[numeric_cols] = scaler.fit_transform(X_train_processed[numeric_cols])
X_test_processed[numeric_cols] = scaler.transform(X_test_processed[numeric_cols])

# Show processed data head for verification
X_train_processed.head(), X_test_processed.head()

(   department  region  education  gender  recruitment_channel  \
 0           7      31          2       0                    2   
 1           4      14          0       1                    0   
 2           7      10          0       1                    2   
 3           7      15          0       1                    0   
 4           8      18          0       1                    0   
 
    no_of_trainings       age  previous_year_rating  length_of_service  \
 0        -0.415276  0.025598              1.378900           0.500460   
 1        -0.415276 -0.627135              1.378900          -0.437395   
 2        -0.415276 -0.104948             -0.271742           0.265996   
 3         1.226063  0.547785             -1.922383           0.969387   
 4        -0.415276  1.331064             -0.271742          -0.906322   
 
    KPIs_met >80%  awards_won?  avg_training_score  
 0              1            0           -1.075931  
 1              0            0           -0.253282

In [9]:
# Split the training data for validation
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_processed, y_train, test_size=0.2, random_state=42)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Define and train a Random Forest Classifier model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_split, y_train_split)

RandomForestClassifier(random_state=42)

In [11]:
# Predict on validation set and evaluate
y_val_pred_rf = rf_model.predict(X_val)
rf_accuracy = accuracy_score(y_val, y_val_pred_rf)
rf_f1 = f1_score(y_val, y_val_pred_rf)

print("Decision Tree Validation Accuracy:", rf_accuracy)
print("Decision Tree Validation F1 Score:", rf_f1)

Decision Tree Validation Accuracy: 0.9358693669038497
Decision Tree Validation F1 Score: 0.42235004108463436


In [12]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_split, y_train_split)

# Predict on validation set and evaluate
y_val_pred_dt = dt_model.predict(X_val)
dt_accuracy = accuracy_score(y_val, y_val_pred_dt)
dt_f1 = f1_score(y_val, y_val_pred_dt)

print("Decision Tree Validation Accuracy:", dt_accuracy)
print("Decision Tree Validation F1 Score:", dt_f1)

Decision Tree Validation Accuracy: 0.8968253968253969
Decision Tree Validation F1 Score: 0.4214833759590792


In [13]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=5)  # Start with n_neighbors=5 as a baseline
knn_model.fit(X_train_split, y_train_split)

# Predict on validation set and evaluate
y_val_pred_knn = knn_model.predict(X_val)
knn_accuracy = accuracy_score(y_val, y_val_pred_knn)
knn_f1 = f1_score(y_val, y_val_pred_knn)

print("KNN Validation Accuracy:", knn_accuracy)
print("KNN Validation F1 Score:", knn_f1)


KNN Validation Accuracy: 0.9175332968436417
KNN Validation F1 Score: 0.13904761904761903


#### Random Forest gives more accuracy than other 2

In [14]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', None]
}

# Initialize the GridSearchCV with F1 scoring to address imbalance
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring='f1', cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train_split, y_train_split)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'max_depth': [10, 20, None],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 200]},
             scoring='f1', verbose=1)

In [15]:
# Retrieve best model
best_rf_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}


In [16]:
# Evaluate best model on validation set
y_val_pred_best = best_rf_model.predict(X_val)
best_accuracy = accuracy_score(y_val, y_val_pred_best)
best_f1 = f1_score(y_val, y_val_pred_best)
print("Best Validation Accuracy:", best_accuracy)
print("Best Validation F1 Score:", best_f1)

Best Validation Accuracy: 0.9190841087392811
Best Validation F1 Score: 0.4554941682013505


In [17]:
# Predict on the test dataset
y_test_pred = best_rf_model.predict(X_test_processed)

# Load the sample submission file and replace the target column with predictions
submission_df['is_promoted'] = y_test_pred

In [20]:
# Save the predictions to a new CSV file
submission_df.to_csv("C:/Users/hp/Downloads/submission_predictions.csv", index=False)
print("Predictions saved to submission_predictions.csv")

Predictions saved to submission_predictions.csv
