In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE  # For handling imbalance

# Load the data
train_data = pd.read_csv('/content/train_LZdllcl.csv')
test_data = pd.read_csv('/content/test_2umaH9m.csv')


In [None]:
# Check missing values
print(train_data.isnull().sum())

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64


In [None]:
#fill missing values
train_data['education'].fillna(train_data['education'].mode()[0], inplace=True)
train_data['previous_year_rating'].fillna(train_data['previous_year_rating'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['education'].fillna(train_data['education'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['previous_year_rating'].fillna(train_data['previous_year_rating'].median(), inplace=True)


In [None]:
#Encoding categorical variables
train_data = pd.get_dummies(train_data, columns=['department', 'region', 'education', 'gender', 'recruitment_channel'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['department', 'region', 'education', 'gender', 'recruitment_channel'], drop_first=True)

In [None]:
#seperate features and target
X = train_data.drop(['employee_id', 'is_promoted'], axis=1)
y = train_data['is_promoted']

# For the test set, we don't have the target, so we'll just use the features
X_test = test_data.drop(['employee_id'], axis=1)


In [None]:
#Handle imbalanced data
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(X, y)

In [None]:
#split data for training and validation
X_train, X_val, y_train, y_val = train_test_split(X_sm, y_sm, test_size=0.2, random_state=42)

In [None]:
#Train a model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
#Evaluate the model
y_pred = model.predict(X_val)
f1 = f1_score(y_val, y_pred)
print("Validation F1 Score:", f1)

Validation F1 Score: 0.9632001626512148


In [None]:
#predict on the test set
predictions = model.predict(X_test)

In [None]:
#prepare submission
submission = pd.DataFrame({
    'employee_id': test_data['employee_id'],
    'is_promoted': predictions
})

submission.to_csv('submission.csv', index=False)
