In [1]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import f1_score,classification_report,accuracy_score

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

import warnings
warnings.filterwarnings('ignore')

In [3]:
from xgboost import XGBClassifier

In [5]:
#Load Data
train = pd.read_csv(r"C:\Users\infan\OneDrive\Desktop\Gayathri\dataset\Promotion_prediction_data\train.csv")
test=pd.read_csv(r"C:\Users\infan\OneDrive\Desktop\Gayathri\dataset\Promotion_prediction_data\test.csv")
submission_data=pd.read_csv(r"C:\Users\infan\OneDrive\Desktop\Gayathri\dataset\Promotion_prediction_data\sample_submission.csv")

In [7]:
#columns based on datatypes seperated
ignore_col=['employee_id']
target_col=['is_promoted']
num_col=[]
cat_col=[]
for col in train.columns:
    if col not in ignore_col + target_col:
        if train[col].dtypes == 'object':
            cat_col.append(col)
        else:
            num_col.append(col)

In [9]:
#categorical and numerical columns imputed and encoded
#categorical columns imputed by mode and encoded by onehotencoder
#numerical columns imputed by median and encoded by standardscalar
cat_pipe_encode=Pipeline(
    steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('ohe',OneHotEncoder(handle_unknown='ignore'))
])
num_pip_encode=Pipeline(
    steps=[
    ('impute',SimpleImputer(strategy='median')),
    ('standardscalar',StandardScaler())
])
## map tranformation to features
preprocess=ColumnTransformer(
    transformers=[
        ('cat_encode',cat_pipe_encode,cat_col),
        ('num_encode',num_pip_encode,num_col)
    ]
)

In [11]:
X=train.drop(columns=target_col+ignore_col)
y=train[target_col]

In [13]:
train_X, val_X, train_y, val_y = train_test_split(X,y,random_state=42, test_size=0.3)

In [15]:
def evalution(train_X,train_y,val_X,val_y,model):
    predicted_train_tgt = model.predict(train_X)
    predicted_val_tgt = model.predict(val_X)
    
    # print("accuracy score train",accuracy_score(train_y,predicted_train_tgt))
    # print("accuracy score test",accuracy_score(val_y,predicted_val_tgt))
    
    print("f1score train",f1_score(train_y,predicted_train_tgt))
    print("f1score test",f1_score(val_y,predicted_val_tgt))
    
    # print("classification train \n",classification_report(train_y,predicted_train_tgt))
    # print("classification test \n",classification_report(val_y,predicted_val_tgt))

In [117]:
model = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', XGBClassifier(
            colsample_bytree=0.4,
            gamma=0.111,
            learning_rate=0.4555,
            max_depth=2,
            n_estimators=950,
            subsample=0.6
        ))
    ]
)

model.fit(train_X, train_y)
evalution(train_X, train_y, val_X, val_y, model)

f1score train 0.535850657615613
f1score test 0.5005302226935313


In [118]:
submission_data['is_promoted']=model.predict(test)
submission_data.to_csv('model.csv', index=False)

In [None]:
Your Score is 0.4991119005328597
Congratulations you are placed #904 on the leader board

model = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', XGBClassifier(
            colsample_bytree=0.6,
            gamma=0,
            learning_rate=0.4555,
            max_depth=2,
            n_estimators=950,
            subsample=0.6
        ))
    ]
)

model.fit(train_X, train_y)
evalution(train_X, train_y, val_X, val_y, model)


In [147]:
over_sampling=RandomOverSampler()
train_X_over_sampled,train_y_over_sampled=over_sampling.fit_resample(train_X,train_y)
over_balanced_data=pd.concat([train_X_over_sampled,train_y_over_sampled] , axis =1)
print(over_balanced_data['is_promoted'].value_counts())
over_balanced_data.head()

# train_X, val_X, train_y, val_y = train_test_split(train_X_over_sampled,train_y_over_sampled,random_state=42, test_size=0.3)

# model.fit(train_X_over_sampled, train_y_over_sampled)
# evalution(train_X_over_sampled, train_y_over_sampled, val_X, val_y, model)

# submission_data['is_promoted']=model.predict(test)
# submission_data.to_csv('model.csv', index=False)

is_promoted
0    8481
1    8481
Name: count, dtype: int64


Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,Finance,region_15,Bachelor's,m,sourcing,1,37,5.0,9,1,0,62,0
1,Sales & Marketing,region_23,Bachelor's,m,other,1,26,5.0,2,0,0,59,1
2,Operations,region_2,Below Secondary,m,sourcing,3,25,5.0,2,1,0,61,0
3,Procurement,region_22,Master's & above,m,other,1,31,5.0,5,1,0,67,1
4,Sales & Marketing,region_22,Bachelor's,m,other,2,25,,1,0,0,50,0


In [153]:
under_sampling=RandomUnderSampler()
train_X_under_sampled,train_y_under_sampled=under_sampling.fit_resample(train_X,train_y)
under_balanced_data=pd.concat([train_X_under_sampled,train_y_under_sampled] , axis =1)
print(under_balanced_data['is_promoted'].value_counts())
under_balanced_data.head()

train_X, val_X, train_y, val_y = train_test_split(train_X_under_sampled,train_y_under_sampled,random_state=42, test_size=0.3)

model.fit(train_X_under_sampled, train_y_under_sampled)
evalution(train_X_under_sampled, train_y_under_sampled, val_X, val_y, model)

submission_data['is_promoted']=model.predict(test)
submission_data.to_csv('model.csv', index=False)

is_promoted
0    5863
1    5863
Name: count, dtype: int64
f1score train 0.8874465987512323
f1score test 0.8773796192609182
