In [1]:
#import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import f1_score,classification_report

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

import warnings
warnings.filterwarnings('ignore')

In [2]:
from xgboost import XGBClassifier

In [3]:
#Load Data
# Folder that holds the competition CSV files. Change this one constant to
# point the notebook at a different location; every read below derives from it.
DATA_DIR = r"C:\Users\infan\OneDrive\Desktop\Gayathri\dataset\Promotion_prediction_data"

# Training rows, the rows to be scored, and the submission template.
train = pd.read_csv(DATA_DIR + r"\train.csv")
test = pd.read_csv(DATA_DIR + r"\test.csv")
submission_data = pd.read_csv(DATA_DIR + r"\sample_submission.csv")

In [4]:
def rnn(n_neighbors=5):
    """Impute the missing values of the global ``train`` frame in place.

    ``education`` nulls are filled with the most frequent value of that
    column. ``previous_year_rating`` nulls are predicted by a
    ``KNeighborsClassifier`` fitted on the rows whose rating is known; the
    features are the one-hot encoded object-dtype columns and the
    standard-scaled remaining columns (``employee_id`` and ``is_promoted``
    are left out).

    The value counts of both columns are printed before and after imputation.
    Calling the function again once nothing is missing is a no-op apart from
    the printed report.

    Parameters
    ----------
    n_neighbors : int, default 5
        Number of neighbours used by the KNN classifier.

    Returns
    -------
    None
        ``train`` is modified in place.
    """

    def _report():
        # Value counts of the two columns that receive imputed values.
        print("col:Previous_year_rating",train['previous_year_rating'].value_counts())
        print("col:education",train['education'].value_counts())

    # Value counts before imputation
    _report()
    print("*********************************************************")

    # Separate the feature columns by dtype; ids, the label and the column
    # being imputed are not used as features.
    ignore_col=['employee_id','is_promoted']
    target_col=['previous_year_rating']
    excluded = set(ignore_col + target_col)
    feature_cols = [col for col in train.columns if col not in excluded]
    cat_col = [col for col in feature_cols if train[col].dtypes == 'object']
    num_col = [col for col in feature_cols if train[col].dtypes != 'object']

    print("categorical columns",cat_col)
    print("numerical columns",num_col)
    print("*********************************************************")

    # Categorical column imputation: fill with the mode. `mode()` is empty
    # when the column holds no non-null value, so guard before indexing.
    education_mode = train['education'].mode()
    if not education_mode.empty:
        train['education'] = train['education'].fillna(education_mode.iloc[0])

    # Numerical column imputation using KNN.
    rating_missing = train['previous_year_rating'].isna()

    # Only fit/predict when there is something to impute: once the ratings
    # are filled (e.g. the cell is re-run) the prediction frame would be
    # empty, which scikit-learn rejects.
    if rating_missing.any():
        # Encode categorical columns and scale numerical columns.
        categorical_encoding = Pipeline(
            steps=[
                ('Encoding',OneHotEncoder(handle_unknown='ignore'))
            ])
        numerical_encoding = Pipeline(
            steps=[
                ('Encoding',StandardScaler())
            ])
        preprocess = ColumnTransformer(
            transformers=[
                ('category_encoding',categorical_encoding,cat_col),
                ('numerical_encoding',numerical_encoding,num_col)
            ])
        hmv_model = Pipeline(
            steps=[
                ('preprocess',preprocess),
                ('hmv_model',KNeighborsClassifier(n_neighbors=n_neighbors))
            ]
        )

        # Learn from the rows whose rating is known ...
        known_rows = train.loc[~rating_missing]
        hmv_model.fit(known_rows[feature_cols], known_rows['previous_year_rating'])

        # ... and predict the rows whose rating is missing. The prediction
        # order matches the row order selected by the boolean mask.
        pred = hmv_model.predict(train.loc[rating_missing, feature_cols])

        # Write the predictions back into the actual dataset.
        train.loc[rating_missing,'previous_year_rating'] = pred

    # Value counts after imputation
    _report()



col:Previous_year_rating previous_year_rating
3.0    18618
5.0    11741
4.0     9877
1.0     6223
2.0     4225
Name: count, dtype: int64
col:education education
Bachelor's          36669
Master's & above    14925
Below Secondary       805
Name: count, dtype: int64
*********************************************************
categorical columns ['department', 'region', 'education', 'gender', 'recruitment_channel']
numerical columns ['no_of_trainings', 'age', 'length_of_service', 'KPIs_met >80%', 'awards_won?', 'avg_training_score']
*********************************************************
col:Previous_year_rating previous_year_rating
3.0    20585
5.0    12349
4.0    10525
1.0     6888
2.0     4461
Name: count, dtype: int64
col:education education
Bachelor's          39078
Master's & above    14925
Below Secondary       805
Name: count, dtype: int64


In [5]:
# Preview the first five rows of `train`.
train.head(n=5)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0
