#### IMPORT PYTHON & SKLEARN PACKAGES

In [1]:
# Import Python basic packages 
import math
import os

import boto
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

#### USER INPUT 

In [2]:
# Directory that holds the input and output CSV files. It can be overridden
# without editing the notebook by setting the CLIENTJOB_DATA_DIR environment
# variable; when the variable is not set the original location is used, so the
# four paths below keep their previous values.
data_dir = os.environ.get('CLIENTJOB_DATA_DIR', 'C:/Python/ClientJob').rstrip('/\\')

# Path for training dataset
train_csv_path = '{}/train.csv'.format(data_dir)
# Path for testing dataset
test_csv_path = '{}/test.csv'.format(data_dir)

# Path for pushing output as csv 
train_out_csv = '{}/train_predicted.csv'.format(data_dir)
test_out_csv = '{}/test_predicted.csv'.format(data_dir)

#### FEATURE MAPPING DICTIONARY 

##### Standardized mapping dictionaries used to encode the categorical variables before they are converted into dummy variables

In [3]:
# Lookup tables that translate the raw text labels of each survey column into
# short codes. They are consumed by mapDictValue(): dictionaries whose values
# are two-character strings ('00', '01', ...) feed columns that are later
# one-hot encoded, dictionaries whose values are integers (0/1) feed columns
# that are used directly as binary indicators.

# Country name -> code, applied to the 'cntryid' column.
dict_cntryid = {'Italy':'00', 'Japan':'01', 'Russian Federation':'02', 'Germany':'03', 'United States':'04','France':'05', 'Spain':'06', 'Korea':'07', 'Canada':'08',
                'Slovak Republic':'09', 'United Kingdom':'10', 'Netherlands':'11', 'Greece':'12', 'Turkey':'13', 'Poland':'14', 'Finland':'15', 'Slovenia':'16','Israel':'17',
                'Belgium':'18', 'New Zealand':'19', 'Lithuania':'20', 'Norway':'21', 'Chile':'22', 'Sweden':'23', 'Denmark':'24', 'Singapore':'25', 
                'Czech Republic':'26', 'Estonia':'27','Austria':'28', 'Ireland':'29', 'Cyprus':'30'}

# Region name -> code, applied to the 'ctryrgn' column. 'Oceania' is the value
# readCsvFile() assigns to New Zealand rows.
dict_ctryrgn = {'North America and Western Europe':'00', 'East Asia and the Pacific (richer countries)':'01', 'Central and Eastern Europe':'02', 
                'Latin America and the Caribbean':'03','Oceania':'04'}

# Education level -> code, applied to the 'edlevel3' column.
dict_edlevel3 = {'Low':'00', 'Medium':'01', 'High':'02'}

# Gender -> 0/1 indicator, applied to the 'gender_r' column.
dict_gender_r ={'Male':0,'Female':1}

# Generic No/Yes -> 0/1 indicator, shared by 'computerexperience',
# 'nativespeaker', 'v60', 'v90', 'v157' and 'v74'.
dict_yesno = {'No':0, 'Yes':1}

# Income percentile band -> code, applied to the 'yearlyincpr' column.
dict_incpr ={'Less than 10':'00', '10 to less than 25':'01', '25 to less than 50':'02', '50 to less than 75':'03', '75 to less than 90':'04', '90 or more':'05'}

# Test language vs. native language -> 0/1, applied to the 'nativelang' column.
dict_nativelang = {'Test language same as native language':0, 'Test language not same as native language':1}

# Occupation skill group -> code, applied to the 'iscoskil4' column.
dict_iscoskil4 = {'Semi-skilled blue-collar occupations':'00','Skilled occupations':'01', 'Semi-skilled white-collar occupations':'02',
                  'Elementary occupations':'03'}

# Five-year age band -> code, applied to the 'ageg5lfs' column.
dict_ageg5 = {'Aged 16-19':'00', 'Aged 20-24':'01', 'Aged 25-29':'02', 'Aged 30-34':'03', 'Aged 35-39':'04', 'Aged 40-44':'05', 'Aged 45-49':'06', 
              'Aged 50-54':'07', 'Aged 55-59':'08', 'Aged 60-65':'09'}

# Ten-year age band -> code, applied to the 'ageg10lfs' column.
dict_ageg10 = {'24 or less':'00', '25-34':'01', '35-44':'02', '45-54':'03', '55 plus':'04'}

# NEET status -> 0/1, applied to the 'neet' column. Note the orientation:
# 1 means employed / in education or training, 0 means NEET.
dict_neet = {'Employed or participated in education or training in last 12 months':1, 
             'Not currently employed and did not participate in education or training in last 12 months (NEET)':0}

# Quintile band -> code, shared by every '*_wle_ca' column.
dict_wle_ca = {'All zero response':'00', 'Lowest to 20%':'01','More than 20% to 40%':'02', 'More than 40% to 60%':'03', 'More than 60% to 80%':'04','More than 80%':'05'}

# Field-of-study label -> code, applied to the 'v31' column.
dict_v31 = {'Teacher training and education science':'00', 'Social sciences, business and law':'01', 'Services':'02','Science, mathematics and computing':'03', 
            'General programmes':'04', 'Engineering, manufacturing and construction':'05', 'Humanities, languages and arts':'06', 'Health and welfare':'07', 
            'Agriculture and veterinary':'08'}

# Earnings decile -> code, applied to the 'earnhrbonusdcl' column.
dict_earnbnsdcl = {'Lowest decile':'00', '2nd decile':'01', '3rd decile':'02', '4th decile':'03', '5th decile':'04', '6th decile':'05', '7th decile':'06', 
                   '8th decile':'07', '9th decile':'08', 'Highest decile':'09'}

# Age band -> code, applied to the 'v151' column.
dict_v151 = {'Aged 15 or younger':'00',  'Aged 16-19':'01',  'Aged 20-24':'02', 'Aged 25-29':'03',   'Aged 30-34':'04', 'Aged 35 or older':'05'}

# Education/work status -> code, applied to the 'edwork' column.
dict_edwork = {'In work only':'00', 'Not in education or work and has not participated in education or training in last 12 months (NEET)':'01', 'In education only':'02', 'Not in education or work but has participated in education or training in last 12 months':'03', 'In education and work':'04'}

# Frequency scale -> code, shared by the 'v246', 'v214' and 'v276' columns.
dict_learnsrvy = {'Never':'00','Less than once a month':'01','Less than once a week but at least once a month':'02', 'At least once a week but not every day':'03',
                  'Every day':'04'}

# Satisfaction scale -> code, applied to the 'v181' column.
dict_satfsrvy = {'Extremely dissatisfied':'00','Dissatisfied':'01','Neither satisfied nor dissatisfied':'02','Satisfied':'03','Extremely satisfied':'04'}


#### DATA TRANSFORMATION LIST 

In [4]:
# Columns kept from the raw CSV: the target ('job_performance') followed by
# every predictor used by the pipeline. The same list is applied to the test
# file, so that file must contain 'job_performance' as well.
base_data_col = ['job_performance','cntryid','ctryrgn','edwork','edlevel3','gender_r','computerexperience','nativespeaker','nativelang','yearlyincpr','iscoskil4','ageg5lfs',
                'ageg10lfs','neet','readytolearn_wle_ca','planning_wle_ca','readhome_wle_ca','readwork_wle_ca','writhome_wle_ca','writwork_wle_ca','taskdisc_wle_ca',
                'learnatwork_wle_ca','ictwork_wle_ca','icthome_wle_ca','influence_wle_ca','v31','earnhrbonusdcl',
                 'v151','v235','v246','v214','v276','v181','v60','v90','v157','v74']

# Columns whose missing values are filled with the most frequent value
# (passed to imputeCatVar).
# NOTE(review): 'v235' is listed here AND in num_imp_lst. The pipeline cells
# call imputeCatVar before imputerNumMedian, so 'v235' is already mode-filled
# when the median step runs and the median is never actually used. Confirm
# which imputation is intended and drop 'v235' from one of the two lists.
cat_var_lst = ['cntryid','ctryrgn','edwork','edlevel3','gender_r','computerexperience','nativespeaker','nativelang','yearlyincpr','iscoskil4','ageg5lfs','ageg10lfs','neet',
            'readytolearn_wle_ca','planning_wle_ca','readhome_wle_ca','readwork_wle_ca','writhome_wle_ca','writwork_wle_ca', 'taskdisc_wle_ca', 'learnatwork_wle_ca',
               'ictwork_wle_ca', 'icthome_wle_ca','influence_wle_ca','v31','earnhrbonusdcl','v151','v235','v246','v214','v276','v181','v60','v90','v157','v74']

# Numerical columns: median-imputed by imputerNumMedian and rescaled by
# numScaling.
num_imp_lst = ['v235']

# Columns expanded into one indicator column per level by dummyCatFeature.
dummy_var_lst = ['cntryid', 'ctryrgn', 'edwork','edlevel3','yearlyincpr','iscoskil4','ageg10lfs','ageg5lfs', 'readytolearn_wle_ca','planning_wle_ca',
                 'readhome_wle_ca',
                'readwork_wle_ca','writhome_wle_ca','writwork_wle_ca','taskdisc_wle_ca', 'learnatwork_wle_ca','ictwork_wle_ca','icthome_wle_ca','influence_wle_ca',
                 'earnhrbonusdcl', 'v31','v151','v246','v214','v276','v181']

#### DATA TRANSFORMATION FUNCTION & MODEL SCORES

In [5]:
# Function to read a CSV file into a DataFrame
def readCsvFile(csv_file_path):
  """Load a CSV file and re-label the region of New Zealand rows.

  Parameters:
    csv_file_path: path of the CSV file to read. The file must provide the
      'cntryid' and 'ctryrgn' columns.

  Returns:
    The loaded DataFrame. Its shape is printed as a side effect.
  """
  frame = pd.read_csv(csv_file_path, low_memory=False)
  # Rows whose country is New Zealand get the region 'Oceania'; every other row
  # keeps the region it was read with.
  is_new_zealand = frame['cntryid'] == 'New Zealand'
  frame['ctryrgn'] = np.where(is_new_zealand, 'Oceania', frame['ctryrgn'])
  print(frame.shape)
  return frame

# Function to impute categorical variables: fill missing values with the most frequent value
def imputeCatVar(impute_df, cat_var_lst):
  """Fill the missing values of each listed column with that column's mode.

  Parameters:
    impute_df: DataFrame to impute. It is modified in place.
    cat_var_lst: names of the columns to impute.

  Returns:
    The same DataFrame object that was passed in, with the listed columns
    filled. A column that holds no observed value at all is left untouched,
    because it has no most-frequent value to fill with.
  """
  for col in cat_var_lst:
    counts = impute_df[col].value_counts()
    if counts.empty:
      # value_counts() ignores missing values, so an empty result means the
      # whole column is missing; idxmax() would raise on it.
      continue
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # impute_df[col]: the in-place call operates on a temporary Series and does
    # not update the frame when pandas Copy-on-Write is active.
    impute_df[col] = impute_df[col].fillna(counts.idxmax())
  return impute_df

# Function to impute numerical variables: fill missing values with the column median
def imputerNumMedian(df, num_imp_lst):
  """Fill the missing values of each listed column with its median, as integers.

  Parameters:
    df: DataFrame to impute. It is modified in place.
    num_imp_lst: names of the numerical columns to impute.

  Returns:
    The same DataFrame object that was passed in. Every listed column is cast
    to int after filling, which truncates any fractional part (including a
    fractional median).
  """
  for col in num_imp_lst:
    filled = df[col].fillna(df[col].median())
    df[col] = filled.astype(int)
  return df

# Function to rescale numerical variables to the [0, 1] range using sklearn MinMaxScaler
def numScaling(df, num_imp_lst, scaler=None):
  """Return a copy of `df` whose listed columns are min-max scaled.

  Parameters:
    df: input DataFrame. It is not modified.
    num_imp_lst: names of the numerical columns to scale.
    scaler: optional, already fitted object exposing `transform`. Pass the
      scaler fitted on the training data when scaling validation or test data
      so that both use the same minimum and maximum. When omitted, a new
      MinMaxScaler(feature_range=(0, 1)) is fitted on `df` itself, which is the
      original behaviour.

  Returns:
    A new DataFrame with the unscaled columns first, followed by the scaled
    columns in the order given by `num_imp_lst`.
  """
  data_df = df.copy()
  if len(num_imp_lst) == 0:
    # Nothing to scale; MinMaxScaler cannot be fitted on zero columns.
    return data_df
  scale_df = data_df[num_imp_lst]
  # Drop by column name. The previous code passed a DataFrame as the labels,
  # which only worked because iterating a DataFrame yields its column names.
  data_df = data_df.drop(columns=num_imp_lst)
  if scaler is None:
    scaler = MinMaxScaler(feature_range=(0,1), copy=True)
    scaler.fit(scale_df)
  scaled_df = pd.DataFrame(scaler.transform(scale_df), index=scale_df.index, columns=scale_df.columns)
  return pd.concat([data_df, scaled_df], axis=1)

# Function to create dummy variables for all categorical variables 
def dummyCatFeature(df, dummy_var_lst):
  """Append one indicator column per level for each listed column.

  Parameters:
    df: input DataFrame. It is not modified.
    dummy_var_lst: names of the columns to expand with pd.get_dummies.

  Returns:
    A new DataFrame holding every original column (the source columns are
    kept) followed by the indicator columns, named '<column>_<level>', in the
    order given by `dummy_var_lst`.
  """
  # Build every indicator block first and concatenate once; concatenating
  # inside the loop re-copies the growing frame on each iteration.
  dummy_frames = [pd.get_dummies(df[col], prefix=col) for col in dummy_var_lst]
  return pd.concat([df] + dummy_frames, axis=1)

# Function to map raw labels to codes using the dictionary definitions
def mapDictValue(map_df):
  """Replace the text labels of the survey columns with their dictionary codes.

  Parameters:
    map_df: DataFrame holding the raw columns. It is modified in place.

  Returns:
    The same DataFrame object that was passed in.

  Notes:
    A label that is missing from its dictionary becomes NaN after `.map`. For
    the string-encoded columns that NaN is then turned into the literal text
    'nan' by `.astype(str)`; for the integer-encoded columns `.astype(int)`
    raises instead.
  """
  # Columns encoded as strings, paired with the dictionary that encodes them.
  string_coded = [
    ('cntryid', dict_cntryid),
    ('ctryrgn', dict_ctryrgn),
    ('iscoskil4', dict_iscoskil4),
    ('edwork', dict_edwork),
    ('yearlyincpr', dict_incpr),
    ('nativelang', dict_nativelang),
    ('ageg10lfs', dict_ageg10),
    ('ageg5lfs', dict_ageg5),
    ('readytolearn_wle_ca', dict_wle_ca),
    ('planning_wle_ca', dict_wle_ca),
    ('readhome_wle_ca', dict_wle_ca),
    ('readwork_wle_ca', dict_wle_ca),
    ('writhome_wle_ca', dict_wle_ca),
    ('writwork_wle_ca', dict_wle_ca),
    ('taskdisc_wle_ca', dict_wle_ca),
    ('learnatwork_wle_ca', dict_wle_ca),
    ('ictwork_wle_ca', dict_wle_ca),
    ('icthome_wle_ca', dict_wle_ca),
    ('influence_wle_ca', dict_wle_ca),
    ('earnhrbonusdcl', dict_earnbnsdcl),
    ('v31', dict_v31),
    ('v151', dict_v151),
    ('v246', dict_learnsrvy),
    ('v214', dict_learnsrvy),
    ('v276', dict_learnsrvy),
    ('v181', dict_satfsrvy),
    ('edlevel3', dict_edlevel3),
  ]
  # Columns encoded as 0/1 integers, paired with their dictionary.
  integer_coded = [
    ('gender_r', dict_gender_r),
    ('computerexperience', dict_yesno),
    ('nativespeaker', dict_yesno),
    ('neet', dict_neet),
    ('v60', dict_yesno),
    ('v90', dict_yesno),
    ('v157', dict_yesno),
    ('v74', dict_yesno),
  ]

  for column, codes in string_coded:
    map_df[column] = map_df[column].map(codes).astype(str)

  for column, codes in integer_coded:
    map_df[column] = map_df[column].map(codes).astype(int)

  return map_df
  
# Function to print the scores, their mean and their standard deviation
def display_scores(scores):   
  """Print a sequence of scores together with summary statistics.

  Parameters:
    scores: sequence of numeric scores.

  The scores are converted to floats and rounded to three decimals first; the
  mean and the standard deviation are computed on those rounded values.
  Nothing is returned.
  """
  rounded = np.round(np.asarray(scores).astype(float), decimals=3)
  average = rounded.mean()
  spread = rounded.std()
  print("Scores: {}".format(rounded))
  print("Mean: {:.3f}".format(average))
  print("Standard deviation: {:.4f}".format(spread))

#### FEATURES FOR CHAMPION MODEL  
Feature selection: the variables below were chosen by combining several feature selection procedures, such as SelectKBest and Recursive Feature Elimination.

'edlevel3', 'ageg5lfs', 'cntryid', 'computerexperience', 'ctryrgn', 'earnhrbonusdcl', 'edwork', 'gender_r', 'icthome_wle_ca', 'ictwork_wle_ca', 'influence_wle_ca', 'iscoskil4', 'learnatwork_wle_ca', 'nativespeaker', 'neet', 'planning_wle_ca', 'readhome_wle_ca', 'readwork_wle_ca', 'readytolearn_wle_ca', 'taskdisc_wle_ca', 'v151', 'v157', 'v181', 'v214', 'v235', 'v246',
'v276', 'v74', 'v90', 'writhome_wle_ca', 'writwork_wle_ca', 'yearlyincpr'

In [8]:
# Model input columns for the random forest, expressed as the column names that
# exist after dummyCatFeature has run ('<column>_<code>').
# - No '<column>_00' entry is listed, i.e. the first level of every encoded
#   column is left out.
# - 'cntryid_16', 'cntryid_20' and 'cntryid_30' are also absent from the list.
# - 'edlevel3' is listed under its raw name, so the model receives the coded
#   column itself ('00'/'01'/'02') rather than its indicator columns.
#   NOTE(review): confirm this is intended, since 'edlevel3' is also in
#   dummy_var_lst.
# - 'computerexperience', 'gender_r', 'nativespeaker', 'neet', 'v157', 'v60',
#   'v74' and 'v90' are the 0/1 columns produced by mapDictValue; 'v235' is the
#   scaled numerical column.
rmfrst_champ_feature_var = ['ageg5lfs_01','ageg5lfs_02','ageg5lfs_03','ageg5lfs_04','ageg5lfs_05','ageg5lfs_06','ageg5lfs_07','ageg5lfs_08','ageg5lfs_09',
                    'cntryid_01','cntryid_02','cntryid_03','cntryid_04','cntryid_05','cntryid_06','cntryid_07','cntryid_08','cntryid_09','cntryid_10','cntryid_11',
                    'cntryid_12','cntryid_13','cntryid_14','cntryid_15','cntryid_17','cntryid_18','cntryid_19','cntryid_21','cntryid_22','cntryid_23','cntryid_24',
                    'cntryid_25','cntryid_26','cntryid_27','cntryid_28','cntryid_29',
                    'computerexperience',
                    'ctryrgn_01','ctryrgn_02','ctryrgn_03','ctryrgn_04',
                    'earnhrbonusdcl_01','earnhrbonusdcl_02','earnhrbonusdcl_03','earnhrbonusdcl_04','earnhrbonusdcl_05','earnhrbonusdcl_06','earnhrbonusdcl_07',
                    'earnhrbonusdcl_08','earnhrbonusdcl_09',
                    'edlevel3','edwork_01','edwork_02','edwork_03','edwork_04',
                    'gender_r',
                    'icthome_wle_ca_01','icthome_wle_ca_02','icthome_wle_ca_03','icthome_wle_ca_04','icthome_wle_ca_05',
                    'ictwork_wle_ca_01','ictwork_wle_ca_02','ictwork_wle_ca_03','ictwork_wle_ca_04','ictwork_wle_ca_05',
                    'influence_wle_ca_01','influence_wle_ca_02','influence_wle_ca_03','influence_wle_ca_04','influence_wle_ca_05',
                    'iscoskil4_01','iscoskil4_02','iscoskil4_03',
                    'learnatwork_wle_ca_01','learnatwork_wle_ca_02','learnatwork_wle_ca_03','learnatwork_wle_ca_04','learnatwork_wle_ca_05',
                    'nativespeaker',
                    'neet',
                    'planning_wle_ca_01','planning_wle_ca_02','planning_wle_ca_03','planning_wle_ca_04','planning_wle_ca_05',
                    'readhome_wle_ca_01','readhome_wle_ca_02','readhome_wle_ca_03','readhome_wle_ca_04','readhome_wle_ca_05',
                    'readwork_wle_ca_01','readwork_wle_ca_02','readwork_wle_ca_03','readwork_wle_ca_04','readwork_wle_ca_05',
                    'readytolearn_wle_ca_01','readytolearn_wle_ca_02','readytolearn_wle_ca_03','readytolearn_wle_ca_04','readytolearn_wle_ca_05',
                    'taskdisc_wle_ca_01','taskdisc_wle_ca_02','taskdisc_wle_ca_03','taskdisc_wle_ca_04','taskdisc_wle_ca_05',
                    'v151_01','v151_02','v151_03','v151_04','v151_05',
                    'v157',
                    'v181_01','v181_02','v181_03','v181_04',
                    'v214_01','v214_02','v214_03','v214_04',
                    'v235',
                    'v246_01','v246_02','v246_03','v246_04',
                    'v276_01','v276_02','v276_03','v276_04',
                    'v60',
                    'v74',
                    'v90',
                    'writhome_wle_ca_01','writhome_wle_ca_02','writhome_wle_ca_03','writhome_wle_ca_04','writhome_wle_ca_05',
                    'writwork_wle_ca_01','writwork_wle_ca_02','writwork_wle_ca_03','writwork_wle_ca_04','writwork_wle_ca_05',
                    'yearlyincpr_01','yearlyincpr_02','yearlyincpr_03','yearlyincpr_04','yearlyincpr_05']

#### PREPARE TRAIN & TEST DATA FOR MODEL

In [12]:
# Read the training CSV file
df_train = readCsvFile(train_csv_path)
# Drop columns whose missing-value rate is 70% or higher
df_thresh = df_train[df_train.columns[df_train.isnull().mean() < 0.7]].copy()
# Keep the required columns (raises KeyError if the threshold above removed one of them)
df_filterd = df_thresh[base_data_col].copy()
# Impute categorical variables with the mode
df_cat_imp = imputeCatVar(df_filterd, cat_var_lst)
# Impute numerical variables with the median value
df_train_imp = imputerNumMedian(df_cat_imp, num_imp_lst)
# Scale numerical variables
df_train_scl = numScaling(df_train_imp, num_imp_lst)
# Map categorical and binary variables to their codes
df_train_map = mapDictValue(df_train_scl)
# Create dummy indicators for the categorical variables
df_train_dmy = dummyCatFeature(df_train_map, dummy_var_lst)
# Cleaned data for modelling
cln_train_df = df_train_dmy.copy()

# Scaling response variables as scores will be capped at 5000  
#cln_train_df['job_performance'] = cln_train_df.apply(lambda row: math.log(((row.job_performance/5000)/(1-(row.job_performance/5000)))), axis = 1) 

# Stratified shuffle split on country and gender: 80% for fitting, 20% held out.
# StratifiedShuffleSplit is imported in the import cell at the top of the notebook.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so exactly one (train, test) pair is produced. split() yields
# positional row numbers, so the rows are selected with .iloc; .loc would pick
# rows by index label and only matches when the index is exactly 0..n-1.
train_index, test_index = next(split.split(cln_train_df, cln_train_df[['cntryid','gender_r']]))
strat_train_set = cln_train_df.iloc[train_index]
strat_test_set = cln_train_df.iloc[test_index]

# Full train dataframe (used for cross validation)
X_full_train_df = cln_train_df[rmfrst_champ_feature_var]
y_full_train_df = cln_train_df['job_performance'].astype(int)

# Convert full train dataframe to arrays
X_full_train = X_full_train_df.values
y_full_train = y_full_train_df.values.ravel()

# Stratified shuffled train dataframe
X_train_df = strat_train_set[rmfrst_champ_feature_var]
y_train_df = strat_train_set['job_performance'].astype(int)

# Stratified shuffled hold-out dataframe
X_test_df = strat_test_set[rmfrst_champ_feature_var]
y_test_df = strat_test_set['job_performance'].astype(int)

# Convert shuffled train dataframe to arrays
X_train = X_train_df.values
y_train = y_train_df.values.ravel()

# Convert shuffled hold-out dataframe to arrays
X_test = X_test_df.values
y_test = y_test_df.values.ravel()

(20000, 380)


#### CHAMPION MODEL: RANDOMFOREST

~ Choosing the champion model: LinearRegression, SGDRegressor, RandomForestRegressor, SVR and XGBoost were trained and compared. RandomForestRegressor gave the lowest MSE of the models trained.

~ Cross Validation: Tested model stability using cross_val_score

~ Hyperparameter tuning: used grid search to tune the hyperparameters

In [26]:
# *****************************************************************************************
# Random Forest Regressor 
# *****************************************************************************************
# Only the arguments that differ from the library defaults are passed.
# The previous call also spelled out criterion='mse' and min_impurity_split=None;
# both were defaults, and both are rejected by newer scikit-learn releases
# (min_impurity_split was removed, 'mse' was renamed 'squared_error'), so the
# cell could not run there. The squared-error criterion is still used because
# it is the default. n_estimators is kept explicit because its default changed
# between releases.
Rmforest = RandomForestRegressor(bootstrap=False,   # each tree sees the full training set
                      max_features=20,               # features considered at each split
                      n_estimators=10,               # number of trees
                      random_state=42)               # reproducible forest

print('-----------------------------------------------------------------------------')
print('Fitting Random Forest model on X_train and testing it on X_test dataset:')
print('-----------------------------------------------------------------------------')

# Train the model on the stratified training split
Rmforest.fit(X_train, y_train)

# Evaluate on the stratified hold-out split
y_pred = Rmforest.predict(X_test)
print('Mean Squared Error on Test data: {:.3f}'.format(metrics.mean_squared_error(y_test, y_pred)))
print('')
print('--------------------------------------------------')
print('Cross validation scores on X_full_data')    
print('--------------------------------------------------')
# cross_val_score fits clones of the estimator, so Rmforest itself stays fitted
# on X_train only. The scorer returns negated MSE, hence the negative values.
scores = cross_val_score(Rmforest, X_full_train, y_full_train, scoring="neg_mean_squared_error", cv=5)
print("Cross validation Scores: {}".format(scores))
print("Cross validation Mean: {:.3f}".format(scores.mean()))
print("Cross validation Standard deviation: {:.4f}".format(scores.std()))


-----------------------------------------------------------------------------
Fitting Random Forest model on X_train and testing it on X_test dataset:
-----------------------------------------------------------------------------
Mean Squared Error on Test data: 28915.700

--------------------------------------------------
Cross validation scores on X_full_data
--------------------------------------------------
Cross validation Scores: [-32990.83883405 -32815.93270388 -34025.74292395 -30483.91492652
 -34495.33444169]
Cross validation Mean: -32962.353
Cross validation Standard deviation: 1388.9374


#### RUNNING MODEL ON TEST DATASET AND GENERATING CSV FILE 

In [29]:
# Read the test CSV file
df_test = readCsvFile(test_csv_path)
# Keep the required columns (base_data_col includes 'job_performance', so the
# test file must carry that column as well)
df_filterd = df_test[base_data_col].copy()
# NOTE(review): the imputation values and the min-max scaling below are
# re-estimated from the test file itself, so 'v235' is not on the same scale
# the model was trained on. Consider reusing the statistics of the training data.
# Impute categorical variables with the mode
df_cat_imp = imputeCatVar(df_filterd, cat_var_lst)
# Impute numerical variables with the median value
df_test_imp = imputerNumMedian(df_cat_imp, num_imp_lst)
# Scale numerical variables
df_test_scl = numScaling(df_test_imp, num_imp_lst)
# Map categorical and binary variables to their codes
df_test_map = mapDictValue(df_test_scl)
# Create dummy indicators for the categorical variables
df_test_dmy = dummyCatFeature(df_test_map, dummy_var_lst)
# Cleaned data for modelling
cln_test_df = df_test_dmy.copy()

# Test dataframe with exactly the model's feature columns, in training order.
# reindex() is used instead of plain selection so that an indicator column for
# a category level that never occurs in the test file is created and filled
# with 0 rather than raising KeyError; when every column exists the result is
# the same as plain selection.
X_test_df = cln_test_df.reindex(columns=rmfrst_champ_feature_var, fill_value=0)

# Convert test dataframe to array
X_test_values = X_test_df.values
print('count test data :{}'.format(len(X_test_df)))

# Predict test data
modle_prediction = Rmforest.predict(X_test_values)
yhat = modle_prediction

print('count test data :{}'.format(len(yhat)))
# Give the predictions the index of df_test so the column-wise concat below
# lines up row by row whatever that index is. astype(int) truncates the
# predictions to whole numbers.
y_pred = pd.DataFrame({ 'Predicted_JobPerformance': yhat}, index=df_test.index).astype(int)
finalfor = pd.concat([df_test, y_pred], axis=1)
print('shape of final data :{}'.format(finalfor.shape))

# Export test csv file (the row index is written as the first column)
finalfor.to_csv(test_out_csv)

(20000, 380)
count test data :20000
count test data :20000
shape of final data :(20000, 381)


#### CROSS CHECK JOB PERFORMANCE

In [36]:
# Read the training CSV file again.
# Note: this cell reassigns df_train, cln_train_df and X_train_df that were
# created by the data-preparation cell.
df_train = readCsvFile(train_csv_path)
# Keep the required columns
df_filterd = df_train[base_data_col].copy()
# Impute categorical variables with the mode
df_cat_imp = imputeCatVar(df_filterd, cat_var_lst)
# Impute numerical variables with the median value
df_train_imp = imputerNumMedian(df_cat_imp, num_imp_lst)
# Scale numerical variables
df_train_scl = numScaling(df_train_imp, num_imp_lst)
# Map categorical and binary variables to their codes
df_train_map = mapDictValue(df_train_scl)
# Create dummy indicators for the categorical variables
df_train_dmy = dummyCatFeature(df_train_map, dummy_var_lst)
# Cleaned data for modelling
cln_train_df = df_train_dmy.copy()

# Train dataframe
X_train_df = cln_train_df[rmfrst_champ_feature_var]

# Convert train dataframe to array
X_train_values = X_train_df.values
# These rows are training rows; the label previously said "test data".
print('count train data :{}'.format(len(X_train_values)))

# Predict on the training data.
# NOTE(review): Rmforest was fitted on the stratified 80% split of these same
# rows, so most of the comparison below is in-sample and is expected to match
# almost exactly; it is a sanity check, not a measure of accuracy.
modle_prediction = Rmforest.predict(X_train_values)
yhat =modle_prediction

print('count train data :{}'.format(len(yhat)))

# Give the predictions the index of df_train so the column-wise concat lines up
# row by row whatever that index is.
y_pred = pd.DataFrame({ 'Predict_JobPerformance': yhat}, index=df_train.index).astype(int)
finaltrain = pd.concat([df_train, y_pred], axis=1).copy()
print('shape of final data :{}'.format(finaltrain.shape))

finaltrain[['job_performance','Predict_JobPerformance']].astype(int)

(20000, 380)
count test data :20000
count test data :20000
shape of final data :(20000, 381)


Unnamed: 0,job_performance,Predict_JobPerformance
0,3164,3164
1,2673,2671
2,2701,2703
3,2289,2285
4,2349,2349
5,3233,2836
6,1550,1545
7,3421,3421
8,3628,3628
9,2677,2678
