In [1]:
import numpy as np
import pandas as pd

### Import training and testing data

In [2]:
# Load the labelled training rows and the unlabelled test rows.
trainin = pd.read_csv("trainingset.csv")
testin = pd.read_csv("testset.csv")
# Keep an independent copy of the untouched test rows for the submission file;
# a plain `testinraw = testin` would only create a second name for the same object.
testinraw = testin.copy()
# Stack train on top of test so both receive identical preprocessing.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is its replacement
# (sort=False keeps the original column order, as `append` did).
comin = pd.concat([trainin, testin], sort=False)
# The label only exists for the training rows, which are the first len(trainin)
# rows of the stacked frame; slice by position instead of a hard-coded 20000.
ytrainraw = comin['job_performance'].iloc[:len(trainin)]
comin = comin.drop('job_performance', axis=1)
r, c = comin.shape
r, c

(40000, 379)

### Remove features that contain too many NaN values

In [3]:
# Drop every column in which fewer than half of the rows are populated.
# `thresh` is the minimum number of non-NaN values a column needs to be kept;
# it is derived from the row count rather than hard-coded to 20000.
comin = comin.dropna(axis=1, thresh=len(comin) // 2)
r, c = comin.shape
r, c

(40000, 235)

### Extract numerical features and replace their NaN values with the column mean

In [4]:
# Dtypes that are treated as numerical features.
numeric_kinds = [np.float32, np.int32, np.float64, np.int64]
# Pull out just the numerical columns of the combined frame.
numeric_part = comin.select_dtypes(include=numeric_kinds)
# Replace each missing value with the mean of its own column.
comnum = numeric_part.fillna(numeric_part.mean())
r, c = comnum.shape
r, c

(40000, 35)

### Extract categorical features, replace their NaN values with the column mode, and one-hot encode them

In [5]:
# Every column that is not one of the numerical dtypes is handled as categorical.
comstr = comin.select_dtypes(exclude=[np.float32, np.int32, np.float64, np.int64])
# Row 0 of `mode()` holds the most frequent value of each column.
comstrmode = comstr.mode()
if not comstrmode.empty:
    # Fill all columns in one call, matching by column name. The previous
    # per-column `fillna(..., inplace=True)` acted on a Series taken out of the
    # frame, which does not write back to `comstr` under pandas Copy-on-Write.
    comstr = comstr.fillna(comstrmode.iloc[0])
# One-hot encode the categorical columns.
comstr = pd.get_dummies(comstr)
r, c = comstr.shape
r, c

(40000, 2686)

### Combine processed numerical and categorical features

In [6]:
# Place the imputed numerical columns and the one-hot columns side by side.
processed_parts = [comnum, comstr]
comin = pd.concat(processed_parts, axis="columns")
r, c = comin.shape
r, c

(40000, 2721)

### Keep only the features whose absolute correlation with the label is above 0.1

In [7]:
# NOTE(review): `sns` is not referenced in any visible cell; the import is kept as-is.
import seaborn as sns
# Only the correlation of each feature with the label is needed, so correlate
# the training rows with the label directly instead of building the full
# feature-by-feature correlation matrix and reading a single column from it.
train_features = comin[0:len(ytrainraw)]
feature_corr = train_features.corrwith(ytrainraw).abs()
# Keep the label itself as the first entry (self-correlation of 1.0) so that
# `corrtoy` has the same layout as the 'job_performance' column of the full matrix.
label_corr = pd.Series([1.0], index=['job_performance'])
corrtoy = pd.concat([label_corr, feature_corr]).rename('job_performance')

In [8]:
# Features whose absolute correlation with the label exceeds 0.1.
ftokeep = corrtoy[corrtoy > 0.1]
# Remove the label by name rather than by position, so a real feature is never
# dropped if the label is not the first entry that passed the threshold.
ftokeep = [name for name in ftokeep.index if name != 'job_performance']
comin = comin.loc[:, ftokeep]
r, c = comin.shape
r, c

(40000, 241)

### Extract training, validation and testing data

In [9]:
from sklearn.model_selection import train_test_split
# The labelled rows are the first len(ytrainraw) rows of the combined frame;
# everything after them is the unlabelled test set.
n_train = len(ytrainraw)
testin = comin[n_train:]

# Re-attach the label to the training rows before splitting.
trainin = pd.concat([ytrainraw, comin[:n_train]], axis=1)
# Fix the seed so the split, and therefore the validation score, is reproducible.
trainin, validatein = train_test_split(trainin, test_size=0.2, random_state=10)

y_train = trainin[['job_performance']]
x_train = trainin.drop('job_performance', axis=1)

y_validate = validatein[['job_performance']]
x_validate = validatein.drop('job_performance', axis=1)

### Machine Learning task using Random Forest

In [10]:
from sklearn.ensemble import RandomForestRegressor
# Hyper-parameters of the forest: fixed seed and 61 trees.
forest_settings = {"n_estimators": 61, "random_state": 10}
modelforest = RandomForestRegressor(**forest_settings)
# The regressor expects a 1-D target, so flatten the single-column label frame.
modelforest.fit(x_train, y_train.to_numpy().ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=61, n_jobs=None,
           oob_score=False, random_state=10, verbose=0, warm_start=False)

In [11]:
from sklearn.metrics import mean_squared_error
# Score the fitted forest on the held-out validation rows.
y_predict = modelforest.predict(x_validate)
# scikit-learn's signature is mean_squared_error(y_true, y_pred); pass the
# ground truth first to follow it (the value is unchanged for plain MSE).
mse = mean_squared_error(y_validate, y_predict)
print("Random Forest Mean Squared Error: ", round(mse, 2))

Random Forest Mean Squared Error:  29592.19


### Output labels for testing data

In [12]:
# Predict the label for the processed test rows.
y_submit = modelforest.predict(testin)
# Attach the predictions to the raw test rows as the 'job_performance' column.
testinraw = testinraw.assign(job_performance=y_submit)
testinraw

Unnamed: 0,cntryid,cntryid_e,age_r,gender_r,computerexperience,nativespeaker,edlevel3,monthlyincpr,yearlyincpr,lng_home,...,v85,v50,v89,v127,v239,v224,v71,v105,row,uni
0,Ireland,Ireland,25.0,Female,Yes,Yes,High,50 to less than 75,50 to less than 75,999,...,Strongly agree,Agree,,,9999.0,9996.0,9999.0,9996.0,196612,UNI_Author
1,Canada,Canada (English),,Female,Yes,No,High,25 to less than 50,25 to less than 50,999,...,Agree,Agree,,,9999.0,9996.0,9999.0,9999.0,7579,UNI_Author
2,Germany,Germany,,Female,Yes,Yes,Medium,,,999,...,Strongly agree,Strongly agree,,,9996.0,9999.0,9996.0,9999.0,24646,UNI_Author
3,Finland,Finland,18.0,Female,Yes,Yes,Low,,,fin,...,Disagree,Neither agree nor disagree,,,9996.0,9999.0,9996.0,9999.0,77790,UNI_Author
4,United Kingdom,England (UK),61.0,Female,Yes,Yes,High,,,eng,...,Disagree,Agree,,,9996.0,9996.0,9996.0,9996.0,123782,UNI_Author
5,Spain,Spain,52.0,Female,Yes,Yes,Low,,,spa,...,Neither agree nor disagree,Agree,,,9999.0,9996.0,9999.0,9996.0,15334,UNI_Author
6,Germany,Germany,,Male,Yes,Yes,Low,,,999,...,Agree,Agree,,,9996.0,9996.0,9996.0,9996.0,3050,UNI_Author
7,Korea,Korea,41.0,Male,Yes,Yes,High,90 or more,90 or more,kor,...,Neither agree nor disagree,Agree,,,7322.0,9996.0,1812.0,9996.0,95729,UNI_Author
8,United States,United States,,Female,Yes,No,High,25 to less than 50,25 to less than 50,999,...,Disagree,Strongly agree,,,9999.0,9996.0,9999.0,9996.0,27113,UNI_Author
9,United States,United States,,Female,Yes,Yes,Low,,,999,...,Strongly agree,Strongly agree,,,9996.0,9996.0,9996.0,9996.0,146202,UNI_Author


In [13]:
# Write the submission file without the row index; `index` is a boolean
# parameter, so pass False explicitly rather than relying on None being falsy.
testinraw.to_csv('testset-UNI_Author-submission.csv', index=False)