# Final Project Data Processing - Intro to Machine Learning
CS-UY 4563 <br>
Spring 2021 <br>
Lujie Zhao, Sicong Liu <br>
April 28, 2021 <br>

This project predicts whether a job candidate will be willing to work for the
company after training or plans to move to another company. The GitHub repo can be found here: [https://github.com/LujieZ/cs4563-machine-learning](https://github.com/LujieZ/cs4563-machine-learning)

In [31]:
import numpy
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt
%matplotlib inline 

# Import Dataset

The dataset we are using is from the Kaggle website: [https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists](https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists). It includes 13 inputs (12 being valid) and one target (1 for accepting the offer and 0 for rejecting the offer). We will use aug_train.csv for training and aug_test.csv for testing.


In [32]:
# 1. Load the training set into a pandas DataFrame and show its column headers.
# The `enrollee_id` and `city` columns are removed right after loading.
df = pd.read_csv("aug_train.csv").drop(columns=["enrollee_id", "city"])
print(df.shape)

# Keep the remaining column names around and display them.
df_header = df.columns.values
print(df_header)

(19158, 12)
['city_development_index' 'gender' 'relevent_experience'
 'enrolled_university' 'education_level' 'major_discipline' 'experience'
 'company_size' 'company_type' 'last_new_job' 'training_hours' 'target']


In [33]:
# 2. Drop all rows that contain NaN values.
# `major_discipline` is the one exception: some enrollees have a high-school degree
# or lower, so a major apparently does not apply to them. Their missing value is
# filled with "Not Applied" instead of causing the row to be dropped.
df = df.fillna({"major_discipline": "Not Applied"})

# The `company_size` column contains the typo `10/49`; normalise it to `10-49`.
df["company_size"] = df["company_size"].replace({"10/49": "10-49"})

# Every row that still has a missing value is removed.
df = df.dropna()
print(df.shape)

(9615, 12)


In [34]:
# 3. Print the unique values of each non-numerical column.
# `enrollee_id` and `city` only identify enrollees, so they are not used as inputs.
# The columns listed below hold text rather than numbers, so they need to be
# classified (encoded as numbers) in the following steps.
categorical_columns = [
    "gender",
    "relevent_experience",
    "enrolled_university",
    "education_level",
    "major_discipline",
    "company_size",
    "company_type",
]
for column_name in categorical_columns:
    print(df[column_name].unique())

['Male' 'Female' 'Other']
['No relevent experience' 'Has relevent experience']
['no_enrollment' 'Part time course' 'Full time course']
['Graduate' 'Masters' 'High School' 'Phd' 'Primary School']
['STEM' 'Not Applied' 'Humanities' 'Business Degree' 'Other' 'No Major'
 'Arts']
['50-99' '<10' '5000-9999' '1000-4999' '10-49' '100-500' '10000+'
 '500-999']
['Pvt Ltd' 'Funded Startup' 'Early Stage Startup' 'Other' 'Public Sector'
 'NGO']


# Process Data
Replace each text value with a preset numeric code.

In [35]:
# 4. Encode the gender labels as integers:
#    Female -> 0, Male -> 1, Other -> 2
gender_codes = {"Female": 0, "Male": 1, "Other": 2}
df["gender"] = df["gender"].replace(gender_codes)
print(df["gender"].unique())

[1 0 2]


In [36]:
# 5. Encode the relevant-experience labels as integers:
#    "No relevent experience" -> 0, "Has relevent experience" -> 1
# (the spelling "relevent" is how the labels appear in the dataset)
relevent_experience_codes = {
    "No relevent experience": 0,
    "Has relevent experience": 1,
}
df["relevent_experience"] = df["relevent_experience"].replace(relevent_experience_codes)
print(df["relevent_experience"].unique())

[0 1]


In [37]:
# 6. Encode the current university enrollment as integers.
# The position of each label in the list is its code:
#    no_enrollment -> 0, Full time course -> 1, Part time course -> 2
enrollment_labels = ["no_enrollment", "Full time course", "Part time course"]
enrollment_codes = {label: code for code, label in enumerate(enrollment_labels)}
df["enrolled_university"] = df["enrolled_university"].replace(enrollment_codes)
print(df["enrolled_university"].unique())

[0 2 1]


In [38]:
# 7. Encode the education level as integers, counting up from 1 in list order:
#    Primary School -> 1, High School -> 2, Graduate (Bachelor) -> 3,
#    Masters -> 4, Phd -> 5
education_labels = ["Primary School", "High School", "Graduate", "Masters", "Phd"]
education_codes = {
    label: code for code, label in enumerate(education_labels, start=1)
}
df["education_level"] = df["education_level"].replace(education_codes)
print(df["education_level"].unique())

[3 4 2 5 1]


In [39]:
# 8. Encode the major discipline as integers.
# The position of each label in the list is its code:
#    Not Applied -> 0, Arts -> 1, Business Degree -> 2, Humanities -> 3,
#    No Major -> 4, STEM -> 5, Other -> 6
major_labels = [
    "Not Applied",
    "Arts",
    "Business Degree",
    "Humanities",
    "No Major",
    "STEM",
    "Other",
]
major_codes = {label: code for code, label in enumerate(major_labels)}
df["major_discipline"] = df["major_discipline"].replace(major_codes)
print(df["major_discipline"].unique())

[5 0 3 2 6 4 1]


In [40]:
# 9. Encode the company-size buckets as integers, counting up from 1 in list order:
#    <10 -> 1, 10-49 -> 2, 50-99 -> 3, 100-500 -> 4, 500-999 -> 5,
#    1000-4999 -> 6, 5000-9999 -> 7, 10000+ -> 8
company_size_labels = [
    "<10",
    "10-49",
    "50-99",
    "100-500",
    "500-999",
    "1000-4999",
    "5000-9999",
    "10000+",
]
company_size_codes = {
    label: code for code, label in enumerate(company_size_labels, start=1)
}
df["company_size"] = df["company_size"].replace(company_size_codes)
print(df["company_size"].unique())

[3 1 7 6 2 4 8 5]


In [41]:
# 10. Encode the company type as integers, counting up from 1 in list order:
#    Public Sector -> 1, Pvt Ltd (Private Limited) -> 2, Funded Startup -> 3,
#    Early Stage Startup -> 4, NGO -> 5, Other -> 6
company_type_labels = [
    "Public Sector",
    "Pvt Ltd",
    "Funded Startup",
    "Early Stage Startup",
    "NGO",
    "Other",
]
company_type_codes = {
    label: code for code, label in enumerate(company_type_labels, start=1)
}
df["company_type"] = df["company_type"].replace(company_type_codes)
print(df["company_type"].unique())

[2 3 4 6 1 5]


In [42]:
# Check the table after classification: print the frame to inspect the columns
# whose text values were replaced with numeric codes in the steps above.
print(df)

       city_development_index  gender  relevent_experience  \
1                       0.776       1                    0   
4                       0.767       1                    1   
6                       0.920       1                    1   
7                       0.762       1                    1   
8                       0.920       1                    1   
...                       ...     ...                  ...   
19149                   0.920       1                    1   
19150                   0.920       0                    1   
19152                   0.920       0                    1   
19155                   0.920       1                    1   
19156                   0.802       1                    1   

       enrolled_university  education_level  major_discipline experience  \
1                        0                3                 5         15   
4                        0                4                 5        >20   
6                        0 

# Deal with Approximation
The `experience` and `last_new_job` columns contain open-ended values: `>20` for more than 20 years of experience and `>4` for a last job change more than 4 years ago. Since their exact values are unknown, we approximate them with 21 years and 5 years, respectively. Likewise, `<1` (less than one year of experience) and `never` are replaced with 0.

In [43]:
# Show the distinct raw values of the two columns that contain open-ended entries.
for column_name in ("experience", "last_new_job"):
    print(df[column_name].unique())

['15' '>20' '5' '13' '7' '16' '11' '<1' '18' '19' '12' '10' '1' '4' '9'
 '2' '6' '14' '3' '8' '17' '20']
['>4' '4' '1' '3' '2' 'never']


In [44]:
# Replace the open-ended / non-numeric markers with numeric approximations:
#   experience:   ">20" -> 21 years, "<1"    -> 0 years
#   last_new_job: ">4"  -> 5 years,  "never" -> 0 years
# Both columns hold their numbers as text, so they are cast to integers once the
# markers are gone. Without the cast they would stay string-typed even though
# every value is now a number. The cast raises ValueError if an unexpected
# non-numeric value is still present.
df["experience"] = (
    df["experience"].replace({">20": "21", "<1": "0"}).astype(int)
)
df["last_new_job"] = (
    df["last_new_job"].replace({">4": "5", "never": "0"}).astype(int)
)
print(df)

       city_development_index  gender  relevent_experience  \
1                       0.776       1                    0   
4                       0.767       1                    1   
6                       0.920       1                    1   
7                       0.762       1                    1   
8                       0.920       1                    1   
...                       ...     ...                  ...   
19149                   0.920       1                    1   
19150                   0.920       0                    1   
19152                   0.920       0                    1   
19155                   0.920       1                    1   
19156                   0.802       1                    1   

       enrolled_university  education_level  major_discipline experience  \
1                        0                3                 5         15   
4                        0                4                 5         21   
6                        0 

In [45]:
# Save the processed frame to `data.csv`; the row index is not written out.
df.to_csv("data.csv", index=False)