In [1]:
#Imports
import numpy as np
import pandas as pd

#Load the train and test splits into dataframes (file names kept in one place)
DATA_FILES = {"train": "aug_train.csv", "test": "aug_test.csv"}
train = pd.read_csv(DATA_FILES["train"])
test = pd.read_csv(DATA_FILES["test"])


In [2]:
#Task1 Data clean, imputation

def encode_ordinal_text(frame, column, replacements):
    """Convert a text-coded ordinal column of `frame` to a numeric column, in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to modify; `frame[column]` is overwritten with the numeric result.
    column : str
        Name of the column to convert.
    replacements : dict
        Maps each non-numeric label (e.g. ">20") to the digit string that
        stands in for it (e.g. "21").

    Raises
    ------
    ValueError
        Raised by `pd.to_numeric` if a value still cannot be parsed as a
        number after the replacements were applied.
    """
    # Labels are swapped for digit *strings* so the column stays text-only until
    # pd.to_numeric converts it in one step. Writing ints into a text column via
    # .loc is something stricter pandas versions warn about or reject.
    frame[column] = pd.to_numeric(frame[column].replace(replacements))


#1. in experience, replace >20 to 21; <1 to 1, and convert this as a numerical column
EXPERIENCE_CODES = {">20": "21", "<1": "1"}

#2. in last_new_job, replace >4 to 5; never to 0, and convert this as a numerical column
LAST_NEW_JOB_CODES = {">4": "5", "never": "0"}

# Apply the identical cleaning to both splits so they cannot drift apart.
for frame in (train, test):
    encode_ordinal_text(frame, "experience", EXPERIENCE_CODES)
    encode_ordinal_text(frame, "last_new_job", LAST_NEW_JOB_CODES)

In [3]:
#3. If the column is categorical, impute the missing value as its mode.
#If the column is numerical, impute the missing value as its median

def compute_fill_values(frame):
    """Return the per-column value used to fill missing entries.

    Numeric columns map to their median, all other columns to their (first)
    mode. Columns without a single observed value are left out of the result,
    because no statistic can be derived for them.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose observed values define the statistics.

    Returns
    -------
    dict
        `{column name: fill value}`.
    """
    fill_values = {}
    for column in frame.columns:
        observed = frame[column].dropna()
        if observed.empty:
            # mode()[0] would raise on an all-missing column; skip it instead.
            continue
        # is_numeric_dtype (rather than `dtype == "object"`) keeps string and
        # category dtypes on the mode branch instead of calling median on text.
        if pd.api.types.is_numeric_dtype(observed):
            fill_values[column] = observed.median()
        else:
            fill_values[column] = observed.mode()[0]
    return fill_values


def impute_missing(frame, fill_values):
    """Fill missing entries of `frame` in place, column by column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to modify.
    fill_values : dict
        `{column name: fill value}`; names that are not columns of `frame`
        are ignored.
    """
    for column, value in fill_values.items():
        if column in frame.columns:
            frame[column] = frame[column].fillna(value)


# The statistics are learned on the training split only and then reused for the
# test split, so the test data does not influence its own preprocessing.
# Test-derived values are only a fallback for columns the training split
# provides no statistic for.
# NOTE(review): every column is imputed, including `target` if it has gaps.
train_fill_values = compute_fill_values(train)
impute_missing(train, train_fill_values)
impute_missing(test, {**compute_fill_values(test), **train_fill_values})


In [4]:
#Further preparations for classification:
#split each frame into the feature matrix X and the (single-column) label frame y
features = [
    "city_development_index",
    "gender",
    "relevent_experience",
    "enrolled_university",
    "education_level",
    "major_discipline",
    "experience",
    "company_type",
    "last_new_job",
    "training_hours",
]
X_train, y_train = train[features], train[["target"]]
X_test, y_test = test[features], test[["target"]]

In [5]:
#Further preparations for classification:
#Transform categorical data with one Hot encoding

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Text columns that get one-hot encoded; every other feature is passed through unchanged.
categorical_features = [
    'gender',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'company_type',
]

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category that appears only in the test set is
        # encoded as all zeros instead of making transform() raise.
        ('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough'
)

# Encode straight from the source frames rather than from X_train/X_test. The cell
# then no longer consumes its own output, so it can be re-run without failing on
# an already-encoded matrix that has lost its column names.
# The encoder is fitted on the training features only and reused for the test features.
X_train = preprocessor.fit_transform(train[features])
X_test = preprocessor.transform(test[features])


In [6]:
#Task2 Classification
#1. Build a classification model from the training set (you can use any algorithm)
from sklearn.tree import DecisionTreeClassifier

# Fixed seed: the tree permutes features at every split, so without it the fitted
# tree (and every metric reported below) can differ between runs.
SEED = 42

#Use decision tree classifier, adding class_weight=balanced to hopefully account for imbalanced data set
clf = DecisionTreeClassifier(class_weight='balanced', random_state=SEED)
#fit model
clf.fit(X_train, y_train)
#make prediction (on training) -- used below to measure how well the tree fits the data it has seen
y_train_pred = clf.predict(X_train)




In [7]:
#2. generate the confusion matrix for the training set and calculate the accuracy, precision, recall, and F1-score

from sklearn.metrics import confusion_matrix,accuracy_score, precision_score, recall_score, f1_score

# Confusion matrix of true vs. predicted labels on the training set
cm_train = confusion_matrix(y_train, y_train_pred)

# The four scalar scores share the same (true, predicted) arguments,
# so they are evaluated in one pass over the metric functions.
accuracy_train, precision_train, recall_train, f1_train = (
    metric(y_train, y_train_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)




In [8]:
#3. Apply the fitted model to the test set and generate the predictions
# (the predictions are compared against y_test in the next step)
y_test_pred = clf.predict(X_test)


In [9]:
#4. generate the confusion matrix from the test set and calculate the accuracy, precision, recall, and F1-score

# Confusion matrix of true vs. predicted labels on the test set
cm_test = confusion_matrix(y_test, y_test_pred)

# Same four scores as for the training set, evaluated on the test predictions.
accuracy_test, precision_test, recall_test, f1_test = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)



In [10]:
#5. compare the results between the training and test set
print("TRAINING SET | TEST SET")
print("Confusion matrix")
# Training matrix first, then test matrix
for matrix in (cm_train, cm_test):
    print(matrix)

# One row per metric: label, training value, test value
score_rows = (
    ("Accuracy:", accuracy_train, accuracy_test),
    ("Precision:", precision_train, precision_test),
    ("Recall:", recall_train, recall_test),
    ("F1 Score:", f1_train, f1_test),
)
for label, train_score, test_score in score_rows:
    print(label, train_score, "|", test_score)

#All scores are much higher on the training set than on the test set. This is expected: the decision tree was fitted on the training data without any depth limit, so it reproduces that data almost perfectly but generalizes poorly to unseen data. This gap between training and test performance is called overfitting.

TRAINING SET | TEST SET
Confusion matrix
[[1563    2]
 [   0  535]]
[[60 18]
 [12 10]]
Accuracy: 0.9990476190476191 | 0.7
Precision: 0.9962756052141527 | 0.35714285714285715
Recall: 1.0 | 0.45454545454545453
F1 Score: 0.9981343283582089 | 0.4
