# Importing Libraries

In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder

# for the models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# for evaluation metrics of the models
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, precision_score, recall_score, f1_score
import modelEnsemble as me

# Reading Dataset

### Reading Train Dataset

In [2]:
# Load the training split: one row per sample, a string `category` label
# plus the numeric embedding columns.
train_data = pd.read_csv("embedding_difficult_train.csv")

# Target stays a single-column DataFrame; every other column is a feature.
y_train = train_data.loc[:, ["category"]]
X_train = train_data.drop(columns=["category"])

train_data.head()

Unnamed: 0,category,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,software+engineer,-0.067028,-0.037456,0.04071,-0.022643,-0.065919,-0.011505,0.014374,0.070613,-0.072635,...,0.073042,-0.004869,-0.01273,-0.012608,0.025585,0.136745,0.038581,-0.099147,0.001722,0.032735
1,software+engineer,-0.049825,-0.026897,-0.014653,-0.04533,-0.002588,-0.085351,-0.020295,-0.026342,-0.046522,...,0.153783,0.06003,0.053321,-0.115148,0.043106,0.170601,-0.04041,-0.03929,0.035469,0.022973
2,software+engineer,-0.109452,-0.018895,-0.011882,-0.049141,-0.065425,-0.020879,0.020536,0.049572,-0.068171,...,0.067979,0.06065,0.008373,-0.073202,0.03943,0.084493,-0.058907,0.041129,-0.041035,-0.106831
3,ui+ux,-0.059268,0.002888,0.038257,0.000755,-0.040839,0.018103,0.136779,0.048689,-0.019846,...,-0.003655,0.047657,0.028251,-0.037878,0.034004,0.074488,0.008779,-0.027154,-0.01263,0.000571
4,ui+ux,-0.012034,0.015511,-0.003249,-0.078507,0.06504,-0.028536,-0.031559,0.013882,0.02209,...,0.019392,0.086536,-0.00624,0.011584,-0.04729,0.095525,-0.002575,-0.077631,-0.090518,0.043476


### Reading Test Dataset

In [3]:
# Load the held-out split with the same layout as the training file.
test_data = pd.read_csv("embedding_difficult_test.csv")

# Target stays a single-column DataFrame; every other column is a feature.
y_test = test_data.loc[:, ["category"]]
X_test = test_data.drop(columns=["category"])

test_data.head()

Unnamed: 0,category,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,data+analyst,-0.067792,-0.039427,-0.044609,0.047954,-0.020255,0.046215,0.005775,0.025648,-0.049265,...,-0.005009,0.04023,-0.025145,0.057344,-0.027139,0.044581,0.021295,-0.058819,-0.034744,0.034043
1,cyber+security,-0.031839,-0.009482,-0.023652,-0.019206,0.04417,-0.014529,0.033187,0.019848,-0.066667,...,0.024783,0.006127,0.019935,-0.044377,0.023766,0.079525,0.047992,-0.133125,-0.075094,0.007815
2,software+engineer,0.000286,0.063366,0.006427,-0.068235,0.004449,-0.010305,0.060434,-0.016027,-0.02525,...,0.045451,0.042614,0.058757,-0.078012,0.07435,0.160043,0.063957,-0.004549,-0.031632,0.029312
3,ui+ux,-0.05765,0.05561,0.026229,-0.047242,-0.035953,-0.04672,0.011318,0.081929,-0.048681,...,0.061916,0.065591,0.01331,-0.06066,0.049816,0.041116,-0.045918,0.072459,0.037671,0.06558
4,ui+ux,-0.089035,-0.014104,-0.00834,-0.051578,0.020056,-0.007032,0.044202,0.059013,-0.116712,...,0.083643,0.00103,0.027071,-0.058678,0.040474,0.126645,0.022273,-0.124107,-0.037943,0.056573


# Encode Y values

Encode the string labels in `y_train` and `y_test` as integer labels with `LabelEncoder`. The encoder is fitted on the training labels only and then reused to transform the test labels.

In [4]:
# Map the string categories to integer class ids.
# y_train / y_test are single-column DataFrames (shape (n, 1)); LabelEncoder
# expects a 1-D array and otherwise emits a DataConversionWarning, so the
# labels are flattened explicitly before encoding.
label_encoder = LabelEncoder()

# Fit on the training labels only, then reuse the same mapping for the test labels.
y_train_encoded = label_encoder.fit_transform(y_train.to_numpy().ravel())
y_test_encoded = label_encoder.transform(y_test.to_numpy().ravel())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [5]:
# Shared metrics collector built around the encoded test labels. Each model cell
# below registers its predictions with `report.addmodel(...)`, and the
# "Models Summary" section tabulates them with `report.makeReport()`.
report = me.modelReport(y_test_encoded)

# Models
1. Logistic Regression
2. SVM
3. AdaBoost
4. Gradient Boosting
5. Gaussian NB
6. Decision Tree
7. Random Forest
8. XGBoost

### 1. Logistic Regression

In [6]:
# Logistic-regression baseline: library defaults apart from the fixed seed.
logistic_regression_model = LogisticRegression(random_state=0).fit(X_train, y_train_encoded)

# Hard class predictions and per-class probabilities on the test split.
logistic_regression_model_y_pred = logistic_regression_model.predict(X_test)
logistic_regression_model_y_pred_prob = logistic_regression_model.predict_proba(X_test)

# Per-class precision/recall/F1, labelled with the original category names.
print(
    classification_report(
        y_test_encoded,
        logistic_regression_model_y_pred,
        target_names=label_encoder.classes_,
    )
)
# Register this model's outputs with the shared summary collector.
report.addmodel(
    logistic_regression_model_y_pred_prob,
    'Logistic-Regression',
    logistic_regression_model_y_pred,
)

                   precision    recall  f1-score   support

   cyber+security       0.97      0.95      0.96       212
     data+analyst       0.87      0.94      0.90       172
software+engineer       0.90      0.86      0.88       211
            ui+ux       0.90      0.90      0.90       200

         accuracy                           0.91       795
        macro avg       0.91      0.91      0.91       795
     weighted avg       0.91      0.91      0.91       795



### 2. SVM 

In [7]:
# RBF-kernel SVM; probability=True enables predict_proba.
svm_model = SVC(kernel="rbf", probability=True, random_state=0).fit(X_train, y_train_encoded)

# Hard class predictions and per-class probabilities on the test split.
svm_model_y_pred = svm_model.predict(X_test)
svm_model_y_pred_prob = svm_model.predict_proba(X_test)

# Per-class precision/recall/F1, labelled with the original category names.
print(
    classification_report(
        y_test_encoded,
        svm_model_y_pred,
        target_names=label_encoder.classes_,
    )
)
# Register this model's outputs with the shared summary collector.
report.addmodel(svm_model_y_pred_prob, 'SVM', svm_model_y_pred)

                   precision    recall  f1-score   support

   cyber+security       0.98      0.96      0.97       212
     data+analyst       0.89      0.97      0.93       172
software+engineer       0.90      0.95      0.93       211
            ui+ux       0.98      0.88      0.92       200

         accuracy                           0.94       795
        macro avg       0.94      0.94      0.94       795
     weighted avg       0.94      0.94      0.94       795



In [8]:
# Sigmoid-kernel SVM variant; probability=True enables predict_proba.
svm_model_sigmoid = SVC(kernel="sigmoid", probability=True, random_state=0).fit(
    X_train, y_train_encoded
)

# Hard class predictions and per-class probabilities on the test split.
svm_model_sigmoid_y_pred = svm_model_sigmoid.predict(X_test)
svm_model_sigmoid_y_pred_prob = svm_model_sigmoid.predict_proba(X_test)

# Per-class precision/recall/F1, labelled with the original category names.
print(
    classification_report(
        y_test_encoded,
        svm_model_sigmoid_y_pred,
        target_names=label_encoder.classes_,
    )
)
# Register this model's outputs with the shared summary collector.
report.addmodel(svm_model_sigmoid_y_pred_prob, 'SVM-sigmoid', svm_model_sigmoid_y_pred)

                   precision    recall  f1-score   support

   cyber+security       0.94      0.94      0.94       212
     data+analyst       0.86      0.92      0.89       172
software+engineer       0.88      0.81      0.84       211
            ui+ux       0.86      0.88      0.87       200

         accuracy                           0.88       795
        macro avg       0.88      0.89      0.88       795
     weighted avg       0.88      0.88      0.88       795



In [9]:
# Polynomial-kernel SVM variant; probability=True enables predict_proba.
svm_model_poly = SVC(kernel="poly", probability=True, random_state=0).fit(
    X_train, y_train_encoded
)

# Hard class predictions and per-class probabilities on the test split.
svm_model_poly_y_pred = svm_model_poly.predict(X_test)
svm_model_poly_y_pred_prob = svm_model_poly.predict_proba(X_test)

# Per-class precision/recall/F1, labelled with the original category names.
print(
    classification_report(
        y_test_encoded,
        svm_model_poly_y_pred,
        target_names=label_encoder.classes_,
    )
)
# Register this model's outputs with the shared summary collector.
report.addmodel(svm_model_poly_y_pred_prob, 'SVM-poly', svm_model_poly_y_pred)

                   precision    recall  f1-score   support

   cyber+security       0.98      0.98      0.98       212
     data+analyst       0.92      0.98      0.95       172
software+engineer       0.92      0.95      0.93       211
            ui+ux       0.98      0.89      0.93       200

         accuracy                           0.95       795
        macro avg       0.95      0.95      0.95       795
     weighted avg       0.95      0.95      0.95       795



In [10]:
# param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01],'kernel': ['rbf', 'poly']}
# SVMCV = GridSearchCV(SVC(),param_grid,refit=True,verbose=2, scoring='f1_macro')
# SVMCV.fit(X_train, y_train_encoded)

In [11]:
# SVMCV.best_estimator_

In [12]:
# Polynomial-kernel SVM with explicit C and gamma. Both values lie inside the
# grid of the commented-out GridSearchCV cell above; presumably they are its
# best estimator (TODO confirm, the search output is not kept in the notebook).
svm_model_Tune = SVC(
    kernel="poly",
    C=10,
    gamma=1,
    probability=True,
    random_state=0,
).fit(X_train, y_train_encoded)

# Hard class predictions and per-class probabilities on the test split.
svm_model_Tune_y_pred = svm_model_Tune.predict(X_test)
svm_model_Tune_y_pred_prob = svm_model_Tune.predict_proba(X_test)

# Per-class precision/recall/F1, labelled with the original category names.
print(
    classification_report(
        y_test_encoded,
        svm_model_Tune_y_pred,
        target_names=label_encoder.classes_,
    )
)
# Register this model's outputs with the shared summary collector.
report.addmodel(svm_model_Tune_y_pred_prob, 'SVM-tune', svm_model_Tune_y_pred)

                   precision    recall  f1-score   support

   cyber+security       0.98      0.98      0.98       212
     data+analyst       0.92      0.97      0.95       172
software+engineer       0.91      0.94      0.93       211
            ui+ux       0.98      0.90      0.93       200

         accuracy                           0.95       795
        macro avg       0.95      0.95      0.95       795
     weighted avg       0.95      0.95      0.95       795



### 3. Adaboost

In [13]:
# AdaBoost with library defaults apart from the fixed seed.
adaboost_model = AdaBoostClassifier(random_state=0).fit(X_train, y_train_encoded)

# Hard class predictions and per-class probabilities on the test split.
adaboost_model_y_pred = adaboost_model.predict(X_test)
adaboost_model_y_pred_prob = adaboost_model.predict_proba(X_test)

# Per-class precision/recall/F1, labelled with the original category names.
print(
    classification_report(
        y_test_encoded,
        adaboost_model_y_pred,
        target_names=label_encoder.classes_,
    )
)
# Register this model's outputs with the shared summary collector.
report.addmodel(adaboost_model_y_pred_prob, 'AdaBoosting', adaboost_model_y_pred)

                   precision    recall  f1-score   support

   cyber+security       0.97      0.85      0.90       212
     data+analyst       0.75      0.88      0.81       172
software+engineer       0.75      0.81      0.77       211
            ui+ux       0.88      0.79      0.83       200

         accuracy                           0.83       795
        macro avg       0.84      0.83      0.83       795
     weighted avg       0.84      0.83      0.83       795



### 4. Gradient Boosting

In [14]:
# Gradient boosting with library defaults apart from the fixed seed.
# This fitted model is also passed to the stacking step further down.
gradient_boosting_model = GradientBoostingClassifier(random_state=0).fit(
    X_train, y_train_encoded
)

# Hard class predictions and per-class probabilities on the test split.
gradient_boosting_model_y_pred = gradient_boosting_model.predict(X_test)
gradient_boosting_model_y_pred_prob = gradient_boosting_model.predict_proba(X_test)

# Per-class precision/recall/F1, labelled with the original category names.
print(
    classification_report(
        y_test_encoded,
        gradient_boosting_model_y_pred,
        target_names=label_encoder.classes_,
    )
)
# Register this model's outputs with the shared summary collector.
report.addmodel(
    gradient_boosting_model_y_pred_prob,
    'Gradient Boosting',
    gradient_boosting_model_y_pred,
)

                   precision    recall  f1-score   support

   cyber+security       0.98      0.97      0.97       212
     data+analyst       0.91      0.93      0.92       172
software+engineer       0.89      0.92      0.91       211
            ui+ux       0.94      0.89      0.91       200

         accuracy                           0.93       795
        macro avg       0.93      0.93      0.93       795
     weighted avg       0.93      0.93      0.93       795



### 5. Gaussian NB

In [15]:
# Gaussian naive Bayes with library defaults (no seed parameter is passed).
gaussiannb_model = GaussianNB().fit(X_train, y_train_encoded)

# Hard class predictions and per-class probabilities on the test split.
gaussiannb_model_y_pred = gaussiannb_model.predict(X_test)
gaussiannb_model_y_pred_prob = gaussiannb_model.predict_proba(X_test)

# Per-class precision/recall/F1, labelled with the original category names.
print(
    classification_report(
        y_test_encoded,
        gaussiannb_model_y_pred,
        target_names=label_encoder.classes_,
    )
)
# Register this model's outputs with the shared summary collector.
report.addmodel(gaussiannb_model_y_pred_prob, 'Gaussian NB', gaussiannb_model_y_pred)


                   precision    recall  f1-score   support

   cyber+security       0.98      0.92      0.95       212
     data+analyst       0.82      0.93      0.87       172
software+engineer       0.84      0.86      0.85       211
            ui+ux       0.90      0.84      0.87       200

         accuracy                           0.89       795
        macro avg       0.89      0.89      0.89       795
     weighted avg       0.89      0.89      0.89       795



### 6. Decision Tree

In [16]:
# Single decision tree with library defaults apart from the fixed seed.
decision_tree_model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train_encoded)

# Hard class predictions and per-class probabilities on the test split.
decision_tree_model_y_pred = decision_tree_model.predict(X_test)
decision_tree_model_y_pred_prob = decision_tree_model.predict_proba(X_test)

# Per-class precision/recall/F1, labelled with the original category names.
print(
    classification_report(
        y_test_encoded,
        decision_tree_model_y_pred,
        target_names=label_encoder.classes_,
    )
)
# Register this model's outputs with the shared summary collector.
report.addmodel(decision_tree_model_y_pred_prob, 'Decision Tree', decision_tree_model_y_pred)


                   precision    recall  f1-score   support

   cyber+security       0.91      0.91      0.91       212
     data+analyst       0.78      0.85      0.82       172
software+engineer       0.81      0.79      0.80       211
            ui+ux       0.84      0.80      0.82       200

         accuracy                           0.84       795
        macro avg       0.84      0.84      0.84       795
     weighted avg       0.84      0.84      0.84       795



### 7. Random Forest

In [17]:
# parameters = {'max_depth':range(4,10,2), 'n_estimators':range(100, 1000, 300)}
# rfCV = GridSearchCV(estimator = RandomForestClassifier(), param_grid = parameters, scoring='f1_macro')
# rfCV.fit(X_train, y_train_encoded)

In [18]:
# rfCV.best_params_, rfCV.best_score_

In [19]:
# Random forest with explicit depth and tree count. Both values lie inside the
# grid of the commented-out GridSearchCV cell above; presumably they are its
# best parameters (TODO confirm, the search output is not kept in the notebook).
random_forest_model = RandomForestClassifier(
    n_estimators=400,
    max_depth=8,
    random_state=0,
).fit(X_train, y_train_encoded)

# Hard class predictions and per-class probabilities on the test split.
random_forest_model_y_pred = random_forest_model.predict(X_test)
random_forest_model_y_pred_prob = random_forest_model.predict_proba(X_test)

# Per-class precision/recall/F1, labelled with the original category names.
print(
    classification_report(
        y_test_encoded,
        random_forest_model_y_pred,
        target_names=label_encoder.classes_,
    )
)
# Register this model's outputs with the shared summary collector.
report.addmodel(random_forest_model_y_pred_prob, 'Random Forest', random_forest_model_y_pred)


                   precision    recall  f1-score   support

   cyber+security       0.98      0.97      0.98       212
     data+analyst       0.90      0.97      0.93       172
software+engineer       0.90      0.94      0.92       211
            ui+ux       0.97      0.86      0.91       200

         accuracy                           0.94       795
        macro avg       0.94      0.94      0.94       795
     weighted avg       0.94      0.94      0.94       795



### 8. XGBoost

In [20]:
# parameters = {'max_depth':range(4,10,2), 'n_estimators':range(100, 1000, 300)}
# xgbCV = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate =0.1,min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8), 
# param_grid = parameters, scoring='f1_macro')
# xgbCV.fit(X_train, y_train_encoded)

In [21]:
# xgbCV.best_params_, xgbCV.best_score_

In [22]:
# XGBoost with explicit depth and tree count. Both values lie inside the grid
# of the commented-out GridSearchCV cell above; presumably they are its best
# parameters (TODO confirm, the search output is not kept in the notebook).
xgboost_model = xgb.XGBClassifier(
    n_estimators=700,
    max_depth=4,
    random_state=0,
).fit(X_train, y_train_encoded)

# Hard class predictions and per-class probabilities on the test split.
xgboost_model_y_pred = xgboost_model.predict(X_test)
xgboost_model_y_pred_prob = xgboost_model.predict_proba(X_test)

# Per-class precision/recall/F1, labelled with the original category names.
print(
    classification_report(
        y_test_encoded,
        xgboost_model_y_pred,
        target_names=label_encoder.classes_,
    )
)
# Register this model's outputs with the shared summary collector.
report.addmodel(xgboost_model_y_pred_prob, 'XGBoost', xgboost_model_y_pred)

                   precision    recall  f1-score   support

   cyber+security       0.98      0.98      0.98       212
     data+analyst       0.91      0.96      0.93       172
software+engineer       0.90      0.92      0.91       211
            ui+ux       0.96      0.89      0.92       200

         accuracy                           0.94       795
        macro avg       0.94      0.94      0.94       795
     weighted avg       0.94      0.94      0.94       795



# Models Summary

In [23]:
# Tabulate the metrics for the models registered through `report.addmodel(...)`;
# the rendered table has one row per model with Accuracy, Precision, Recall,
# F1-Measure and AUC columns.
report.makeReport()

Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Logistic-Regression,0.909434,0.908069,0.910424,0.908805,0.989509
SVM,0.937107,0.937901,0.937836,0.93654,0.99382
SVM-sigmoid,0.884277,0.883035,0.885678,0.883755,0.980824
SVM-poly,0.94717,0.947539,0.947691,0.946728,0.994101
SVM-tune,0.94717,0.947541,0.947548,0.946866,0.99294
AdaBoosting,0.830189,0.837128,0.832116,0.831404,0.939936
Gradient Boosting,0.928302,0.928078,0.927775,0.927634,0.991961
Gaussian NB,0.885535,0.886224,0.886966,0.885301,0.978583
Decision Tree,0.838994,0.837258,0.839124,0.83761,0.893871
Random Forest,0.935849,0.936501,0.936236,0.935246,0.993517


## Model Ensemble

In [24]:
# Build the ensemble helper from three base learners.
#
# The estimators are re-created here with the same hyperparameters as in their
# individual sections, so these names now refer to fresh, unfitted instances
# rather than the models fitted earlier (presumably the ensemble helper fits
# them itself; confirm in modelEnsemble).
random_forest_model = RandomForestClassifier(random_state = 0, max_depth=8, n_estimators=400)
svm_model_Tune = SVC(random_state = 0, kernel = "poly", probability = True, C=10, gamma=1)
xgboost_model = xgb.XGBClassifier(random_state = 0, max_depth=4, n_estimators=700)

# (name, estimator) pairs handed to the ensemble helper.
estimators = [('rf', random_forest_model), ('svm', svm_model_Tune), ('xgboost', xgboost_model)]

# `me` is the modelEnsemble module imported in the imports cell at the top of the
# notebook. The former mid-notebook `import modelEnsemble` / `importlib.reload`
# was development scaffolding and is not needed on a fresh Restart & Run All.
vs = me.modelEnsemble(estimators, X_train, y_train_encoded, X_test, y_test_encoded)

In [25]:
# Evaluate the voting ensemble; the returned table has one row each for hard and
# soft voting with Accuracy, Precision, Recall and F1-Measure.
vsreport = vs.voting()
vsreport

Unnamed: 0,Accuracy,Precision,Recall,F1-Measure
Voting-Hard,0.943396,0.94403,0.943863,0.943098
Voting-Soft,0.942138,0.94247,0.942749,0.941779


In [26]:
# Evaluate the stacking ensemble, passing the gradient-boosting model fitted in
# section 4 (presumably used as the final/meta estimator; confirm in
# modelEnsemble). NOTE: this rebinds `vsreport`, replacing the voting table
# from the previous cell.
vsreport = vs.stacking(gradient_boosting_model)
vsreport

Unnamed: 0,Stacking
Accuracy,0.940881
Precision,0.941177
Recall,0.941855
F1-Measure,0.940614


# Predict Resume Output

In [27]:
# Import only the helper this cell uses; a wildcard import would pull every
# public name of resume_process into the notebook namespace and could silently
# shadow variables defined above.
from resume_process import process_resume

# process_resume returns a pair; only the second element is used here
# (presumably the embedding features for the "difficult" setting; confirm in
# resume_process).
_, x = process_resume("resume_se.txt", "difficult")

# Predict with the ensemble and map the integer class id back to its category name.
label_encoder.inverse_transform(vs.predict(x))



array(['software+engineer'], dtype=object)