# Installing Libraries

In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder

# for the models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

# for evaluation metrics of the models
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, precision_score, recall_score, f1_score

# Reading Dataset

### Reading Train Dataset

In [2]:
train_data = pd.read_csv("bag_difficult_train.csv")
X_train = train_data.drop(["category"], axis =  "columns")
y_train = train_data[['category']]
train_data.head()

Unnamed: 0,category,Job,knowledge,assessment,information,Experience,vulnerability,ÊÊ,Business,skill,...,solution,UX,quality,analytical,control,audit,new,team,opportunity,technology
0,software+engineer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117808,0.0,0.143129
1,software+engineer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.071098,0.152323,0.0,0.123375
2,software+engineer,0.0,0.080771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.290684,0.0,0.0,0.068451,0.0,0.0
3,ui+ux,0.0,0.0,0.120286,0.073285,0.0,0.133085,0.0,0.0,0.112439,...,0.068614,0.0,0.0,0.0,0.0,0.0,0.068919,0.098438,0.0,0.059797
4,ui+ux,0.0,0.041995,0.0,0.0,0.0,0.0,0.0,0.0,0.040652,...,0.0,0.0,0.063359,0.0,0.0,0.0,0.09967,0.142358,0.229782,0.043239


### Reading Test Dataset

In [3]:
test_data = pd.read_csv("bag_difficult_test.csv")
X_test = test_data.drop(["category"], axis =  "columns")
y_test = test_data[['category']]
test_data.head()

Unnamed: 0,category,Job,knowledge,assessment,information,Experience,vulnerability,ÊÊ,Business,skill,...,solution,UX,quality,analytical,control,audit,new,team,opportunity,technology
0,data+analyst,0.0,0.173064,0.0,0.0,0.0,0.0,0.0,0.0,0.248572,...,0.0,0.0,0.0,0.132009,0.0,0.0,0.0,0.0,0.116499,0.0
1,cyber+security,0.0,0.10125,0.0,0.127235,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085353,0.136315,0.0
2,software+engineer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.175637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13856,0.100835
3,ui+ux,0.0,0.073967,0.0,0.046475,0.0,0.0,0.0,0.0,0.141651,...,0.462844,0.0,0.05411,0.05642,0.0,0.0,0.042015,0.09353,0.049791,0.07247
4,ui+ux,0.0,0.051684,0.0,0.064949,0.0,0.0,0.0,0.0,0.148468,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043569,0.0,0.202553


# Encode Y values

Encode the string labels in y_train and y_test to numerical labels using the LabelEncoder class

In [4]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


# Models
1. Logistic Regression 
2. SVM
3. Adaboost
4. Gradient Boosting
5. Gaussian NB
6. Decision Tree 
7. Random forest 
8. XGBoost

### 1. Logistic Regression

In [5]:
logistic_regression_model = LogisticRegression(random_state = 0)
logistic_regression_model.fit(X_train, y_train_encoded)

logistic_regression_model_y_pred = logistic_regression_model.predict(X_test)
logistic_regression_model_y_pred_prob = logistic_regression_model.predict_proba(X_test)

print(classification_report(y_test_encoded, logistic_regression_model_y_pred, target_names = label_encoder.classes_))
logistic_regression_accuracy = accuracy_score(y_test_encoded, logistic_regression_model_y_pred)
logistic_regression_precision = precision_score(y_test_encoded, logistic_regression_model_y_pred, average = "macro")
logistic_regression_recall = recall_score(y_test_encoded, logistic_regression_model_y_pred, average = "macro")
logistic_regression_f1 = f1_score(y_test_encoded, logistic_regression_model_y_pred, average = "macro")
logistic_regression_auc = roc_auc_score(y_test_encoded, logistic_regression_model_y_pred_prob, multi_class = "ovo")
logistic_regression_metrics = [logistic_regression_accuracy, logistic_regression_precision, logistic_regression_recall, logistic_regression_f1, logistic_regression_auc]
pd.DataFrame(logistic_regression_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["Logistic Regression"]).T


                   precision    recall  f1-score   support

   cyber+security       0.98      0.95      0.97       212
     data+analyst       0.86      0.93      0.89       172
software+engineer       0.86      0.84      0.85       211
            ui+ux       0.89      0.86      0.88       200

         accuracy                           0.90       795
        macro avg       0.90      0.90      0.90       795
     weighted avg       0.90      0.90      0.90       795



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Logistic Regression,0.896855,0.89582,0.897916,0.896374,0.981424


### 2. SVM

In [6]:
svm_model = SVC(random_state = 0, kernel = "rbf", probability = True)
svm_model.fit(X_train, y_train_encoded)

svm_model_y_pred = svm_model.predict(X_test)
svm_model_y_pred_prob = svm_model.predict_proba(X_test)

print(classification_report(y_test_encoded, svm_model_y_pred, target_names = label_encoder.classes_))
svm_accuracy = accuracy_score(y_test_encoded, svm_model_y_pred)
svm_precision = precision_score(y_test_encoded, svm_model_y_pred, average = "macro")
svm_recall = recall_score(y_test_encoded, svm_model_y_pred, average = "macro")
svm_f1 = f1_score(y_test_encoded, svm_model_y_pred, average = "macro")
svm_auc = roc_auc_score(y_test_encoded, svm_model_y_pred_prob, multi_class = "ovo")
svm_metrics = [svm_accuracy, svm_precision, svm_recall, svm_f1, svm_auc]
pd.DataFrame(svm_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["SVM"]).T


                   precision    recall  f1-score   support

   cyber+security       0.97      0.98      0.97       212
     data+analyst       0.93      0.95      0.94       172
software+engineer       0.88      0.92      0.90       211
            ui+ux       0.94      0.87      0.90       200

         accuracy                           0.93       795
        macro avg       0.93      0.93      0.93       795
     weighted avg       0.93      0.93      0.93       795



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
SVM,0.928302,0.928946,0.92838,0.928185,0.990355


### 3. Adaboost

In [7]:
adaboost_model = AdaBoostClassifier(random_state = 0)
adaboost_model.fit(X_train, y_train_encoded)

adaboost_model_y_pred = adaboost_model.predict(X_test)
adaboost_model_y_pred_prob = adaboost_model.predict_proba(X_test)

print(classification_report(y_test_encoded, adaboost_model_y_pred, target_names = label_encoder.classes_))
adaboost_accuracy = accuracy_score(y_test_encoded, adaboost_model_y_pred)
adaboost_precision = precision_score(y_test_encoded, adaboost_model_y_pred, average = "macro")
adaboost_recall = recall_score(y_test_encoded, adaboost_model_y_pred, average = "macro")
adaboost_f1 = f1_score(y_test_encoded, adaboost_model_y_pred, average = "macro")
adaboost_auc = roc_auc_score(y_test_encoded, adaboost_model_y_pred_prob, multi_class = "ovo")
adaboost_metrics = [adaboost_accuracy, adaboost_precision, adaboost_recall, adaboost_f1, adaboost_auc]
pd.DataFrame(adaboost_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["Adaboost"]).T


                   precision    recall  f1-score   support

   cyber+security       0.97      0.99      0.98       212
     data+analyst       0.84      0.92      0.88       172
software+engineer       0.66      0.70      0.68       211
            ui+ux       0.67      0.56      0.61       200

         accuracy                           0.79       795
        macro avg       0.78      0.79      0.78       795
     weighted avg       0.78      0.79      0.78       795



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Adaboost,0.787421,0.7831,0.790284,0.784915,0.9208


### 4. Gradient Boosting

In [8]:
gradient_boosting_model = GradientBoostingClassifier(random_state = 0)
gradient_boosting_model.fit(X_train, y_train_encoded)

gradient_boosting_model_y_pred = gradient_boosting_model.predict(X_test)
gradient_boosting_model_y_pred_prob = gradient_boosting_model.predict_proba(X_test)

print(classification_report(y_test_encoded, gradient_boosting_model_y_pred, target_names = label_encoder.classes_))
gradient_boosting_accuracy = accuracy_score(y_test_encoded, gradient_boosting_model_y_pred)
gradient_boosting_precision = precision_score(y_test_encoded, gradient_boosting_model_y_pred, average = "macro")
gradient_boosting_recall = recall_score(y_test_encoded, gradient_boosting_model_y_pred, average = "macro")
gradient_boosting_f1 = f1_score(y_test_encoded, gradient_boosting_model_y_pred, average = "macro")
gradient_boosting_auc = roc_auc_score(y_test_encoded, gradient_boosting_model_y_pred_prob, multi_class = "ovo")
gradient_boosting_metrics = [gradient_boosting_accuracy, gradient_boosting_precision, gradient_boosting_recall, gradient_boosting_f1, gradient_boosting_auc]
pd.DataFrame(gradient_boosting_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["Gradient Boosting"]).T


                   precision    recall  f1-score   support

   cyber+security       0.96      1.00      0.98       212
     data+analyst       0.93      0.94      0.94       172
software+engineer       0.87      0.91      0.89       211
            ui+ux       0.94      0.85      0.90       200

         accuracy                           0.92       795
        macro avg       0.93      0.92      0.92       795
     weighted avg       0.93      0.92      0.92       795



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Gradient Boosting,0.924528,0.925458,0.924339,0.924232,0.988924


### 5. Gaussian NB

In [9]:
gaussiannb_model = GaussianNB()
gaussiannb_model.fit(X_train, y_train_encoded)

gaussiannb_model_y_pred = gaussiannb_model.predict(X_test)
gaussiannb_model_y_pred_prob = gaussiannb_model.predict_proba(X_test)

print(classification_report(y_test_encoded, gaussiannb_model_y_pred, target_names = label_encoder.classes_))
gaussiannb_accuracy = accuracy_score(y_test_encoded, gaussiannb_model_y_pred)
gaussiannb_precision = precision_score(y_test_encoded, gaussiannb_model_y_pred, average = "macro")
gaussiannb_recall = recall_score(y_test_encoded, gaussiannb_model_y_pred, average = "macro")
gaussiannb_f1 = f1_score(y_test_encoded, gaussiannb_model_y_pred, average = "macro")
gaussiannb_auc = roc_auc_score(y_test_encoded, gaussiannb_model_y_pred_prob, multi_class = "ovo")
gaussiannb_metrics = [gaussiannb_accuracy, gaussiannb_precision, gaussiannb_recall, gaussiannb_f1, gaussiannb_auc]
pd.DataFrame(gaussiannb_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["Gaussian NB"]).T


                   precision    recall  f1-score   support

   cyber+security       0.93      0.97      0.95       212
     data+analyst       0.83      0.90      0.87       172
software+engineer       0.80      0.77      0.78       211
            ui+ux       0.82      0.76      0.79       200

         accuracy                           0.85       795
        macro avg       0.85      0.85      0.85       795
     weighted avg       0.85      0.85      0.85       795



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Gaussian NB,0.849057,0.846161,0.850158,0.847373,0.95323


### 6. Decision Tree

In [10]:
decision_tree_model = DecisionTreeClassifier(random_state = 0)
decision_tree_model.fit(X_train, y_train_encoded)

decision_tree_model_y_pred = decision_tree_model.predict(X_test)
decision_tree_model_y_pred_prob = decision_tree_model.predict_proba(X_test)

print(classification_report(y_test_encoded, decision_tree_model_y_pred, target_names = label_encoder.classes_))
decision_tree_accuracy = accuracy_score(y_test_encoded, decision_tree_model_y_pred)
decision_tree_precision = precision_score(y_test_encoded, decision_tree_model_y_pred, average = "macro")
decision_tree_recall = recall_score(y_test_encoded, decision_tree_model_y_pred, average = "macro")
decision_tree_f1 = f1_score(y_test_encoded, decision_tree_model_y_pred, average = "macro")
decision_tree_auc = roc_auc_score(y_test_encoded, decision_tree_model_y_pred_prob, multi_class = "ovo")
decision_tree_metrics = [decision_tree_accuracy, decision_tree_precision, decision_tree_recall, decision_tree_f1, decision_tree_auc]
pd.DataFrame(decision_tree_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["Decision Tree"]).T


                   precision    recall  f1-score   support

   cyber+security       0.95      0.98      0.96       212
     data+analyst       0.86      0.91      0.89       172
software+engineer       0.81      0.79      0.80       211
            ui+ux       0.83      0.79      0.81       200

         accuracy                           0.87       795
        macro avg       0.86      0.87      0.86       795
     weighted avg       0.86      0.87      0.86       795



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Decision Tree,0.865409,0.863282,0.866484,0.864495,0.911905


### 7. Random Forest

In [11]:
random_forest_model = RandomForestClassifier(random_state = 0)
random_forest_model.fit(X_train, y_train_encoded)

random_forest_model_y_pred = random_forest_model.predict(X_test)
random_forest_model_y_pred_prob = random_forest_model.predict_proba(X_test)

print(classification_report(y_test_encoded, random_forest_model_y_pred, target_names = label_encoder.classes_))
random_forest_accuracy = accuracy_score(y_test_encoded, random_forest_model_y_pred)
random_forest_precision = precision_score(y_test_encoded, random_forest_model_y_pred, average = "macro")
random_forest_recall = recall_score(y_test_encoded, random_forest_model_y_pred, average = "macro")
random_forest_f1 = f1_score(y_test_encoded, random_forest_model_y_pred, average = "macro")
random_forest_auc = roc_auc_score(y_test_encoded, random_forest_model_y_pred_prob, multi_class = "ovo")
random_forest_metrics = [random_forest_accuracy, random_forest_precision, random_forest_recall, random_forest_f1, random_forest_auc]
pd.DataFrame(random_forest_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["Random Forest"]).T


                   precision    recall  f1-score   support

   cyber+security       0.97      1.00      0.98       212
     data+analyst       0.93      0.97      0.95       172
software+engineer       0.87      0.94      0.91       211
            ui+ux       0.97      0.83      0.89       200

         accuracy                           0.93       795
        macro avg       0.94      0.93      0.93       795
     weighted avg       0.94      0.93      0.93       795



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Random Forest,0.933333,0.935823,0.933382,0.932759,0.99315


### 8. XGBoost

In [12]:
xgboost_model = xgb.XGBClassifier(random_state = 0)
xgboost_model.fit(X_train, y_train_encoded)

xgboost_model_y_pred = xgboost_model.predict(X_test)
xgboost_model_y_pred_prob = xgboost_model.predict_proba(X_test)

print(classification_report(y_test_encoded, xgboost_model_y_pred, target_names = label_encoder.classes_))
xgboost_accuracy = accuracy_score(y_test_encoded, xgboost_model_y_pred)
xgboost_precision = precision_score(y_test_encoded, xgboost_model_y_pred, average = "macro")
xgboost_recall = recall_score(y_test_encoded, xgboost_model_y_pred, average = "macro")
xgboost_f1 = f1_score(y_test_encoded, xgboost_model_y_pred, average = "macro")
xgboost_auc = roc_auc_score(y_test_encoded, xgboost_model_y_pred_prob, multi_class = "ovo")
xgboost_metrics = [random_forest_accuracy, xgboost_precision, xgboost_recall, xgboost_f1, xgboost_auc]
pd.DataFrame(xgboost_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["XGBoost"]).T

                   precision    recall  f1-score   support

   cyber+security       0.95      0.99      0.97       212
     data+analyst       0.94      0.96      0.95       172
software+engineer       0.88      0.91      0.90       211
            ui+ux       0.95      0.86      0.90       200

         accuracy                           0.93       795
        macro avg       0.93      0.93      0.93       795
     weighted avg       0.93      0.93      0.93       795



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
XGBoost,0.933333,0.931186,0.929961,0.929831,0.992135


# Models Summary

In [13]:
all_metrics = [logistic_regression_metrics, svm_metrics, adaboost_metrics, gradient_boosting_metrics, gaussiannb_metrics, decision_tree_metrics, random_forest_metrics, xgboost_metrics]
metrics_summary = pd.DataFrame(all_metrics, columns = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], index = ["Logistic Regression", "SVM", "Adaboost", "Gradient Boosting", "Gaussian NB", "Decision Tree", "Random Forest", "XGBoost"])
metrics_summary

Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Logistic Regression,0.896855,0.89582,0.897916,0.896374,0.981424
SVM,0.928302,0.928946,0.92838,0.928185,0.990355
Adaboost,0.787421,0.7831,0.790284,0.784915,0.9208
Gradient Boosting,0.924528,0.925458,0.924339,0.924232,0.988924
Gaussian NB,0.849057,0.846161,0.850158,0.847373,0.95323
Decision Tree,0.865409,0.863282,0.866484,0.864495,0.911905
Random Forest,0.933333,0.935823,0.933382,0.932759,0.99315
XGBoost,0.933333,0.931186,0.929961,0.929831,0.992135


## Model Ensemble

In [14]:
import modelEnsemble
import importlib
importlib.reload(modelEnsemble)
estimators = [('svm', svm_model), ('rf', random_forest_model), ('xgboost', xgboost_model)]
vs = modelEnsemble.modelEnsemble(estimators, X_train, y_train_encoded, X_test, y_test_encoded)

In [15]:
vsreport = vs.voting()
vsreport

Unnamed: 0,Accuracy,Precision,Recall,F1-Measure
Voting-Hard,0.940881,0.942454,0.941023,0.940875
Voting-Soft,0.937107,0.93816,0.937271,0.937241


In [16]:
vsreport = vs.stacking(gradient_boosting_model)
vsreport

Unnamed: 0,Stacking
Accuracy,0.925786
Precision,0.926214
Recall,0.926225
F1-Measure,0.925897


# Predicting Resume Output

In [20]:
import importlib
import resume_process
importlib.reload(resume_process)

x, _ = resume_process.process_resume("resume_se.txt", "difficult") 
label_encoder.inverse_transform(vs.predict(x))



array(['software+engineer'], dtype=object)