# Installing Libraries

In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder

# for the models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

# for evaluation metrics of the models
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, precision_score, recall_score, f1_score

# Reading Dataset

### Reading Train Dataset

In [2]:
train_data = pd.read_csv("embedding_easy_train.csv")
X_train = train_data.drop(["category"], axis =  "columns")
y_train = train_data[['category']]
train_data.head()

Unnamed: 0,category,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,software+engineer,-0.007818,0.015209,-0.032915,-0.040774,-0.075819,-0.122257,0.003905,0.105021,-0.081435,...,0.036517,0.067738,0.017234,-0.021146,-0.005016,0.052575,0.016892,0.011725,0.031423,0.008097
1,arts,-0.039281,-0.037813,-0.002049,-0.079996,-0.0093,0.014721,0.028362,-0.028484,-0.047296,...,0.06137,0.030016,-0.020944,0.025066,-0.054447,0.042434,-0.01648,-0.065129,-0.121761,0.036031
2,hr,-0.099071,0.067754,-0.01872,0.065808,-0.019722,-0.006825,-0.010283,0.00154,-0.087364,...,0.081999,0.114448,0.022363,-0.051149,0.002981,0.086356,0.092362,-0.042364,-0.033669,0.060429
3,arts,-0.000244,-0.010329,0.005088,0.019951,0.031961,-0.004796,-0.056017,0.008362,-0.058531,...,0.018572,0.015595,0.026521,0.006298,-0.02421,0.076719,-0.001265,-0.029895,-0.037964,0.052187
4,arts,0.037742,-0.058576,0.017413,-0.007462,-0.010717,-0.022504,-0.016252,0.022974,-0.052209,...,0.015944,0.056351,0.060758,0.049852,-0.018264,0.075948,0.00191,-0.011242,-0.045209,0.029765


### Reading Test Dataset

In [3]:
test_data = pd.read_csv("embedding_easy_test.csv")
X_test = test_data.drop(["category"], axis =  "columns")
y_test = test_data[['category']]
test_data.head()

Unnamed: 0,category,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,software+engineer,-0.113818,0.008952,-0.004451,0.003387,0.041243,-0.074476,0.027237,0.074545,-0.080968,...,0.016368,0.115178,0.065218,-0.074784,0.057262,0.082102,0.059299,-0.031293,0.007032,0.071702
1,sales,-0.019982,-0.019572,-0.043766,0.038807,-0.021798,-0.003498,-0.008579,-0.007094,0.017747,...,0.023369,-0.032173,-0.008954,-0.028408,-0.041258,-0.016728,0.078255,-0.043575,0.04392,-0.008786
2,sales,-0.061115,0.037327,-0.035323,0.007244,-0.014231,0.054096,0.095702,0.017702,-0.064049,...,0.024937,0.007117,0.029717,0.036748,-0.017619,0.084793,0.017758,-0.004776,-0.06719,0.031323
3,sales,-0.11086,0.012782,-0.030584,-0.077872,-0.102458,0.048481,0.027282,0.055432,-0.007119,...,-0.057282,0.060048,0.01204,0.056503,-0.008248,0.102686,-0.004278,-0.015446,-0.055078,0.049244
4,arts,-0.04488,0.017243,0.052809,-0.01201,0.038006,-0.025841,0.002447,0.005194,-0.023129,...,0.014638,0.062186,0.076078,-0.041519,-0.023763,0.048595,0.060352,-0.011345,-0.059987,0.039882


# Encode Y values

Encode the string labels in y_train and y_test to numerical labels using the LabelEncoder class

In [4]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


# Models
1. Logistic Regression 
2. SVM 
3. Adaboost
4. Gradient Boosting
5. Gaussian NB
6. Decision Tree 
7. Random forest 
8. XGBoost

### 1. Logistic Regression

In [5]:
logistic_regression_model = LogisticRegression(random_state = 0)
logistic_regression_model.fit(X_train, y_train_encoded)

logistic_regression_model_y_pred = logistic_regression_model.predict(X_test)
logistic_regression_model_y_pred_prob = logistic_regression_model.predict_proba(X_test)

print(classification_report(y_test_encoded, logistic_regression_model_y_pred, target_names = label_encoder.classes_))
logistic_regression_accuracy = accuracy_score(y_test_encoded, logistic_regression_model_y_pred)
logistic_regression_precision = precision_score(y_test_encoded, logistic_regression_model_y_pred, average = "macro")
logistic_regression_recall = recall_score(y_test_encoded, logistic_regression_model_y_pred, average = "macro")
logistic_regression_f1 = f1_score(y_test_encoded, logistic_regression_model_y_pred, average = "macro")
logistic_regression_auc = roc_auc_score(y_test_encoded, logistic_regression_model_y_pred_prob, multi_class = "ovo")
logistic_regression_metrics = [logistic_regression_accuracy, logistic_regression_precision, logistic_regression_recall, logistic_regression_f1, logistic_regression_auc]
pd.DataFrame(logistic_regression_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["Logistic Regression"]).T


                   precision    recall  f1-score   support

             arts       0.97      0.96      0.97       198
               hr       0.94      0.95      0.95       181
            sales       0.94      0.94      0.94       197
software+engineer       0.99      0.98      0.98       220

         accuracy                           0.96       796
        macro avg       0.96      0.96      0.96       796
     weighted avg       0.96      0.96      0.96       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Logistic Regression,0.959799,0.958767,0.959089,0.958913,0.996554


### 2. SVM

In [6]:
svm_model = SVC(random_state = 0, kernel = "rbf", probability = True)
svm_model.fit(X_train, y_train_encoded)

svm_model_y_pred = svm_model.predict(X_test)
svm_model_y_pred_prob = svm_model.predict_proba(X_test)

print(classification_report(y_test_encoded, svm_model_y_pred, target_names = label_encoder.classes_))
svm_accuracy = accuracy_score(y_test_encoded, svm_model_y_pred)
svm_precision = precision_score(y_test_encoded, svm_model_y_pred, average = "macro")
svm_recall = recall_score(y_test_encoded, svm_model_y_pred, average = "macro")
svm_f1 = f1_score(y_test_encoded, svm_model_y_pred, average = "macro")
svm_auc = roc_auc_score(y_test_encoded, svm_model_y_pred_prob, multi_class = "ovo")
svm_metrics = [svm_accuracy, svm_precision, svm_recall, svm_f1, svm_auc]
pd.DataFrame(svm_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["SVM"]).T


                   precision    recall  f1-score   support

             arts       0.98      0.97      0.98       198
               hr       0.95      0.97      0.96       181
            sales       0.96      0.95      0.95       197
software+engineer       0.99      0.98      0.98       220

         accuracy                           0.97       796
        macro avg       0.97      0.97      0.97       796
     weighted avg       0.97      0.97      0.97       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
SVM,0.969849,0.969052,0.969545,0.96924,0.995859


### 3. Adaboost

In [7]:
adaboost_model = AdaBoostClassifier(random_state = 0)
adaboost_model.fit(X_train, y_train_encoded)

adaboost_model_y_pred = adaboost_model.predict(X_test)
adaboost_model_y_pred_prob = adaboost_model.predict_proba(X_test)

print(classification_report(y_test_encoded, adaboost_model_y_pred, target_names = label_encoder.classes_))
adaboost_accuracy = accuracy_score(y_test_encoded, adaboost_model_y_pred)
adaboost_precision = precision_score(y_test_encoded, adaboost_model_y_pred, average = "macro")
adaboost_recall = recall_score(y_test_encoded, adaboost_model_y_pred, average = "macro")
adaboost_f1 = f1_score(y_test_encoded, adaboost_model_y_pred, average = "macro")
adaboost_auc = roc_auc_score(y_test_encoded, adaboost_model_y_pred_prob, multi_class = "ovo")
adaboost_metrics = [adaboost_accuracy, adaboost_precision, adaboost_recall, adaboost_f1, adaboost_auc]
pd.DataFrame(adaboost_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["Adaboost"]).T


                   precision    recall  f1-score   support

             arts       0.98      0.86      0.92       198
               hr       0.91      0.85      0.87       181
            sales       0.83      0.92      0.87       197
software+engineer       0.89      0.94      0.91       220

         accuracy                           0.89       796
        macro avg       0.90      0.89      0.89       796
     weighted avg       0.90      0.89      0.89       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Adaboost,0.894472,0.899341,0.892158,0.893819,0.96785


### 4. Gradient Boosting

In [8]:
gradient_boosting_model = GradientBoostingClassifier(random_state = 0)
gradient_boosting_model.fit(X_train, y_train_encoded)

gradient_boosting_model_y_pred = gradient_boosting_model.predict(X_test)
gradient_boosting_model_y_pred_prob = gradient_boosting_model.predict_proba(X_test)

print(classification_report(y_test_encoded, gradient_boosting_model_y_pred, target_names = label_encoder.classes_))
gradient_boosting_accuracy = accuracy_score(y_test_encoded, gradient_boosting_model_y_pred)
gradient_boosting_precision = precision_score(y_test_encoded, gradient_boosting_model_y_pred, average = "macro")
gradient_boosting_recall = recall_score(y_test_encoded, gradient_boosting_model_y_pred, average = "macro")
gradient_boosting_f1 = f1_score(y_test_encoded, gradient_boosting_model_y_pred, average = "macro")
gradient_boosting_auc = roc_auc_score(y_test_encoded, gradient_boosting_model_y_pred_prob, multi_class = "ovo")
gradient_boosting_metrics = [gradient_boosting_accuracy, gradient_boosting_precision, gradient_boosting_recall, gradient_boosting_f1, gradient_boosting_auc]
pd.DataFrame(gradient_boosting_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["Gradient Boosting"]).T


                   precision    recall  f1-score   support

             arts       0.98      0.97      0.98       198
               hr       0.94      0.96      0.95       181
            sales       0.94      0.94      0.94       197
software+engineer       0.99      0.98      0.99       220

         accuracy                           0.96       796
        macro avg       0.96      0.96      0.96       796
     weighted avg       0.97      0.96      0.96       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Gradient Boosting,0.964824,0.963735,0.964244,0.963953,0.998118


### 5. Gaussian NB

In [9]:
gaussiannb_model = GaussianNB()
gaussiannb_model.fit(X_train, y_train_encoded)

gaussiannb_model_y_pred = gaussiannb_model.predict(X_test)
gaussiannb_model_y_pred_prob = gaussiannb_model.predict_proba(X_test)

print(classification_report(y_test_encoded, gaussiannb_model_y_pred, target_names = label_encoder.classes_))
gaussiannb_accuracy = accuracy_score(y_test_encoded, gaussiannb_model_y_pred)
gaussiannb_precision = precision_score(y_test_encoded, gaussiannb_model_y_pred, average = "macro")
gaussiannb_recall = recall_score(y_test_encoded, gaussiannb_model_y_pred, average = "macro")
gaussiannb_f1 = f1_score(y_test_encoded, gaussiannb_model_y_pred, average = "macro")
gaussiannb_auc = roc_auc_score(y_test_encoded, gaussiannb_model_y_pred_prob, multi_class = "ovo")
gaussiannb_metrics = [gaussiannb_accuracy, gaussiannb_precision, gaussiannb_recall, gaussiannb_f1, gaussiannb_auc]
pd.DataFrame(gaussiannb_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["Gaussian NB"]).T


                   precision    recall  f1-score   support

             arts       0.96      0.89      0.93       198
               hr       0.87      0.91      0.89       181
            sales       0.87      0.90      0.88       197
software+engineer       0.97      0.96      0.97       220

         accuracy                           0.92       796
        macro avg       0.92      0.92      0.92       796
     weighted avg       0.92      0.92      0.92       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Gaussian NB,0.918342,0.917625,0.916914,0.916755,0.985512


### 6. Decision Tree

In [10]:
decision_tree_model = DecisionTreeClassifier(random_state = 0)
decision_tree_model.fit(X_train, y_train_encoded)

decision_tree_model_y_pred = decision_tree_model.predict(X_test)
decision_tree_model_y_pred_prob = decision_tree_model.predict_proba(X_test)

print(classification_report(y_test_encoded, decision_tree_model_y_pred, target_names = label_encoder.classes_))
decision_tree_accuracy = accuracy_score(y_test_encoded, decision_tree_model_y_pred)
decision_tree_precision = precision_score(y_test_encoded, decision_tree_model_y_pred, average = "macro")
decision_tree_recall = recall_score(y_test_encoded, decision_tree_model_y_pred, average = "macro")
decision_tree_f1 = f1_score(y_test_encoded, decision_tree_model_y_pred, average = "macro")
decision_tree_auc = roc_auc_score(y_test_encoded, decision_tree_model_y_pred_prob, multi_class = "ovo")
decision_tree_metrics = [decision_tree_accuracy, decision_tree_precision, decision_tree_recall, decision_tree_f1, decision_tree_auc]
pd.DataFrame(decision_tree_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["Decision Tree"]).T


                   precision    recall  f1-score   support

             arts       0.92      0.93      0.93       198
               hr       0.83      0.86      0.85       181
            sales       0.84      0.79      0.81       197
software+engineer       0.87      0.89      0.88       220

         accuracy                           0.87       796
        macro avg       0.87      0.87      0.87       796
     weighted avg       0.87      0.87      0.87       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Decision Tree,0.866834,0.865838,0.866084,0.865675,0.917378


### 7. Random Forest

In [11]:
random_forest_model = RandomForestClassifier(random_state = 0)
random_forest_model.fit(X_train, y_train_encoded)

random_forest_model_y_pred = random_forest_model.predict(X_test)
random_forest_model_y_pred_prob = random_forest_model.predict_proba(X_test)

print(classification_report(y_test_encoded, random_forest_model_y_pred, target_names = label_encoder.classes_))
random_forest_accuracy = accuracy_score(y_test_encoded, random_forest_model_y_pred)
random_forest_precision = precision_score(y_test_encoded, random_forest_model_y_pred, average = "macro")
random_forest_recall = recall_score(y_test_encoded, random_forest_model_y_pred, average = "macro")
random_forest_f1 = f1_score(y_test_encoded, random_forest_model_y_pred, average = "macro")
random_forest_auc = roc_auc_score(y_test_encoded, random_forest_model_y_pred_prob, multi_class = "ovo")
random_forest_metrics = [random_forest_accuracy, random_forest_precision, random_forest_recall, random_forest_f1, random_forest_auc]
pd.DataFrame(random_forest_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["Random Forest"]).T


                   precision    recall  f1-score   support

             arts       0.99      0.97      0.98       198
               hr       0.94      0.93      0.93       181
            sales       0.91      0.94      0.92       197
software+engineer       0.99      0.99      0.99       220

         accuracy                           0.96       796
        macro avg       0.96      0.96      0.96       796
     weighted avg       0.96      0.96      0.96       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Random Forest,0.957286,0.956483,0.955831,0.956072,0.995741


### 8. XGBoost

In [12]:
xgboost_model = xgb.XGBClassifier(random_state = 0)
xgboost_model.fit(X_train, y_train_encoded)

xgboost_model_y_pred = xgboost_model.predict(X_test)
xgboost_model_y_pred_prob = xgboost_model.predict_proba(X_test)

print(classification_report(y_test_encoded, xgboost_model_y_pred, target_names = label_encoder.classes_))
xgboost_accuracy = accuracy_score(y_test_encoded, xgboost_model_y_pred)
xgboost_precision = precision_score(y_test_encoded, xgboost_model_y_pred, average = "macro")
xgboost_recall = recall_score(y_test_encoded, xgboost_model_y_pred, average = "macro")
xgboost_f1 = f1_score(y_test_encoded, xgboost_model_y_pred, average = "macro")
xgboost_auc = roc_auc_score(y_test_encoded, xgboost_model_y_pred_prob, multi_class = "ovo")
xgboost_metrics = [random_forest_accuracy, xgboost_precision, xgboost_recall, xgboost_f1, xgboost_auc]
pd.DataFrame(xgboost_metrics, index = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], columns = ["XGBoost"]).T

                   precision    recall  f1-score   support

             arts       0.97      0.98      0.98       198
               hr       0.92      0.95      0.93       181
            sales       0.95      0.94      0.95       197
software+engineer       0.99      0.97      0.98       220

         accuracy                           0.96       796
        macro avg       0.96      0.96      0.96       796
     weighted avg       0.96      0.96      0.96       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
XGBoost,0.957286,0.959752,0.960472,0.960014,0.998305


# Models Summary

In [None]:
all_metrics = [logistic_regression_metrics, svm_metrics, adaboost_metrics, gradient_boosting_metrics, gaussiannb_metrics, decision_tree_metrics, random_forest_metrics, xgboost_metrics]
metrics_summary = pd.DataFrame(all_metrics, columns = ["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"], index = ["Logistic Regression", "SVM", "Adaboost", "Gradient Boosting", "Gaussian NB", "Decision Tree", "Random Forest", "XGBoost"])
metrics_summary

Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Logistic Regression,0.959799,0.958767,0.959089,0.958913,0.996554
SVM,0.969849,0.969052,0.969545,0.96924,0.995859
Adaboost,0.894472,0.899341,0.892158,0.893819,0.96785
Gradient Boosting,0.964824,0.963735,0.964244,0.963953,0.998118
Gaussian NB,0.918342,0.917625,0.916914,0.916755,0.985512
Decision Tree,0.866834,0.865838,0.866084,0.865675,0.917378
Random Forest,0.957286,0.956483,0.955831,0.956072,0.995741


## Model Ensemble

In [14]:
import modelEnsemble
import importlib
importlib.reload(modelEnsemble)
estimators = [('logreg', logistic_regression_model), ('svm', svm_model), ('gradboost', gradient_boosting_model)]
vs = modelEnsemble.modelEnsemble(estimators, X_train, y_train_encoded, X_test, y_test_encoded)

In [15]:
vsreport = vs.voting()
vsreport

Unnamed: 0,Accuracy,Precision,F1-Measure
Voting-Hard,0.967337,0.96653,0.966734
Voting-Soft,0.96608,0.965247,0.965419


In [16]:
vsreport = vs.stacking(svm_model)
vsreport