# Importing Libraries

In [83]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder

# for the models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# for evaluation metrics of the models
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, precision_score, recall_score, f1_score

# Reading Dataset

### Reading Train Dataset

In [84]:
# Load the training split: "category" is the target, the remaining columns are the features.
train_data = pd.read_csv("bag_easy_train.csv")

# Double brackets keep the target as a single-column DataFrame.
y_train = train_data[["category"]]
X_train = train_data.drop(columns=["category"])

train_data.head()

Unnamed: 0,category,Human,’,Development,design,Work,Computer,creative,account,communication,...,Requirements,technical,Excellent,duty,market,knowledge,technology,business,application,matter
0,software+engineer,0.0,0.0,0.0,0.222604,0.0,0.0,0.0,0.0,0.0,...,0.0,0.281375,0.0,0.0,0.0,0.0,0.0,0.198413,0.11879,0.0
1,arts,0.0,0.0,0.0,0.102594,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.092776,0.0,0.0,0.0,0.0
2,hr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08689,0.122337
3,arts,0.0,0.0,0.0,0.393433,0.0,0.0,0.113493,0.0,0.268595,...,0.0,0.0,0.0,0.0,0.0,0.071156,0.0,0.0,0.0,0.0
4,arts,0.0,0.0,0.0,0.206635,0.0,0.0,0.149018,0.0,0.088168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.118194,0.0,0.0,0.077626


### Reading Test Dataset

In [85]:
# Load the held-out test split with the same layout as the training split.
test_data = pd.read_csv("bag_easy_test.csv")

# Double brackets keep the target as a single-column DataFrame.
y_test = test_data[["category"]]
X_test = test_data.drop(columns=["category"])

test_data.head()

Unnamed: 0,category,Human,’,Development,design,Work,Computer,creative,account,communication,...,Requirements,technical,Excellent,duty,market,knowledge,technology,business,application,matter
0,software+engineer,0.0,0.0,0.0,0.504831,0.0,0.0,0.0,0.0,0.089474,...,0.0,0.0,0.0,0.0,0.0,0.370597,0.0,0.0,0.0,0.0
1,sales,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.284994,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.164661,0.0,0.136495
2,sales,0.0,0.0,0.0,0.0,0.0,0.0,0.071549,0.0,0.083948,...,0.0,0.0,0.0,0.0,0.199817,0.043464,0.0,0.089533,0.0,0.0
3,sales,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.239945,0.129985,...,0.0,0.0,0.0,0.210654,0.206265,0.0,0.0,0.277267,0.0,0.0
4,arts,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.366191,...,0.0,0.0,0.0,0.0,0.0,0.068943,0.389033,0.07101,0.0,0.058863


# Encode Y values

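Encode the string labels in `y_train` and `y_test` as numerical labels using the `LabelEncoder` class. The encoder is fitted on the training labels only and then reused to transform the test labels.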

In [86]:
# LabelEncoder expects a 1-D array of labels. y_train / y_test are single-column
# DataFrames, and passing them directly makes scikit-learn emit a
# DataConversionWarning while it flattens them internally. Flatten explicitly
# instead; .to_numpy().ravel() works for both a DataFrame and a Series.
label_encoder = LabelEncoder()

# Fit the label -> integer mapping on the training labels only ...
y_train_encoded = label_encoder.fit_transform(y_train.to_numpy().ravel())
# ... and reuse that same mapping for the test labels.
y_test_encoded = label_encoder.transform(y_test.to_numpy().ravel())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


# Models
1. Logistic Regression 
2. SVM ('rbf') 
3. SVM ('linear')
4. Adaboost
5. Gradient Boosting
6. Gaussian NB
7. Decision Tree 
8. Random Forest

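Each model is evaluated on the test set with five metrics: accuracy, macro-averaged precision, recall and F1, and one-vs-one multiclass ROC AUC.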

### 1. Logistic Regression

In [87]:
# Fit a logistic regression classifier on the training features (fit() returns the estimator).
logistic_regression_model = LogisticRegression(random_state=0).fit(X_train, y_train_encoded)

# Hard class predictions feed the report and the macro metrics; class probabilities feed the AUC.
logistic_regression_model_y_pred = logistic_regression_model.predict(X_test)
logistic_regression_model_y_pred_prob = logistic_regression_model.predict_proba(X_test)

print(
    classification_report(
        y_test_encoded,
        logistic_regression_model_y_pred,
        target_names=label_encoder.classes_,
    )
)

logistic_regression_accuracy = accuracy_score(y_test_encoded, logistic_regression_model_y_pred)
# Precision, recall and F1 are all macro-averaged over the classes.
logistic_regression_precision, logistic_regression_recall, logistic_regression_f1 = (
    scorer(y_test_encoded, logistic_regression_model_y_pred, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)
# One-vs-one multiclass AUC computed from the predicted class probabilities.
logistic_regression_auc = roc_auc_score(
    y_test_encoded,
    logistic_regression_model_y_pred_prob,
    multi_class="ovo",
)

# The order here must match the labels below and the columns of the summary table.
logistic_regression_metrics = [
    logistic_regression_accuracy,
    logistic_regression_precision,
    logistic_regression_recall,
    logistic_regression_f1,
    logistic_regression_auc,
]
pd.Series(
    logistic_regression_metrics,
    index=["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"],
    name="Logistic Regression",
).to_frame().T


                   precision    recall  f1-score   support

             arts       0.97      0.98      0.97       198
               hr       0.92      0.96      0.94       181
            sales       0.96      0.92      0.94       197
software+engineer       0.98      0.98      0.98       220

         accuracy                           0.96       796
        macro avg       0.96      0.96      0.96       796
     weighted avg       0.96      0.96      0.96       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Logistic Regression,0.958543,0.957405,0.957913,0.957467,0.99492


### 2. SVM ('rbf')

In [88]:
# RBF-kernel SVM; probability=True is required so predict_proba() is available for the AUC.
svm_model = SVC(kernel="rbf", probability=True, random_state=0).fit(X_train, y_train_encoded)

# Hard class predictions feed the report and the macro metrics; class probabilities feed the AUC.
svm_model_y_pred = svm_model.predict(X_test)
svm_model_y_pred_prob = svm_model.predict_proba(X_test)

print(
    classification_report(
        y_test_encoded,
        svm_model_y_pred,
        target_names=label_encoder.classes_,
    )
)

svm_accuracy = accuracy_score(y_test_encoded, svm_model_y_pred)
# Precision, recall and F1 are all macro-averaged over the classes.
svm_precision, svm_recall, svm_f1 = (
    scorer(y_test_encoded, svm_model_y_pred, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)
# One-vs-one multiclass AUC computed from the predicted class probabilities.
svm_auc = roc_auc_score(y_test_encoded, svm_model_y_pred_prob, multi_class="ovo")

# The order here must match the labels below and the columns of the summary table.
svm_metrics = [
    svm_accuracy,
    svm_precision,
    svm_recall,
    svm_f1,
    svm_auc,
]
pd.Series(
    svm_metrics,
    index=["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"],
    name="SVM 'rbf'",
).to_frame().T


                   precision    recall  f1-score   support

             arts       0.98      0.98      0.98       198
               hr       0.94      0.98      0.96       181
            sales       0.98      0.94      0.96       197
software+engineer       0.99      0.99      0.99       220

         accuracy                           0.97       796
        macro avg       0.97      0.97      0.97       796
     weighted avg       0.97      0.97      0.97       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
SVM 'rbf',0.973618,0.972849,0.973431,0.972894,0.997847


### 3. SVM ('linear')

In [89]:
# Linear-kernel SVM; probability=True is required so predict_proba() is available for the AUC.
svm_linear_model = SVC(kernel="linear", probability=True, random_state=0).fit(X_train, y_train_encoded)

# Hard class predictions feed the report and the macro metrics; class probabilities feed the AUC.
svm_linear_model_y_pred = svm_linear_model.predict(X_test)
svm_linear_model_y_pred_prob = svm_linear_model.predict_proba(X_test)

print(
    classification_report(
        y_test_encoded,
        svm_linear_model_y_pred,
        target_names=label_encoder.classes_,
    )
)

svm_linear_accuracy = accuracy_score(y_test_encoded, svm_linear_model_y_pred)
# Precision, recall and F1 are all macro-averaged over the classes.
svm_linear_precision, svm_linear_recall, svm_linear_f1 = (
    scorer(y_test_encoded, svm_linear_model_y_pred, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)
# One-vs-one multiclass AUC computed from the predicted class probabilities.
svm_linear_auc = roc_auc_score(y_test_encoded, svm_linear_model_y_pred_prob, multi_class="ovo")

# The order here must match the labels below and the columns of the summary table.
svm_linear_metrics = [
    svm_linear_accuracy,
    svm_linear_precision,
    svm_linear_recall,
    svm_linear_f1,
    svm_linear_auc,
]
pd.Series(
    svm_linear_metrics,
    index=["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"],
    name="SVM 'linear'",
).to_frame().T


                   precision    recall  f1-score   support

             arts       0.96      0.98      0.97       198
               hr       0.96      0.96      0.96       181
            sales       0.95      0.93      0.94       197
software+engineer       0.98      0.98      0.98       220

         accuracy                           0.96       796
        macro avg       0.96      0.96      0.96       796
     weighted avg       0.96      0.96      0.96       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
SVM 'linear',0.963568,0.963063,0.962857,0.962908,0.994309


### 4. Adaboost

In [90]:
# AdaBoost ensemble with scikit-learn's default settings, seeded for reproducibility.
adaboost_model = AdaBoostClassifier(random_state=0).fit(X_train, y_train_encoded)

# Hard class predictions feed the report and the macro metrics; class probabilities feed the AUC.
adaboost_model_y_pred = adaboost_model.predict(X_test)
adaboost_model_y_pred_prob = adaboost_model.predict_proba(X_test)

print(
    classification_report(
        y_test_encoded,
        adaboost_model_y_pred,
        target_names=label_encoder.classes_,
    )
)

adaboost_accuracy = accuracy_score(y_test_encoded, adaboost_model_y_pred)
# Precision, recall and F1 are all macro-averaged over the classes.
adaboost_precision, adaboost_recall, adaboost_f1 = (
    scorer(y_test_encoded, adaboost_model_y_pred, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)
# One-vs-one multiclass AUC computed from the predicted class probabilities.
adaboost_auc = roc_auc_score(y_test_encoded, adaboost_model_y_pred_prob, multi_class="ovo")

# The order here must match the labels below and the columns of the summary table.
adaboost_metrics = [
    adaboost_accuracy,
    adaboost_precision,
    adaboost_recall,
    adaboost_f1,
    adaboost_auc,
]
pd.Series(
    adaboost_metrics,
    index=["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"],
    name="Adaboost",
).to_frame().T


                   precision    recall  f1-score   support

             arts       0.98      0.75      0.85       198
               hr       0.87      0.92      0.89       181
            sales       0.81      0.91      0.86       197
software+engineer       0.91      0.96      0.94       220

         accuracy                           0.89       796
        macro avg       0.89      0.89      0.88       796
     weighted avg       0.89      0.89      0.89       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Adaboost,0.886935,0.893151,0.885724,0.884672,0.916769


### 5. Gradient Boosting

In [91]:
# Gradient-boosted trees with scikit-learn's default settings, seeded for reproducibility.
gradient_boosting_model = GradientBoostingClassifier(random_state=0).fit(X_train, y_train_encoded)

# Hard class predictions feed the report and the macro metrics; class probabilities feed the AUC.
gradient_boosting_model_y_pred = gradient_boosting_model.predict(X_test)
gradient_boosting_model_y_pred_prob = gradient_boosting_model.predict_proba(X_test)

print(
    classification_report(
        y_test_encoded,
        gradient_boosting_model_y_pred,
        target_names=label_encoder.classes_,
    )
)

gradient_boosting_accuracy = accuracy_score(y_test_encoded, gradient_boosting_model_y_pred)
# Precision, recall and F1 are all macro-averaged over the classes.
gradient_boosting_precision, gradient_boosting_recall, gradient_boosting_f1 = (
    scorer(y_test_encoded, gradient_boosting_model_y_pred, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)
# One-vs-one multiclass AUC computed from the predicted class probabilities.
gradient_boosting_auc = roc_auc_score(
    y_test_encoded,
    gradient_boosting_model_y_pred_prob,
    multi_class="ovo",
)

# The order here must match the labels below and the columns of the summary table.
gradient_boosting_metrics = [
    gradient_boosting_accuracy,
    gradient_boosting_precision,
    gradient_boosting_recall,
    gradient_boosting_f1,
    gradient_boosting_auc,
]
pd.Series(
    gradient_boosting_metrics,
    index=["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"],
    name="Gradient Boosting",
).to_frame().T


                   precision    recall  f1-score   support

             arts       0.98      0.97      0.97       198
               hr       0.93      0.97      0.95       181
            sales       0.95      0.93      0.94       197
software+engineer       0.98      0.97      0.98       220

         accuracy                           0.96       796
        macro avg       0.96      0.96      0.96       796
     weighted avg       0.96      0.96      0.96       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Gradient Boosting,0.961055,0.960171,0.960933,0.960362,0.996753


### 6. Gaussian NB

In [92]:
# Gaussian naive Bayes with scikit-learn's default settings.
gaussiannb_model = GaussianNB().fit(X_train, y_train_encoded)

# Hard class predictions feed the report and the macro metrics; class probabilities feed the AUC.
gaussiannb_model_y_pred = gaussiannb_model.predict(X_test)
gaussiannb_model_y_pred_prob = gaussiannb_model.predict_proba(X_test)

print(
    classification_report(
        y_test_encoded,
        gaussiannb_model_y_pred,
        target_names=label_encoder.classes_,
    )
)

gaussiannb_accuracy = accuracy_score(y_test_encoded, gaussiannb_model_y_pred)
# Precision, recall and F1 are all macro-averaged over the classes.
gaussiannb_precision, gaussiannb_recall, gaussiannb_f1 = (
    scorer(y_test_encoded, gaussiannb_model_y_pred, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)
# One-vs-one multiclass AUC computed from the predicted class probabilities.
gaussiannb_auc = roc_auc_score(y_test_encoded, gaussiannb_model_y_pred_prob, multi_class="ovo")

# The order here must match the labels below and the columns of the summary table.
gaussiannb_metrics = [
    gaussiannb_accuracy,
    gaussiannb_precision,
    gaussiannb_recall,
    gaussiannb_f1,
    gaussiannb_auc,
]
pd.Series(
    gaussiannb_metrics,
    index=["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"],
    name="Gaussian NB",
).to_frame().T


                   precision    recall  f1-score   support

             arts       0.96      0.89      0.93       198
               hr       0.76      0.95      0.85       181
            sales       0.92      0.81      0.86       197
software+engineer       0.97      0.94      0.96       220

         accuracy                           0.90       796
        macro avg       0.90      0.90      0.90       796
     weighted avg       0.91      0.90      0.90       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Gaussian NB,0.898241,0.903481,0.898058,0.896872,0.982774


### 7. Decision Tree

In [93]:
# Single decision tree with scikit-learn's default settings, seeded for reproducibility.
decision_tree_model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train_encoded)

# Hard class predictions feed the report and the macro metrics; class probabilities feed the AUC.
decision_tree_model_y_pred = decision_tree_model.predict(X_test)
decision_tree_model_y_pred_prob = decision_tree_model.predict_proba(X_test)

print(
    classification_report(
        y_test_encoded,
        decision_tree_model_y_pred,
        target_names=label_encoder.classes_,
    )
)

decision_tree_accuracy = accuracy_score(y_test_encoded, decision_tree_model_y_pred)
# Precision, recall and F1 are all macro-averaged over the classes.
decision_tree_precision, decision_tree_recall, decision_tree_f1 = (
    scorer(y_test_encoded, decision_tree_model_y_pred, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)
# One-vs-one multiclass AUC computed from the predicted class probabilities.
decision_tree_auc = roc_auc_score(y_test_encoded, decision_tree_model_y_pred_prob, multi_class="ovo")

# The order here must match the labels below and the columns of the summary table.
decision_tree_metrics = [
    decision_tree_accuracy,
    decision_tree_precision,
    decision_tree_recall,
    decision_tree_f1,
    decision_tree_auc,
]
pd.Series(
    decision_tree_metrics,
    index=["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"],
    name="Decision Tree",
).to_frame().T


                   precision    recall  f1-score   support

             arts       0.96      0.89      0.92       198
               hr       0.79      0.91      0.85       181
            sales       0.91      0.84      0.87       197
software+engineer       0.93      0.93      0.93       220

         accuracy                           0.89       796
        macro avg       0.90      0.89      0.89       796
     weighted avg       0.90      0.89      0.90       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Decision Tree,0.894472,0.896359,0.893731,0.893289,0.935528


### 8. Random Forest

In [94]:
# Random forest with scikit-learn's default settings, seeded for reproducibility.
random_forest_model = RandomForestClassifier(random_state=0).fit(X_train, y_train_encoded)

# Hard class predictions feed the report and the macro metrics; class probabilities feed the AUC.
random_forest_model_y_pred = random_forest_model.predict(X_test)
random_forest_model_y_pred_prob = random_forest_model.predict_proba(X_test)

print(
    classification_report(
        y_test_encoded,
        random_forest_model_y_pred,
        target_names=label_encoder.classes_,
    )
)

random_forest_accuracy = accuracy_score(y_test_encoded, random_forest_model_y_pred)
# Precision, recall and F1 are all macro-averaged over the classes.
random_forest_precision, random_forest_recall, random_forest_f1 = (
    scorer(y_test_encoded, random_forest_model_y_pred, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)
# One-vs-one multiclass AUC computed from the predicted class probabilities.
random_forest_auc = roc_auc_score(y_test_encoded, random_forest_model_y_pred_prob, multi_class="ovo")

# The order here must match the labels below and the columns of the summary table.
random_forest_metrics = [
    random_forest_accuracy,
    random_forest_precision,
    random_forest_recall,
    random_forest_f1,
    random_forest_auc,
]
pd.Series(
    random_forest_metrics,
    index=["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"],
    name="Random Forest",
).to_frame().T


                   precision    recall  f1-score   support

             arts       0.99      0.98      0.99       198
               hr       0.94      0.97      0.96       181
            sales       0.96      0.94      0.95       197
software+engineer       0.99      1.00      0.99       220

         accuracy                           0.97       796
        macro avg       0.97      0.97      0.97       796
     weighted avg       0.97      0.97      0.97       796



Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Random Forest,0.973618,0.972682,0.972948,0.972711,0.997699


# Models Summary

In [95]:
# Pair each model's display name with the metric list built in its own section,
# so row labels and row values cannot drift out of order.
metrics_by_model = {
    "Logistic Regression": logistic_regression_metrics,
    "SVM 'rbf'": svm_metrics,
    "SVM 'linear'": svm_linear_metrics,
    "Adaboost": adaboost_metrics,
    "Gradient Boosting": gradient_boosting_metrics,
    "Gaussian NB": gaussiannb_metrics,
    "Decision Tree": decision_tree_metrics,
    "Random Forest": random_forest_metrics,
}

# One row per model, one column per metric (same metric order as the per-model lists).
all_metrics = list(metrics_by_model.values())
metrics_summary = pd.DataFrame(
    all_metrics,
    index=list(metrics_by_model),
    columns=["Accuracy", "Precision", "Recall", "F1-Measure", "AUC"],
)
metrics_summary

Unnamed: 0,Accuracy,Precision,Recall,F1-Measure,AUC
Logistic Regression,0.958543,0.957405,0.957913,0.957467,0.99492
SVM 'rbf',0.973618,0.972849,0.973431,0.972894,0.997847
SVM 'linear',0.963568,0.963063,0.962857,0.962908,0.994309
Adaboost,0.886935,0.893151,0.885724,0.884672,0.916769
Gradient Boosting,0.961055,0.960171,0.960933,0.960362,0.996753
Gaussian NB,0.898241,0.903481,0.898058,0.896872,0.982774
Decision Tree,0.894472,0.896359,0.893731,0.893289,0.935528
Random Forest,0.973618,0.972682,0.972948,0.972711,0.997699
