In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [16]:
data = pd.read_csv('data.csv')
all_y = data['SOC Job Title']
labs, counts = np.unique(all_y, return_counts=True)

In [17]:
flt_labs = labs[counts > 25]
flt_data = data[np.isin(all_y,flt_labs)]

In [22]:
X = flt_data.iloc[:,:-1].values
y = flt_data.iloc[:,-1].values

In [37]:
trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.3)
trainX.shape, trainY.shape, testX.shape, testY.shape

((798, 512), (798,), (343, 512), (343,))

<h3>Random Forest</h3>

In [91]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = [{
    'n_estimators' : [20, 50, 100],
    'min_samples_split' : [10, 20, 30],
    'min_samples_leaf' : [10, 20, 30]
}]

rf = RandomForestClassifier(class_weight="balanced_subsample")

grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', return_train_score=True)

grid_search.fit(trainX,trainY)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(class_weight='balanced_subsample'),
             param_grid=[{'min_samples_leaf': [10, 20, 30],
                          'min_samples_split': [10, 20, 30],
                          'n_estimators': [20, 50, 100]}],
             return_train_score=True, scoring='accuracy')

In [92]:
grid_search.best_score_

0.675503144654088

In [93]:
best_rf = grid_search.best_estimator_
best_rf.score(testX, testY)

0.6676384839650146

In [89]:
predY_test = best_rf.predict(testX)
np.unique(predY_test, return_counts=True)

(array([ 3,  4,  5,  6,  7,  9, 11, 15, 16, 18, 19, 24, 25, 26]),
 array([19, 11, 12, 22, 18, 46,  7, 18,  5, 79, 36, 22, 43,  5]))

In [90]:
np.unique(testY, return_counts=True)

(array([ 3,  4,  5,  6,  7,  9, 11, 15, 16, 18, 19, 24, 25, 26]),
 array([23, 15, 10, 22, 17, 41,  7, 10,  6, 97, 35, 12, 41,  7]))

In [71]:
##f1
from sklearn.metrics import f1_score

f1_score(testY, predY_test, average='macro')

0.6653536298128918

<h3>Neural Network</h3>

In [50]:
from sklearn.neural_network import MLPClassifier

n_features = trainX.shape[0]

param_grid = [{
    'hidden_layer_sizes' : [[n_features,n_features],                       #two hidden layer with n_features neurons
                            [n_features,n_features,n_features],            #three hidden layer with n_features neurons 
                            [n_features//2,n_features//2],                 #two hidden layer with n_features/2 neurons
                            [n_features//2,n_features//2,n_features//2],   #three hidden layer with n_features/2 neurons
                            [n_features*2,n_features*2],                   #two hidden layer with n_features*2 neurons
                            [n_features*2,n_features*2,n_features*2]],     #three hidden layer with n_features*2 neurons
    'alpha' : [0.001, 0.01, 0.1, 1, 10]                                    #regularization terms
}]

mlp = MLPClassifier(max_iter=1000)

grid_search = GridSearchCV(mlp, param_grid, cv=3, scoring='accuracy', return_train_score=True)

grid_search.fit(trainX,trainY)

GridSearchCV(cv=3, estimator=MLPClassifier(max_iter=1000),
             param_grid=[{'alpha': [0.001, 0.01, 0.1, 1, 10],
                          'hidden_layer_sizes': [[798, 798], [798, 798, 798],
                                                 [399, 399], [399, 399, 399],
                                                 [1596, 1596],
                                                 [1596, 1596, 1596]]}],
             return_train_score=True, scoring='accuracy')

In [51]:
grid_search.best_score_

0.7167919799498746

In [52]:
best_mlp = grid_search.best_estimator_
best_mlp.score(testX, testY)

0.7026239067055393

In [72]:
predY_test = best_mlp.predict(testX)
np.unique(predY_test, return_counts=True)

(array([ 3,  4,  5,  6,  7,  9, 11, 15, 16, 18, 19, 24, 25, 26]),
 array([ 21,  11,   8,  15,  14,  36,   6,   7,   5, 124,  34,  10,  48,
          4]))

In [73]:
f1_score(testY, predY_test, average='macro')

0.6906096762949474

<h3>SVM</h3>

In [79]:
from sklearn.svm import SVC

svc = SVC()

param_grid = [{
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel' : ['poly','rbf'],
    'degree' : [2, 3],
    'gamma' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}]

grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', return_train_score=True)

grid_search.fit(trainX,trainY)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid=[{'C': [0.01, 0.1, 1, 10, 100], 'degree': [2, 3],
                          'gamma': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'kernel': ['poly', 'rbf']}],
             return_train_score=True, scoring='accuracy')

In [80]:
grid_search.best_score_

0.7293867924528302

In [81]:
best_svm1 = grid_search.best_estimator_
best_svm1.score(testX, testY)

0.7055393586005831

In [82]:
predY_test = best_svm1.predict(testX)
np.unique(predY_test, return_counts=True)

(array([ 3,  4,  5,  6,  7,  9, 11, 15, 16, 18, 19, 24, 25, 26]),
 array([ 24,  10,   7,  18,  15,  46,   7,   9,   5, 114,  31,  13,  39,
          5]))

In [83]:
f1_score(testY, predY_test, average='macro')

0.731279759119256

<h4>SVM with Weighted Labels</h4>

In [62]:
svc = SVC(class_weight="balanced")

param_grid = [{
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel' : ['poly','rbf'],
    'degree' : [2, 3],
    'gamma' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}]

grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', return_train_score=True)

grid_search.fit(trainX,trainY)

GridSearchCV(cv=5, estimator=SVC(class_weight='balanced'),
             param_grid=[{'C': [0.01, 0.1, 1, 10, 100], 'degree': [2, 3],
                          'gamma': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'kernel': ['poly', 'rbf']}],
             return_train_score=True, scoring='accuracy')

In [63]:
grid_search.best_score_

0.7281446540880503

In [76]:
best_svm2 = grid_search.best_estimator_
best_svm2.score(testX, testY)

0.6967930029154519

In [77]:
predY_test = best_svm2.predict(testX)
np.unique(predY_test, return_counts=True)

(array([ 3,  4,  5,  6,  7,  9, 11, 15, 16, 18, 19, 24, 25, 26]),
 array([ 23,  10,   7,  20,  15,  45,   7,   9,   5, 108,  32,  17,  40,
          5]))

In [78]:
f1_score(testY, predY_test, average='macro')

0.7205669605786553