In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [4]:
data = pd.read_csv('data.csv')
all_y = data['SOC Job Title'] #Every Category
labs, counts = np.unique(all_y, return_counts=True) 
#Bins and the number of occurances of the bins
labs, counts 

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]),
 array([  3,  22,   5,  89,  31,  37,  77,  55,   3, 128,  17,  27,  12,
          5,  15,  36,  28,   6, 332, 100,   8,   6,  16,  18,  32, 132,
         37,  22]))

In [7]:
flt_labs = labs[counts > 25] #If bin is used more than 25 times
flt_data = data[np.isin(all_y,flt_labs)] #Data of qualifying rows

array([ 3,  4,  5,  6,  7,  9, 11, 15, 16, 18, 19, 24, 25, 26])

In [10]:
X = flt_data.iloc[:,:-1].values #Data
y = flt_data.iloc[:,-1].values #Bin number

In [13]:
trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.3)
trainX.shape, trainY.shape, testX.shape, testY.shape

((798, 512), (798,), (343, 512), (343,))

<h3>Random Forest</h3>

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = [{
    'n_estimators' : [20, 50, 100],
    'min_samples_split' : [10, 20, 30],
    'min_samples_leaf' : [10, 20, 30]
}]

rf = RandomForestClassifier(class_weight="balanced_subsample")

grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', return_train_score=True)

grid_search.fit(trainX,trainY)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(class_weight='balanced_subsample'),
             param_grid=[{'min_samples_leaf': [10, 20, 30],
                          'min_samples_split': [10, 20, 30],
                          'n_estimators': [20, 50, 100]}],
             return_train_score=True, scoring='accuracy')

In [19]:
grid_search.best_score_

0.6515959119496856

In [20]:
best_rf = grid_search.best_estimator_
best_rf.score(testX, testY)

0.685131195335277

In [None]:
##f1
from sklearn.metrics import f1_score

f1_score(testY, predY_test, average='macro')

<h3>Neural Network</h3>

In [50]:
from sklearn.neural_network import MLPClassifier

n_features = trainX.shape[0]

param_grid = [{
    'hidden_layer_sizes' : [[n_features,n_features],                       #two hidden layer with n_features neurons
                            [n_features,n_features,n_features],            #three hidden layer with n_features neurons 
                            [n_features//2,n_features//2],                 #two hidden layer with n_features/2 neurons
                            [n_features//2,n_features//2,n_features//2],   #three hidden layer with n_features/2 neurons
                            [n_features*2,n_features*2],                   #two hidden layer with n_features*2 neurons
                            [n_features*2,n_features*2,n_features*2]],     #three hidden layer with n_features*2 neurons
    'alpha' : [0.001, 0.01, 0.1, 1, 10]                                    #regularization terms
}]

mlp = MLPClassifier(max_iter=1000)

grid_search = GridSearchCV(mlp, param_grid, cv=3, scoring='accuracy', return_train_score=True)

grid_search.fit(trainX,trainY)

GridSearchCV(cv=3, estimator=MLPClassifier(max_iter=1000),
             param_grid=[{'alpha': [0.001, 0.01, 0.1, 1, 10],
                          'hidden_layer_sizes': [[798, 798], [798, 798, 798],
                                                 [399, 399], [399, 399, 399],
                                                 [1596, 1596],
                                                 [1596, 1596, 1596]]}],
             return_train_score=True, scoring='accuracy')

In [51]:
grid_search.best_score_

0.7167919799498746

In [52]:
best_mlp = grid_search.best_estimator_
best_mlp.score(testX, testY)

0.7026239067055393

In [73]:
f1_score(testY, predY_test, average='macro')

0.6906096762949474

<h3>SVM</h3>

In [79]:
from sklearn.svm import SVC

svc = SVC()

param_grid = [{
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel' : ['poly','rbf'],
    'degree' : [2, 3],
    'gamma' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}]

grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', return_train_score=True)

grid_search.fit(trainX,trainY)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid=[{'C': [0.01, 0.1, 1, 10, 100], 'degree': [2, 3],
                          'gamma': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'kernel': ['poly', 'rbf']}],
             return_train_score=True, scoring='accuracy')

In [80]:
grid_search.best_score_

0.7293867924528302

In [81]:
best_svm1 = grid_search.best_estimator_
best_svm1.score(testX, testY)

0.7055393586005831

In [83]:
f1_score(testY, predY_test, average='macro')

0.731279759119256

<h4>SVM with Weighted Labels</h4>

In [62]:
svc = SVC(class_weight="balanced")

param_grid = [{
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel' : ['poly','rbf'],
    'degree' : [2, 3],
    'gamma' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}]

grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', return_train_score=True)

grid_search.fit(trainX,trainY)

GridSearchCV(cv=5, estimator=SVC(class_weight='balanced'),
             param_grid=[{'C': [0.01, 0.1, 1, 10, 100], 'degree': [2, 3],
                          'gamma': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'kernel': ['poly', 'rbf']}],
             return_train_score=True, scoring='accuracy')

In [63]:
grid_search.best_score_

0.7281446540880503

In [76]:
best_svm2 = grid_search.best_estimator_
best_svm2.score(testX, testY)

0.6967930029154519

In [78]:
f1_score(testY, predY_test, average='macro')

0.7205669605786553