# Imports

In [1]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import datasets, svm, tree, metrics
from sklearn.cross_decomposition import CCA
from sklearn import preprocessing
import matplotlib.pyplot as plt
from pandas import DataFrame
import numpy as np
import itertools
import random
import json
import sys
import os.path

# Load data set into script

In [156]:
with open('input_data.json') as data:
    jdata = json.load(data)

# Extract Data

In [4]:
dictionary = []
rows = []
index = []
for problem in jdata:
    for submissions in problem['Submissions']:
        for submission in submissions:
            code = submissions[submission]['Code'][0]
            dictionary.append(code)
            if problem['tags'] != []:
                rows.append({'code':code, 'labels':problem['tags']})
                index.append(submission)

data = DataFrame(rows, index=index)

# Split data into a training and testing set

In [25]:
vectorizer = CountVectorizer()
labels_vect = CountVectorizer()

BoW = vectorizer.fit_transform(dictionary)

features = vectorizer.transform(data['code'].values)
targets = data['labels'].values

# create traing and testing data
seed = 42
data_train, data_test = train_test_split(data,test_size=0.20, random_state=seed)
data_train.to_csv('out.csv')
x_train = vectorizer.transform(data_train['code'].values)
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(data_train['labels'].values)
x_test = vectorizer.transform(data_test['code'].values)
y_test = MultiLabelBinarizer().fit_transform(data_test['labels'].values)

['binary search' 'brute force' 'constructive algorithms' 'data structures'
 'dfs and similar' 'dp' 'games' 'geometry' 'graphs' 'greedy'
 'implementation' 'math' 'number theory' 'sortings' 'strings'
 'two pointers']


# Create Machine Learning Algorithms

In [19]:
svm_classifier = OneVsRestClassifier(svm.SVC(kernel='rbf', class_weight='balanced', C=1, gamma=1))
dt_classifier = tree.DecisionTreeClassifier(class_weight='balanced')
rf_classifier = RandomForestClassifier(n_estimators=10,class_weight='balanced')
nn_classifier = MLPClassifier(hidden_layer_sizes=(100,70),max_iter=500)

In [105]:
x_train.shape

(2913, 3129)

In [120]:
y_train.shape

(2913, 16)

In [153]:
svm_classifier.fit(x_train,y_train)
dt_classifier.fit(x_train,y_train)
rf_classifier.fit(x_train,y_train)
nn_classifier.fit(x_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 70), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [154]:
score = svm_classifier.score(x_test, y_test)
print('SVM score: ' + str(score))
score = dt_classifier.score(x_test, y_test)
print('DT score: ' + str(score))
score = rf_classifier.score(x_test, y_test)
print('RF score: ' + str(score))
score = nn_classifier.score(x_test, y_test)
print('NN score: ' + str(score))

SVM score: 0.5775034293552812
DT score: 0.7901234567901234
RF score: 0.6941015089163237
NN score: 0.6982167352537723


In [155]:
svm_y_pred = svm_classifier.predict(x_test)
avg = 0
for i in range(y_test.shape[1]):
    print('-------------------------------')
    print(mlb.classes_[i])
    display(metrics.confusion_matrix(y_test[:,i], svm_y_pred[:,i]))
    precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(y_test[:,i], svm_y_pred[:,i],average='binary')
    avg = avg + fbeta_score
    print('precision: ' + str(precision))
    print('recall: ' + str(recall))
    print('fbeta_score' + str(fbeta_score))
print('average f1: ' + str(avg/y_test.shape[1]))

-------------------------------
binary search


array([[720,   0],
       [  8,   1]])

precision: 1.0
recall: 0.1111111111111111
fbeta_score0.19999999999999998
-------------------------------
brute force


array([[649,   0],
       [ 57,  23]])

precision: 1.0
recall: 0.2875
fbeta_score0.44660194174757273
-------------------------------
constructive algorithms


array([[696,   0],
       [ 29,   4]])

precision: 1.0
recall: 0.12121212121212122
fbeta_score0.21621621621621626
-------------------------------
data structures


array([[724,   0],
       [  5,   0]])

precision: 0.0
recall: 0.0
fbeta_score0.0
-------------------------------
dfs and similar


  'precision', 'predicted', average, warn_for)


array([[718,   0],
       [ 11,   0]])

precision: 0.0
recall: 0.0
fbeta_score0.0
-------------------------------
dp


array([[726,   0],
       [  3,   0]])

precision: 0.0
recall: 0.0
fbeta_score0.0
-------------------------------
games


array([[696,   0],
       [  4,  29]])

precision: 1.0
recall: 0.8787878787878788
fbeta_score0.9354838709677419
-------------------------------
geometry


array([[723,   0],
       [  5,   1]])

precision: 1.0
recall: 0.16666666666666666
fbeta_score0.2857142857142857
-------------------------------
graphs


array([[720,   0],
       [  9,   0]])

precision: 0.0
recall: 0.0
fbeta_score0.0
-------------------------------
greedy


array([[605,   0],
       [113,  11]])

precision: 1.0
recall: 0.08870967741935484
fbeta_score0.16296296296296298
-------------------------------
implementation


array([[150, 176],
       [  2, 401]])

precision: 0.6949740034662045
recall: 0.9950372208436724
fbeta_score0.8183673469387754
-------------------------------
math


array([[516,   1],
       [101, 111]])

precision: 0.9910714285714286
recall: 0.5235849056603774
fbeta_score0.6851851851851852
-------------------------------
number theory


array([[722,   0],
       [  6,   1]])

precision: 1.0
recall: 0.14285714285714285
fbeta_score0.25
-------------------------------
sortings


array([[699,   0],
       [ 26,   4]])

precision: 1.0
recall: 0.13333333333333333
fbeta_score0.23529411764705882
-------------------------------
strings


array([[675,   0],
       [ 53,   1]])

precision: 1.0
recall: 0.018518518518518517
fbeta_score0.03636363636363636
-------------------------------
two pointers


array([[717,   0],
       [ 11,   1]])

precision: 1.0
recall: 0.08333333333333333
fbeta_score0.15384615384615385
average f1: 0.27662723234934933


In [150]:
dt_y_pred = dt_classifier.predict(x_test)
avg = 0
for i in range(y_test.shape[1]):
    print('-------------------------------')
    print(mlb.classes_[i])
    display(metrics.confusion_matrix(y_test[:,i], dt_y_pred[:,i]))
    precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(y_test[:,i], dt_y_pred[:,i],average='binary')
    avg = avg + fbeta_score
    print('precision: ' + str(precision))
    print('recall: ' + str(recall))
    print('fbeta_score' + str(fbeta_score))
print('average f1: ' + str(avg/y_test.shape[1]))

-------------------------------
binary search


array([[720,   0],
       [  3,   6]])

precision: 1.0
recall: 0.6666666666666666
fbeta_score0.8
-------------------------------
brute force


array([[625,  24],
       [ 11,  69]])

precision: 0.7419354838709677
recall: 0.8625
fbeta_score0.7976878612716763
-------------------------------
constructive algorithms


array([[690,   6],
       [  9,  24]])

precision: 0.8
recall: 0.7272727272727273
fbeta_score0.761904761904762
-------------------------------
data structures


array([[718,   6],
       [  2,   3]])

precision: 0.3333333333333333
recall: 0.6
fbeta_score0.42857142857142855
-------------------------------
dfs and similar


array([[717,   1],
       [  2,   9]])

precision: 0.9
recall: 0.8181818181818182
fbeta_score0.8571428571428572
-------------------------------
dp


array([[720,   6],
       [  2,   1]])

precision: 0.14285714285714285
recall: 0.3333333333333333
fbeta_score0.2
-------------------------------
games


array([[694,   2],
       [  0,  33]])

precision: 0.9428571428571428
recall: 1.0
fbeta_score0.9705882352941176
-------------------------------
geometry


array([[722,   1],
       [  1,   5]])

precision: 0.8333333333333334
recall: 0.8333333333333334
fbeta_score0.8333333333333334
-------------------------------
graphs


array([[719,   1],
       [  2,   7]])

precision: 0.875
recall: 0.7777777777777778
fbeta_score0.823529411764706
-------------------------------
greedy


array([[575,  30],
       [ 39,  85]])

precision: 0.7391304347826086
recall: 0.6854838709677419
fbeta_score0.7112970711297072
-------------------------------
implementation


array([[285,  41],
       [ 46, 357]])

precision: 0.8969849246231156
recall: 0.8858560794044665
fbeta_score0.8913857677902622
-------------------------------
math


array([[479,  38],
       [ 14, 198]])

precision: 0.8389830508474576
recall: 0.9339622641509434
fbeta_score0.8839285714285714
-------------------------------
number theory


array([[718,   4],
       [  1,   6]])

precision: 0.6
recall: 0.8571428571428571
fbeta_score0.7058823529411764
-------------------------------
sortings


array([[687,  12],
       [ 10,  20]])

precision: 0.625
recall: 0.6666666666666666
fbeta_score0.6451612903225806
-------------------------------
strings


array([[655,  20],
       [ 34,  20]])

precision: 0.5
recall: 0.37037037037037035
fbeta_score0.425531914893617
-------------------------------
two pointers


array([[712,   5],
       [  4,   8]])

precision: 0.6153846153846154
recall: 0.6666666666666666
fbeta_score0.64
average f1: 0.7109965536117996


In [151]:
rf_y_pred = rf_classifier.predict(x_test)
avg = 0
for i in range(y_test.shape[1]):
    print('-------------------------------')
    print(mlb.classes_[i])
    display(metrics.confusion_matrix(y_test[:,i], rf_y_pred[:,i]))
    precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(y_test[:,i], rf_y_pred[:,i],average='binary')
    avg = avg + fbeta_score
    print('precision: ' + str(precision))
    print('recall: ' + str(recall))
    print('fbeta_score' + str(fbeta_score))
print('average f1: ' + str(avg/y_test.shape[1]))

-------------------------------
binary search


array([[720,   0],
       [  8,   1]])

precision: 1.0
recall: 0.1111111111111111
fbeta_score0.19999999999999998
-------------------------------
brute force


array([[647,   2],
       [ 24,  56]])

precision: 0.9655172413793104
recall: 0.7
fbeta_score0.8115942028985508
-------------------------------
constructive algorithms


array([[696,   0],
       [ 19,  14]])

precision: 1.0
recall: 0.42424242424242425
fbeta_score0.5957446808510638
-------------------------------
data structures


array([[724,   0],
       [  5,   0]])

precision: 0.0
recall: 0.0
fbeta_score0.0
-------------------------------
dfs and similar


  'precision', 'predicted', average, warn_for)


array([[718,   0],
       [  3,   8]])

precision: 1.0
recall: 0.7272727272727273
fbeta_score0.8421052631578948
-------------------------------
dp


array([[726,   0],
       [  3,   0]])

precision: 0.0
recall: 0.0
fbeta_score0.0
-------------------------------
games


array([[696,   0],
       [  2,  31]])

precision: 1.0
recall: 0.9393939393939394
fbeta_score0.96875
-------------------------------
geometry


array([[723,   0],
       [  3,   3]])

precision: 1.0
recall: 0.5
fbeta_score0.6666666666666666
-------------------------------
graphs


array([[720,   0],
       [  1,   8]])

precision: 1.0
recall: 0.8888888888888888
fbeta_score0.9411764705882353
-------------------------------
greedy


array([[600,   5],
       [ 68,  56]])

precision: 0.9180327868852459
recall: 0.45161290322580644
fbeta_score0.6054054054054054
-------------------------------
implementation


array([[280,  46],
       [ 39, 364]])

precision: 0.8878048780487805
recall: 0.9032258064516129
fbeta_score0.8954489544895449
-------------------------------
math


array([[512,   5],
       [ 38, 174]])

precision: 0.9720670391061452
recall: 0.8207547169811321
fbeta_score0.8900255754475704
-------------------------------
number theory


array([[722,   0],
       [  3,   4]])

precision: 1.0
recall: 0.5714285714285714
fbeta_score0.7272727272727273
-------------------------------
sortings


array([[698,   1],
       [ 17,  13]])

precision: 0.9285714285714286
recall: 0.43333333333333335
fbeta_score0.5909090909090909
-------------------------------
strings


array([[674,   1],
       [ 47,   7]])

precision: 0.875
recall: 0.12962962962962962
fbeta_score0.22580645161290322
-------------------------------
two pointers


array([[717,   0],
       [ 11,   1]])

precision: 1.0
recall: 0.08333333333333333
fbeta_score0.15384615384615385
average f1: 0.569671977696613


In [152]:
nn_y_pred = nn_classifier.predict(x_test)
avg = 0
for i in range(y_test.shape[1]):
    print('-------------------------------')
    print(mlb.classes_[i])
    display(metrics.confusion_matrix(y_test[:,i], nn_y_pred[:,i]))
    precision, recall, fbeta_score, support = metrics.precision_recall_fscore_support(y_test[:,i], nn_y_pred[:,i],average='binary')
    avg = avg + fbeta_score
    print('precision: ' + str(precision))
    print('recall: ' + str(recall))
    print('fbeta_score' + str(fbeta_score))
print('average f1: ' + str(avg/y_test.shape[1]))

-------------------------------
binary search


array([[718,   2],
       [  5,   4]])

precision: 0.6666666666666666
recall: 0.4444444444444444
fbeta_score0.5333333333333333
-------------------------------
brute force


array([[629,  20],
       [ 21,  59]])

precision: 0.7468354430379747
recall: 0.7375
fbeta_score0.7421383647798743
-------------------------------
constructive algorithms


array([[686,  10],
       [ 12,  21]])

precision: 0.6774193548387096
recall: 0.6363636363636364
fbeta_score0.65625
-------------------------------
data structures


array([[724,   0],
       [  3,   2]])

precision: 1.0
recall: 0.4
fbeta_score0.5714285714285715
-------------------------------
dfs and similar


array([[717,   1],
       [  3,   8]])

precision: 0.8888888888888888
recall: 0.7272727272727273
fbeta_score0.7999999999999999
-------------------------------
dp


array([[726,   0],
       [  3,   0]])

precision: 0.0
recall: 0.0
fbeta_score0.0
-------------------------------
games


  'precision', 'predicted', average, warn_for)


array([[696,   0],
       [  1,  32]])

precision: 1.0
recall: 0.9696969696969697
fbeta_score0.9846153846153847
-------------------------------
geometry


array([[723,   0],
       [  3,   3]])

precision: 1.0
recall: 0.5
fbeta_score0.6666666666666666
-------------------------------
graphs


array([[720,   0],
       [  3,   6]])

precision: 1.0
recall: 0.6666666666666666
fbeta_score0.8
-------------------------------
greedy


array([[575,  30],
       [ 56,  68]])

precision: 0.6938775510204082
recall: 0.5483870967741935
fbeta_score0.6126126126126127
-------------------------------
implementation


array([[256,  70],
       [ 47, 356]])

precision: 0.8356807511737089
recall: 0.8833746898263027
fbeta_score0.8588661037394452
-------------------------------
math


array([[491,  26],
       [ 34, 178]])

precision: 0.8725490196078431
recall: 0.839622641509434
fbeta_score0.8557692307692308
-------------------------------
number theory


array([[719,   3],
       [  1,   6]])

precision: 0.6666666666666666
recall: 0.8571428571428571
fbeta_score0.75
-------------------------------
sortings


array([[688,  11],
       [  8,  22]])

precision: 0.6666666666666666
recall: 0.7333333333333333
fbeta_score0.6984126984126984
-------------------------------
strings


array([[664,  11],
       [ 24,  30]])

precision: 0.7317073170731707
recall: 0.5555555555555556
fbeta_score0.631578947368421
-------------------------------
two pointers


array([[716,   1],
       [  8,   4]])

precision: 0.8
recall: 0.3333333333333333
fbeta_score0.47058823529411764
average f1: 0.6645162593137723
