# Imports

In [1]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import datasets, svm, tree, metrics
from sklearn.cluster import KMeans
from sklearn import preprocessing
import matplotlib.pyplot as plt
from pandas import DataFrame
from pathlib import Path
import collections
import numpy as np
import subprocess
import itertools
import os.path
import random
import time
import json
import sys
import re

# Load Data

In [2]:
with open('input_data.json') as data:
    jdata = json.load(data)

# Clean the Data

In [3]:
def codeToList(code):
    clean_string = ' '.join(code.split())
    the_list = re.split(' |\r\n',clean_string)
    return the_list





# Extract the Vectors 

In [4]:

samples = []
labels = []
index = []
for problem in jdata:
    for submissions in problem['Submissions']:
        for submission in submissions:
            code = submissions[submission]['Code'][0]
            
            
            samples.append( codeToList(code))
            labels.append(str(problem['contestId']) + problem['index'])
            
            index.append(submission)

print(samples)



# Train the NLP Model

In [5]:
%%time
model = Word2Vec(samples, size=100, window=2, min_count=1, workers=4, iter=2)


CPU times: user 937 ms, sys: 6.66 ms, total: 943 ms
Wall time: 551 ms


In [6]:
for sample in samples:
    vector = model.wv[sample]


# Create a List of Samples

In [7]:
vectorSamples = []
for sentence in samples:
    accumulatedVector = np.zeros(100)
    count =0
    for word in sentence:
        acumulatedVector = model.wv[word] + accumulatedVector  
        count=count+1
    
    vectorSamples.append(acumulatedVector/count)

    

# Split the Data

In [8]:
samples_train, samples_test, labels_train, labels_test = train_test_split(
    vectorSamples, labels, test_size=0.33, random_state=42)


labels_train

samples_test

labels_test

samples_train


[array([ 0.01044037,  0.00531034, -0.00620627, -0.0014901 ,  0.009351  ,
        -0.00387529,  0.01213299, -0.00895408,  0.00500612,  0.00679489,
        -0.0045384 ,  0.00653763, -0.0211406 , -0.01814213,  0.01720517,
        -0.00171246,  0.00224558,  0.00037911,  0.0125823 ,  0.00925939,
         0.00289022,  0.0076284 , -0.01412735, -0.00371629,  0.00354888,
        -0.00607819, -0.00987454, -0.01626144, -0.00765359, -0.00931231,
        -0.02960304,  0.00833433,  0.00987673,  0.00657671, -0.01705932,
         0.01369909, -0.00669389, -0.00108935,  0.0318217 , -0.00530599,
         0.01863231, -0.01511952,  0.0254228 , -0.00579325, -0.00439229,
        -0.0025933 ,  0.00683951, -0.01169354, -0.00193754,  0.00491735,
         0.02311845, -0.01016538, -0.02720845,  0.00276858, -0.00785022,
         0.01684267, -0.00096572,  0.00844589, -0.01228949,  0.0003408 ,
         0.00279587, -0.01540324, -0.0120295 ,  0.00819385,  0.00684018,
        -0.0229044 , -0.01056582,  0.00754064, -0.0

# Create the Machine Learning Algorithms

In [9]:
svm_classifier = svm.SVC(kernel='linear',class_weight='balanced')


dt_classifier = tree.DecisionTreeClassifier(class_weight='balanced')


rf_classifier = RandomForestClassifier(n_estimators=10,class_weight='balanced')


nn_classifier = MLPClassifier(hidden_layer_sizes=(100,70),max_iter=500)



# Scoring with vectorSamples and labels

In [10]:
svm_scores = cross_val_score(svm_classifier, vectorSamples, labels, cv=5)
print('SVM scores: ' + str(svm_scores))
svm_average = sum(svm_scores)/5
print('SVM scores average: ' + str(svm_average))

dt_scores = cross_val_score(dt_classifier, vectorSamples, labels, cv=5)
print('DT scores:  ' + str(dt_scores))
dt_average = sum(dt_scores)/5
print('DT scores average: ' + str(dt_average))

rf_scores = cross_val_score(rf_classifier, vectorSamples, labels, cv=5)
print('RF scores:  ' + str(rf_scores))
rf_average = sum(rf_scores)/5
print('RF scores average: ' + str(rf_average))


nn_scores = cross_val_score(nn_classifier, vectorSamples, labels, cv=5)
print('NN scores:  ' + str(nn_scores))
nn_average = sum(nn_scores)/5
print('NN scores average: ' + str(nn_average))

SVM scores: [0.08427673 0.08556833 0.05398458 0.07591623 0.08454425]
SVM scores average: 0.07685802326842286
DT scores:  [0.06918239 0.0651341  0.08868895 0.08507853 0.06869221]
DT scores average: 0.07535523513548423
RF scores:  [0.07798742 0.07024266 0.0848329  0.08638743 0.07265522]
RF scores average: 0.07842112704762946
NN scores:  [0.11194969 0.09578544 0.10282776 0.10863874 0.10303831]
NN scores average: 0.10444798844283736


# Scoring with samples_test and labels_test

In [11]:
svm_scores = cross_val_score(svm_classifier, samples_test, labels_test, cv=5)
print('SVM scores: ' + str(svm_scores))
svm_average = sum(svm_scores)/5
print('SVM scores average: ' + str(svm_average))

dt_scores = cross_val_score(dt_classifier, samples_test, labels_test, cv=5)
print('DT scores:  ' + str(dt_scores))
dt_average = sum(dt_scores)/5
print('DT scores average: ' + str(dt_average))

rf_scores = cross_val_score(rf_classifier, samples_test, labels_test, cv=5)
print('RF scores:  ' + str(rf_scores))
rf_average = sum(rf_scores)/5
print('RF scores average: ' + str(rf_average))

nn_scores = cross_val_score(nn_classifier, samples_test, labels_test, cv=5)
print('NN scores:  ' + str(nn_scores))
nn_average = sum(nn_scores)/5
print('NN scores average: ' + str(nn_average))



SVM scores: [0.04347826 0.04135338 0.04247104 0.04878049 0.06866953]
SVM scores average: 0.04895054050022561
DT scores:  [0.0615942  0.05263158 0.05405405 0.03658537 0.03433476]
DT scores average: 0.04783999314042592




RF scores:  [0.06884058 0.05639098 0.04247104 0.03252033 0.04291845]
RF scores average: 0.048628275952734154




NN scores:  [0.09057971 0.08270677 0.08880309 0.08130081 0.10729614]
NN scores average: 0.09013730324249909


# Fit the Model

In [12]:
svm_classifier = svm.SVC(kernel='linear',class_weight='balanced')


dt_classifier = tree.DecisionTreeClassifier(class_weight='balanced')


rf_classifier = RandomForestClassifier(n_estimators=10,class_weight='balanced')


nn_classifier = MLPClassifier(hidden_layer_sizes=(100,70),max_iter=500)

svm_classifier.fit(samples_train,labels_train)
dt_classifier.fit(samples_train,labels_train)
rf_classifier.fit(samples_train,labels_train)
nn_classifier.fit(samples_train,labels_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 70), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

# Final Scores

In [13]:
score = svm_classifier.score(samples_test, labels_test)
print('SVM score: ' + str(score))
score = dt_classifier.score(samples_test, labels_test)
print('DT score: ' + str(score))
score = rf_classifier.score(samples_test, labels_test)
print('RF score: ' + str(score))
score = nn_classifier.score(samples_test, labels_test)
print('NN score: ' + str(score))

SVM score: 0.06953125
DT score: 0.08046875
RF score: 0.08125
NN score: 0.09765625
