# Imports

In [15]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import datasets, svm, tree, metrics
from sklearn.cluster import KMeans
from sklearn import preprocessing
import matplotlib.pyplot as plt
from pandas import DataFrame
from pathlib import Path
import collections
import numpy as np
import subprocess
import itertools
import os.path
import random
import time
import json
import sys
import re

# Load the Data

In [2]:
with open('input_data.json') as data:
    jdata = json.load(data)

# Clean the Data

In [3]:
def codeToList(code):
    clean_string = ' '.join(code.split())
    the_list = re.split(' |\r\n',clean_string)
    return the_list


# Function to Translate Operators to Natural Language

In [4]:

def naive_word_conversion(code_string):
    
    code_string = re.sub('\>\>=', ' is reassigned after shifting right by ', code_string)
    code_string = re.sub('\<\<=', ' is reassigned after shifting left by ', code_string)
    code_string = re.sub('\<\<', ' is shifted left by ', code_string)
    code_string = re.sub('\>\>', ' is shifted right by ' , code_string)
    code_string = re.sub('\<=', ' is less than or equal to ', code_string)
    code_string = re.sub('\>=', ' is greater than or equal to ', code_string)
    code_string = re.sub('-\>', ' is dereferenced and retrieves ', code_string)
    code_string = re.sub('\<', ' is less than ', code_string)
    code_string = re.sub('\>', ' is greater than ', code_string)
    code_string = re.sub('\+\+' ,' is incremented by one ' , code_string)
    code_string = re.sub('--', ' is decremented by one ', code_string)
    code_string = re.sub('\+=', ' is reassinged after adding ' , code_string)
    code_string = re.sub('\-=', ' is reassigned after subtracting ', code_string)
    code_string = re.sub('\*=', ' is reassigned after multiplying by ', code_string)
    code_string = re.sub('\/=', ' is reassigned after dividing by ', code_string)
    code_string = re.sub('\%=', ' is reassigned after moding by ', code_string)
    code_string = re.sub('&=', ' is reassigned after a bitwise AND of ', code_string)
    code_string = re.sub('\|=', ' is reassigned after a bitwise inclusive OR of ', code_string)
    code_string = re.sub('\^=', ' is reassigned after a bitwise exclusive OR of ', code_string)
    code_string = re.sub('==', ' is equal to ', code_string)
    code_string = re.sub('\!=', ' is not equal to ', code_string)
    code_string = re.sub('&&', ' and ', code_string)
    code_string = re.sub('\|\|', ' or ', code_string)
    code_string = re.sub('\?', ' is true then ', code_string)
    code_string = re.sub('\:', ' else is false then ', code_string)
    code_string = re.sub('\/\/', '  ', code_string)
    code_string = re.sub('=', ' equals ', code_string)
    code_string = re.sub('\.', ' uses ', code_string)
    code_string = re.sub(' -{1} ', ' is subtracted from ', code_string)
    code_string = re.sub('\+{1}', ' is added to ', code_string)
    code_string = re.sub('\/\*([^\*]|(\*+([^\*\/])))*\*\/', ' ', code_string)  
    code_string = re.sub('\*{1}', ' is multiplied by ', code_string)
    code_string = re.sub('&{1}', ' has an AND done with ', code_string)
    code_string = re.sub('\/{1}', ' is divided by ', code_string)
    code_string = re.sub('\%{1}', ' is moded by ', code_string)
    code_string = re.sub('\!{1}', ' is negated ', code_string)
    code_string = re.sub('\|{1}', ' has an OR done with ', code_string)
    code_string = re.sub('\^{1}', ' has an X OR done with ', code_string)
    code_string = re.sub('\(', ' open parenthesis ', code_string)
    code_string = re.sub('\)', ' close parenthesis ', code_string)
    code_string = re.sub('\[', ' open bracket ', code_string)
    code_string = re.sub('\]', ' close bracket ', code_string)
    code_string = re.sub('\#', ' ', code_string)
    code_string = re.sub(';', ' . ', code_string)
    
    return code_string

# Extract the Vectors 

In [5]:
samples = []
labels = []
index = []
for problem in jdata:
    for submissions in problem['Submissions']:
        for submission in submissions:
            code = submissions[submission]['Code'][0]
            code = naive_word_conversion(code)
            
            samples.append(codeToList(code))
            labels.append(str(problem['contestId']) + problem['index'])
            
            index.append(submission)

            
print(samples[0:10])

[['include', 'is', 'less', 'than', 'stdio', 'uses', 'h', 'is', 'greater', 'than', 'include', 'is', 'less', 'than', 'string', 'uses', 'h', 'is', 'greater', 'than', 'int', 'len', '.', 'char', 'data', 'open', 'bracket', '200', 'close', 'bracket', '.', 'int', 'main', 'open', 'parenthesis', 'close', 'parenthesis', '{', 'int', 'i,', 'cnt', 'equals', '0', '.', 'scanf', 'open', 'parenthesis', '"', 'is', 'moded', 'by', 's",', 'data', 'close', 'parenthesis', '.', 'len', 'equals', 'strlen', 'open', 'parenthesis', 'data', 'close', 'parenthesis', '.', 'for', 'open', 'parenthesis', 'i', 'equals', '0', '.', 'i', 'is', 'less', 'than', 'len', '.', 'i', 'is', 'incremented', 'by', 'one', 'close', 'parenthesis', '{', 'if', 'open', 'parenthesis', 'data', 'open', 'bracket', 'i', 'close', 'bracket', 'is', 'equal', 'to', "'o'", 'close', 'parenthesis', 'cnt', 'is', 'incremented', 'by', 'one', '.', '}', 'if', 'open', 'parenthesis', 'cnt', 'is', 'equal', 'to', '0', 'close', 'parenthesis', 'printf', 'open', 'pare

# Train the NLP Model

In [6]:
%%time
model = Word2Vec(samples, size=100, window=2, min_count=1, workers=4, iter=2)


CPU times: user 1.44 s, sys: 16.8 ms, total: 1.46 s
Wall time: 590 ms


In [7]:
for sample in samples:
    vector = model.wv[sample]


# Create a List of  Samples

In [8]:
vectorSamples = []
for sentence in samples:
    accumulatedVector = np.zeros(100)
    count =0
    for word in sentence:
        acumulatedVector = model.wv[word] + accumulatedVector  
        count=count+1
    
    vectorSamples.append(acumulatedVector/count)

    

# Split the Data

In [9]:
samples_train, samples_test, labels_train, labels_test = train_test_split(
    vectorSamples, labels, test_size=0.33, random_state=42)


labels_train

samples_test

labels_test

samples_train


[array([-2.81341728e-03,  9.69661873e-04, -4.74728568e-03, -4.66331183e-03,
        -1.28925541e-04,  2.09597637e-03,  3.84508326e-03,  2.68306672e-03,
         1.59196050e-03,  2.19925809e-03, -1.88348190e-03,  9.32064034e-04,
         8.86099634e-04, -7.23240171e-04,  3.75162065e-03,  1.57810596e-03,
        -1.33991545e-03, -1.15839152e-03,  1.60238601e-03,  2.34508439e-03,
         3.69527709e-04, -2.86435929e-03,  6.38850223e-04,  3.02239470e-03,
         9.06529252e-04, -3.37605459e-05, -1.43544591e-03,  9.39073388e-04,
        -3.43670532e-03, -2.47323260e-03, -5.31023892e-03, -2.72558478e-03,
         2.05082244e-03, -1.06115854e-03,  3.37118668e-03, -3.35723541e-03,
        -4.09462947e-04, -1.31909372e-03, -1.13683686e-03, -1.00028913e-03,
         4.47083984e-03,  6.18879169e-03, -7.15053183e-03, -1.35461759e-03,
         7.06362964e-04, -1.07529873e-03,  2.29344631e-03,  7.84066206e-04,
        -5.56675256e-03, -1.51991655e-04, -8.59704563e-04, -6.44878690e-04,
         1.8

# Create the Machine Learning Algorithms

In [10]:
svm_classifier = svm.SVC(kernel='linear',class_weight='balanced')


dt_classifier = tree.DecisionTreeClassifier(class_weight='balanced')


rf_classifier = RandomForestClassifier(n_estimators=10,class_weight='balanced')


nn_classifier = MLPClassifier(hidden_layer_sizes=(100,70),max_iter=500)



# Scoring with vectorSamples and labels

In [11]:
svm_scores = cross_val_score(svm_classifier, vectorSamples, labels, cv=5)
print('SVM scores: ' + str(svm_scores))
svm_average = sum(svm_scores)/5
print('SVM scores average: ' + str(svm_average))

dt_scores = cross_val_score(dt_classifier, vectorSamples, labels, cv=5)
print('DT scores:  ' + str(dt_scores))
dt_average = sum(dt_scores)/5
print('DT scores average: ' + str(dt_average))

rf_scores = cross_val_score(rf_classifier, vectorSamples, labels, cv=5)
print('RF scores:  ' + str(rf_scores))
rf_average = sum(rf_scores)/5
print('RF scores average: ' + str(rf_average))


nn_scores = cross_val_score(nn_classifier, vectorSamples, labels, cv=5)
print('NN scores:  ' + str(nn_scores))
nn_average = sum(nn_scores)/5
print('NN scores average: ' + str(nn_average))

SVM scores: [0.02389937 0.02681992 0.00899743 0.02879581 0.04227213]
SVM scores average: 0.026156932416289507
DT scores:  [0.14465409 0.13537676 0.13496144 0.14267016 0.12945839]
DT scores average: 0.13742416582972852
RF scores:  [0.14842767 0.14176245 0.13881748 0.13874346 0.13738441]
RF scores average: 0.14102709468673352
NN scores:  [0.18113208 0.17113665 0.19794344 0.18455497 0.19682959]
NN scores average: 0.18631934768156216


# Scoring with samples_test and labels_test

In [12]:
svm_scores = cross_val_score(svm_classifier, samples_test, labels_test, cv=5)
print('SVM scores: ' + str(svm_scores))
svm_average = sum(svm_scores)/5
print('SVM scores average: ' + str(svm_average))

dt_scores = cross_val_score(dt_classifier, samples_test, labels_test, cv=5)
print('DT scores:  ' + str(dt_scores))
dt_average = sum(dt_scores)/5
print('DT scores average: ' + str(dt_average))

rf_scores = cross_val_score(rf_classifier, samples_test, labels_test, cv=5)
print('RF scores:  ' + str(rf_scores))
rf_average = sum(rf_scores)/5
print('RF scores average: ' + str(rf_average))

nn_scores = cross_val_score(nn_classifier, samples_test, labels_test, cv=5)
print('NN scores:  ' + str(nn_scores))
nn_average = sum(nn_scores)/5
print('NN scores average: ' + str(nn_average))



SVM scores: [0.03623188 0.03007519 0.02702703 0.05284553 0.01716738]
SVM scores average: 0.03266940189689127




DT scores:  [0.12318841 0.11278195 0.12741313 0.12195122 0.11587983]
DT scores average: 0.12024290718716446




RF scores:  [0.11231884 0.12030075 0.11969112 0.12195122 0.11587983]
RF scores average: 0.11802835199778088




NN scores:  [0.13768116 0.16165414 0.15444015 0.16666667 0.16309013]
NN scores average: 0.15670644892416433


# Fit the Model

In [13]:
svm_classifier = svm.SVC(kernel='linear',class_weight='balanced')

#5-20 height for decision tree
dt_classifier = tree.DecisionTreeClassifier(class_weight='balanced')


rf_classifier = RandomForestClassifier(n_estimators=10,class_weight='balanced')


nn_classifier = MLPClassifier(hidden_layer_sizes=(100,70),max_iter=500)

svm_classifier.fit(samples_train,labels_train)
dt_classifier.fit(samples_train,labels_train)
rf_classifier.fit(samples_train,labels_train)
nn_classifier.fit(samples_train,labels_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 70), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

# Final Scores

In [14]:
score = svm_classifier.score(samples_test, labels_test)
print('SVM score: ' + str(score))
score = dt_classifier.score(samples_test, labels_test)
print('DT score: ' + str(score))
score = rf_classifier.score(samples_test, labels_test)
print('RF score: ' + str(score))
score = nn_classifier.score(samples_test, labels_test)
print('NN score: ' + str(score))

SVM score: 0.01640625
DT score: 0.1265625
RF score: 0.128125
NN score: 0.18984375
