# Imports

In [1]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import datasets, svm, tree, metrics
from sklearn.cluster import KMeans
from sklearn import preprocessing
import matplotlib.pyplot as plt
from pandas import DataFrame
from pathlib import Path
import collections
import numpy as np
import subprocess
import itertools
import os.path
import random
import time
import json
import sys
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk

# Load Data

In [2]:
with open('input_data.json') as data:
    jdata = json.load(data)

# Clean the Data

In [3]:
def codeToList(code):
    clean_string = ' '.join(code.split())
    the_list = re.split(' |\r\n',clean_string)
    return the_list


#  Function to Translate Operators to Natural Langauge 

In [4]:
def naive_word_conversion(code_string):
    
    code_string = re.sub('\>\>=', ' is reassigned after shifting right by ', code_string)
    code_string = re.sub('\<\<=', ' is reassigned after shifting left by ', code_string)
    code_string = re.sub('\<\<', ' is shifted left by ', code_string)
    code_string = re.sub('\>\>', ' is shifted right by ' , code_string)
    code_string = re.sub('\<=', ' is less than or equal to ', code_string)
    code_string = re.sub('\>=', ' is greater than or equal to ', code_string)
    code_string = re.sub('-\>', ' is dereferenced and retrieves ', code_string)
    code_string = re.sub('\<', ' is less than ', code_string)
    code_string = re.sub('\>', ' is greater than ', code_string)
    code_string = re.sub('\+\+' ,' is incremented by one ' , code_string)
    code_string = re.sub('--', ' is decremented by one ', code_string)
    code_string = re.sub('\+=', ' is reassinged after adding ' , code_string)
    code_string = re.sub('\-=', ' is reassigned after subtracting ', code_string)
    code_string = re.sub('\*=', ' is reassigned after multiplying by ', code_string)
    code_string = re.sub('\/=', ' is reassigned after dividing by ', code_string)
    code_string = re.sub('\%=', ' is reassigned after moding by ', code_string)
    code_string = re.sub('&=', ' is reassigned after a bitwise AND of ', code_string)
    code_string = re.sub('\|=', ' is reassigned after a bitwise inclusive OR of ', code_string)
    code_string = re.sub('\^=', ' is reassigned after a bitwise exclusive OR of ', code_string)
    code_string = re.sub('==', ' is equal to ', code_string)
    code_string = re.sub('\!=', ' is not equal to ', code_string)
    code_string = re.sub('&&', ' and ', code_string)
    code_string = re.sub('\|\|', ' or ', code_string)
    code_string = re.sub('\?', ' is true then ', code_string)
    code_string = re.sub('\:', ' else is false then ', code_string)
    code_string = re.sub('\/\/', '  ', code_string)
    code_string = re.sub('=', ' equals ', code_string)
    code_string = re.sub('\.', ' uses ', code_string)
    code_string = re.sub(' -{1} ', ' is subtracted from ', code_string)
    code_string = re.sub('\+{1}', ' is added to ', code_string)
    code_string = re.sub('\/\*([^\*]|(\*+([^\*\/])))*\*\/', ' ', code_string)  
    code_string = re.sub('\*{1}', ' is multiplied by ', code_string)
    code_string = re.sub('&{1}', ' has an AND done with ', code_string)
    code_string = re.sub('\/{1}', ' is divided by ', code_string)
    code_string = re.sub('\%{1}', ' is moded by ', code_string)
    code_string = re.sub('\!{1}', ' is negated ', code_string)
    code_string = re.sub('\|{1}', ' has an OR done with ', code_string)
    code_string = re.sub('\^{1}', ' has an X OR done with ', code_string)
    code_string = re.sub('\(', ' open parenthesis ', code_string)
    code_string = re.sub('\)', ' close parenthesis ', code_string)
    code_string = re.sub('\[', ' open bracket ', code_string)
    code_string = re.sub('\]', ' close bracket ', code_string)
    code_string = re.sub('\#', ' ', code_string)
    code_string = re.sub(';', ' . ', code_string)
    
    return code_string

# Extract the Vectors 

In [5]:
samples = []
labels = []
index = []
for problem in jdata:
    for submissions in problem['Submissions']:
        for submission in submissions:
            code = submissions[submission]['Code'][0]
            code = naive_word_conversion(code)
            
            samples.append( codeToList(code))
            labels.append(str(problem['contestId']) + problem['index'])
            
            index.append(submission)

print(samples[0:10])

[['include', 'is', 'less', 'than', 'stdio', 'uses', 'h', 'is', 'greater', 'than', 'include', 'is', 'less', 'than', 'string', 'uses', 'h', 'is', 'greater', 'than', 'int', 'len', '.', 'char', 'data', 'open', 'bracket', '200', 'close', 'bracket', '.', 'int', 'main', 'open', 'parenthesis', 'close', 'parenthesis', '{', 'int', 'i,', 'cnt', 'equals', '0', '.', 'scanf', 'open', 'parenthesis', '"', 'is', 'moded', 'by', 's",', 'data', 'close', 'parenthesis', '.', 'len', 'equals', 'strlen', 'open', 'parenthesis', 'data', 'close', 'parenthesis', '.', 'for', 'open', 'parenthesis', 'i', 'equals', '0', '.', 'i', 'is', 'less', 'than', 'len', '.', 'i', 'is', 'incremented', 'by', 'one', 'close', 'parenthesis', '{', 'if', 'open', 'parenthesis', 'data', 'open', 'bracket', 'i', 'close', 'bracket', 'is', 'equal', 'to', "'o'", 'close', 'parenthesis', 'cnt', 'is', 'incremented', 'by', 'one', '.', '}', 'if', 'open', 'parenthesis', 'cnt', 'is', 'equal', 'to', '0', 'close', 'parenthesis', 'printf', 'open', 'pare

In [33]:
print(np.unique(labels))
len(np.unique(labels))

['926A' '926C' '926G' '931A' '931B' '932A' '932B' '934A' '934B' '935A'
 '935B' '935C' '937A' '937B' '938A' '938B' '939A' '939B' '940A' '940B'
 '940C' '946A' '946B' '946C' '948A' '950A' '950B' '952A' '952C' '954A'
 '954B' '955A' '957A' '959A' '960A' '961A' '961B' '962A' '962B' '964A'
 '965A' '967A' '975B' '976A' '976B' '977A' '977B' '978A' '978B' '978C'
 '979A' '980A']


52

# Train the NLP Model

In [22]:
%%time
max_epochs = 70
vec_size = 300
alpha = 0.025

model = Doc2Vec(vector_size  = vec_size, alpha = alpha, min_alpha = 0.00025, min_count = 1,dm = 1)


CPU times: user 816 µs, sys: 10 µs, total: 826 µs
Wall time: 831 µs


In [38]:
tag_labels=[[str(_d)] for i, _d in enumerate(labels)]

In [23]:
samplesSingleList = []
for eachList in samples: 
    str1 = ' '.join(eachList)
    samplesSingleList.append(str1)

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(samplesSingleList)]

# building the vocabulary 
model.build_vocab(tagged_data)

In [24]:
for epoch in range(max_epochs):
    model.train(tagged_data,total_examples = model.corpus_count,epochs = model.epochs)

In [45]:
[[str(i)] for i, _d in enumerate(samplesSingleList)]

[['0'],
 ['1'],
 ['2'],
 ['3'],
 ['4'],
 ['5'],
 ['6'],
 ['7'],
 ['8'],
 ['9'],
 ['10'],
 ['11'],
 ['12'],
 ['13'],
 ['14'],
 ['15'],
 ['16'],
 ['17'],
 ['18'],
 ['19'],
 ['20'],
 ['21'],
 ['22'],
 ['23'],
 ['24'],
 ['25'],
 ['26'],
 ['27'],
 ['28'],
 ['29'],
 ['30'],
 ['31'],
 ['32'],
 ['33'],
 ['34'],
 ['35'],
 ['36'],
 ['37'],
 ['38'],
 ['39'],
 ['40'],
 ['41'],
 ['42'],
 ['43'],
 ['44'],
 ['45'],
 ['46'],
 ['47'],
 ['48'],
 ['49'],
 ['50'],
 ['51'],
 ['52'],
 ['53'],
 ['54'],
 ['55'],
 ['56'],
 ['57'],
 ['58'],
 ['59'],
 ['60'],
 ['61'],
 ['62'],
 ['63'],
 ['64'],
 ['65'],
 ['66'],
 ['67'],
 ['68'],
 ['69'],
 ['70'],
 ['71'],
 ['72'],
 ['73'],
 ['74'],
 ['75'],
 ['76'],
 ['77'],
 ['78'],
 ['79'],
 ['80'],
 ['81'],
 ['82'],
 ['83'],
 ['84'],
 ['85'],
 ['86'],
 ['87'],
 ['88'],
 ['89'],
 ['90'],
 ['91'],
 ['92'],
 ['93'],
 ['94'],
 ['95'],
 ['96'],
 ['97'],
 ['98'],
 ['99'],
 ['100'],
 ['101'],
 ['102'],
 ['103'],
 ['104'],
 ['105'],
 ['106'],
 ['107'],
 ['108'],
 ['109'],
 ['110'],


# Create a List of Samples

In [25]:
vectorSamples = [] 

for eachString in samplesSingleList:
    vectorSamples.append(model.infer_vector(eachString))
    

# Split the Data

In [26]:
samples_train, samples_test, labels_train, labels_test = train_test_split(
    vectorSamples, labels, test_size=0.33, random_state=42)


labels_train

samples_test

labels_test

samples_train


[array([ 8.25972497e-01, -5.55234492e-01,  3.63263875e-01, -8.32501769e-01,
         4.13636684e+00,  2.35381460e+00,  3.23731756e+00,  7.56042838e-01,
        -9.17149961e-01,  3.34040523e-01,  6.33627772e-01,  7.68256426e-01,
        -2.18995142e+00,  1.48564208e+00,  9.74323928e-01, -3.15784007e-01,
         6.26544237e-01,  1.77331328e-01,  4.04398412e-01,  1.63209665e+00,
        -2.35122132e+00,  7.59258926e-01, -2.59717166e-01,  7.88265288e-01,
        -2.08105516e+00, -1.47584176e+00, -9.54808116e-01,  1.12162173e+00,
        -1.27192378e+00, -3.26192677e-02, -1.22902358e+00, -3.84036660e-01,
        -5.55715151e-02, -1.52589187e-01, -1.97471118e+00,  3.63649607e-01,
         1.32456863e+00,  4.61687654e-01, -1.64612103e+00,  4.59089041e-01,
        -9.24545340e-03,  1.01660180e+00,  3.05132240e-01,  9.47138965e-01,
        -3.08442807e+00, -7.02163815e-01, -2.54725552e+00,  1.26193821e+00,
         7.47009933e-01,  7.50103295e-01, -1.52818453e+00, -3.84699344e-03,
         1.7

# Create the Machine Learning Algorithms

In [27]:
svm_classifier = svm.SVC(kernel='linear',class_weight='balanced')


dt_classifier = tree.DecisionTreeClassifier(class_weight='balanced')


rf_classifier = RandomForestClassifier(n_estimators=10,class_weight='balanced')


nn_classifier = MLPClassifier(hidden_layer_sizes=(100,70),max_iter=500)



# Scoring with vectorSamples and labels

In [28]:
svm_scores = cross_val_score(svm_classifier, vectorSamples, labels, cv=5)
print('SVM scores: ' + str(svm_scores))
svm_average = sum(svm_scores)/5
print('SVM scores average: ' + str(svm_average))

dt_scores = cross_val_score(dt_classifier, vectorSamples, labels, cv=5)
print('DT scores:  ' + str(dt_scores))
dt_average = sum(dt_scores)/5
print('DT scores average: ' + str(dt_average))

rf_scores = cross_val_score(rf_classifier, vectorSamples, labels, cv=5)
print('RF scores:  ' + str(rf_scores))
rf_average = sum(rf_scores)/5
print('RF scores average: ' + str(rf_average))


nn_scores = cross_val_score(nn_classifier, vectorSamples, labels, cv=5)
print('NN scores:  ' + str(nn_scores))
nn_average = sum(nn_scores)/5
print('NN scores average: ' + str(nn_average))

SVM scores: [0.61761006 0.63090677 0.62210797 0.60732984 0.61955086]
SVM scores average: 0.6195011004934139
DT scores:  [0.25534591 0.29118774 0.27763496 0.27879581 0.28137384]
DT scores average: 0.27686765369854655
RF scores:  [0.36855346 0.38058748 0.37403599 0.36256545 0.41611625]
RF scores average: 0.38037172524948065
NN scores:  [0.63647799 0.62324393 0.62082262 0.60863874 0.6010568 ]
NN scores average: 0.6180480179488043


# Scoring with samples_test and labels_test

In [60]:
svm_scores = cross_val_score(svm_classifier, samples_test, labels_test, cv=5)
print('SVM scores: ' + str(svm_scores))
svm_average = sum(svm_scores)/5
print('SVM scores average: ' + str(svm_average))

dt_scores = cross_val_score(dt_classifier, samples_test, labels_test, cv=5)
print('DT scores:  ' + str(dt_scores))
dt_average = sum(dt_scores)/5
print('DT scores average: ' + str(dt_average))

rf_scores = cross_val_score(rf_classifier, samples_test, labels_test, cv=5)
print('RF scores:  ' + str(rf_scores))
rf_average = sum(rf_scores)/5
print('RF scores average: ' + str(rf_average))

nn_scores = cross_val_score(nn_classifier, samples_test, labels_test, cv=5)
print('NN scores:  ' + str(nn_scores))
nn_average = sum(nn_scores)/5
print('NN scores average: ' + str(nn_average))



SVM scores: [0.58695652 0.62030075 0.6023166  0.58130081 0.60944206]
SVM scores average: 0.6000633498058797




DT scores:  [0.21014493 0.26315789 0.26640927 0.23170732 0.22746781]
DT scores average: 0.2397774433828619




RF scores:  [0.3442029  0.37593985 0.33976834 0.35365854 0.32618026]
RF scores average: 0.3479499764078439




NN scores:  [0.33695652 0.52631579 0.51351351 0.53658537 0.527897  ]
NN scores average: 0.48825363725762827


# Fit the Model

In [61]:
svm_classifier = svm.SVC(kernel='linear',class_weight='balanced')


dt_classifier = tree.DecisionTreeClassifier(class_weight='balanced')


rf_classifier = RandomForestClassifier(n_estimators=10,class_weight='balanced')


nn_classifier = MLPClassifier(hidden_layer_sizes=(100,70),max_iter=500)

svm_classifier.fit(samples_train,labels_train)
dt_classifier.fit(samples_train,labels_train)
rf_classifier.fit(samples_train,labels_train)
nn_classifier.fit(samples_train,labels_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 70), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

# Final Scores

In [62]:
score = svm_classifier.score(samples_test, labels_test)
print('SVM score: ' + str(score))
score = dt_classifier.score(samples_test, labels_test)
print('DT score: ' + str(score))
score = rf_classifier.score(samples_test, labels_test)
print('RF score: ' + str(score))
score = nn_classifier.score(samples_test, labels_test)
print('NN score: ' + str(score))

SVM score: 0.61328125
DT score: 0.2828125
RF score: 0.378125
NN score: 0.61015625
