In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
from itertools import product
from tqdm import tqdm_notebook as tqdm

In [8]:
conn = sqlite3.connect('finalTrain.db')
data = pd.read_sql_query('SELECT * FROM train', conn)

In [14]:
data.head()

Unnamed: 0,Id,Title,Tags,code,body,possbleCpp,possbleJava,possbleC#
0,1,check upload file imag without mime type,php image-processing file-upload upload mime-t...,,like check upload file imag file eg png jpg jp...,0,0,0
1,2,prevent firefox close press ctrl-w,firefox,,favorit editor vim regular use ctrlw execut ce...,0,0,0
2,3,r error invalid type list variabl,r matlab machine-learning,Error in model.frame.default(formula = expert_...,import matlab file construct data frame matlab...,0,0,0
3,4,replac special charact url,c# url encoding,,probabl simpl simpli find answer basicali want...,0,0,0
4,5,modifi whoi contact detail,php api file-get-contents,function modify(.......)\n{\n $mcontact = fil...,use modifi function display warn mesag pleas h...,0,0,0


In [15]:
data.shape

(2000001, 8)

### Associations for the title

##### Storing the combinations of tags and 1-grams

In [29]:
with open('title_associations.csv', 'w') as w:
    for index, row in tqdm(data.iterrows()):
        for x, y in product(np.unique(row['Title'].split()), np.unique(row['Tags'].split())):
            w.write("{},{}\n".format(x, y))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




##### Reading from the associations file

In [None]:
title_ass = pd.read_csv('title_associations.csv', error_bad_lines=False, header=None, verbose=False)

In [31]:
title_ass['pairs'] = title_ass[0] + ' ' + title_ass[1]
title_ass.dropna(subset=[0], inplace=True)
print(title_ass.shape)  ## WOW 32+ million associations just from 1-gram and on 2 million datapoints
CoOccurCounts = Counter(title_ass.pairs)
title_ass.head()

(32444539, 3)


Unnamed: 0,0,1,pairs
0,check,file-upload,check file-upload
1,check,image-processing,check image-processing
2,check,mime-types,check mime-types
3,check,php,check php
4,check,upload,check upload


In [32]:
len(CoOccurCounts)

6685737

##### Storing the occurances for 1-grams

In [33]:
with open('title_words.csv', 'w') as w:
    for each in tqdm(data.Title):
        t = np.unique(each.split())
        for word in t:
            w.write("{}\n".format(word))

HBox(children=(IntProgress(value=0, max=1999984), HTML(value='')))




##### Loading the 1-gram occurance 

In [34]:
words = defaultdict(int)

In [35]:
with open('title_words.csv', 'r') as r:
    for w in tqdm(r):
        words[w.strip()] += 1

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




##### Calculate the probability for associations with title and tags

In [41]:
missed_tokens = []

In [42]:
probability_title = {}
for key, value in tqdm(CoOccurCounts.items()):
    try:
        token, tag = key.split()
        probability_title[token + ' ' + tag] = value / words[token]
    except AttributeError:
        print(key)
        pass
    except KeyError:
        missed_tokens.append(token)

HBox(children=(IntProgress(value=0, max=6685737), HTML(value='')))

nan



In [44]:
assert(len(missed_tokens) == 0)

In [45]:
pc_title = pd.DataFrame.from_dict(probability_title, orient='index')
pc_title.reset_index(inplace=True)
pc_title.columns = ['pairs', 'pobability']
pc_title['token'] = pc_title.pairs.apply(lambda x: x.split()[0])
pc_title['tag'] = pc_title.pairs.apply(lambda x: x.split()[1])
pc_title.drop('pairs', axis=1, inplace=True)
pc_title.to_csv('title_associations.csv', index=False)

In [46]:
pc_title.shape

(6685736, 3)

In [51]:
pc_title.set_index(['token', 'tag'], inplace=True)

##### Testing if it really worked or not

In [56]:
pc_title.loc['pointer'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
c++,0.432676
pointers,0.379463
c,0.289527
arrays,0.060694
java,0.055763
android,0.049473
function-pointers,0.046243
objective-c,0.045223
c#,0.036892
nullpointerexception,0.031622


In [57]:
pc_title.loc['jdk'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
java,0.695279
jdk,0.358369
eclipse,0.087983
jre,0.064378
jdk1.6,0.060086
linux,0.051502
java-7,0.051502
osx,0.04721
android,0.04721
netbeans,0.036481


In [58]:
pc_title.loc['android'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
android,0.969749
java,0.121162
android-layout,0.039321
eclipse,0.030356
sqlite,0.023765
android-intent,0.021445
listview,0.021181
android-emulator,0.019107
xml,0.018597
activity,0.015486


In [59]:
pc_title.loc['latex'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
latex,0.301904
graphics,0.040873
fonts,0.03948
pdf,0.035764
beamer,0.032513
tables,0.028797
math-mode,0.027868
compiling,0.026475
packages,0.02601
conversion,0.025546


In [60]:
pc_title.loc['matplotlib'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
matplotlib,0.969496
python,0.795756
plot,0.088859
numpy,0.076923
pylab,0.03183
scipy,0.03183
graph,0.030504
wxpython,0.027851
pyplot,0.025199
image,0.018568


In [82]:
pc_title.loc['pearson'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
statistics,0.482759
correlation,0.448276
pearson,0.275862
python,0.206897
r,0.172414
recommendation-engine,0.103448
algorithm,0.103448
scipy,0.103448
machine-learning,0.068966
computer-science,0.034483


### Associations for the body

##### Storing the combinations of tags and 1-grams

In [61]:
with open('body_associations.csv', 'w') as w:
    for index, row in tqdm(data.iterrows()):
        for x, y in product(np.unique(row['body'].split()), np.unique(row['Tags'].split())):
            w.write("{},{}\n".format(x, y))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




##### Reading from the associations file

In [4]:
CoOccurCountsBody = defaultdict(int)

In [5]:
with open('body_associations.csv', 'r') as r:
    for each in tqdm(r):
        CoOccurCountsBody[each.strip()] += 1

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




##### Storing the occurances for 1-grams

In [9]:
with open('body_words.csv', 'w') as w:
    for each in tqdm(data.body):
        t = np.unique(each.split())
        for word in t:
            w.write("{}\n".format(word))

HBox(children=(IntProgress(value=0, max=2000001), HTML(value='')))




##### Loading the 1-gram occurance

In [10]:
words = defaultdict(int)

In [11]:
with open('body_words.csv', 'r') as r:
    for w in tqdm(r):
        words[w.strip()] += 1

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




##### Calculate the probability for associations with body and tags

In [20]:
missed_tokens = []

In [22]:
probability_body = {}
for key, value in tqdm(CoOccurCountsBody.items()):
    try:
        token, tag = key.split(',')
        probability_body[token + ' ' + tag] = value / words[token]
    except AttributeError:
        print(key)
        pass
    except KeyError:
        missed_tokens.append(token)

HBox(children=(IntProgress(value=0, max=29218874), HTML(value='')))




In [15]:
assert(len(missed_tokens) == 0)

In [29]:
pc_body = pd.DataFrame.from_dict(probability_body, orient='index')
pc_body.reset_index(inplace=True)
pc_body.columns = ['pairs', 'pobability']
pc_body['token'] = pc_body.pairs.apply(lambda x: x.split()[0])
pc_body['tag'] = pc_body.pairs.apply(lambda x: x.split()[1])
pc_body.drop('pairs', axis=1, inplace=True)
pc_body.to_csv('body_associations.csv', index=False)

In [30]:
pc_body.head()

Unnamed: 0,pobability,token,tag
0,0.001374,anoth,file-upload
1,0.001472,anoth,image-processing
2,0.000182,anoth,mime-types
3,0.067169,anoth,php
4,0.000876,anoth,upload


In [31]:
pc_body.shape # Almost 292 million combinations

(29218874, 3)

In [34]:
pc_body.set_index(['token', 'tag'], inplace=True)

##### Now see the magic

In [35]:
pc_body.loc['pointer'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
c++,0.247853
c,0.137784
pointers,0.104261
java,0.067543
c#,0.058043
android,0.047339
objective-c,0.037713
javascript,0.031739
arrays,0.031075
iphone,0.026304


In [37]:
pc_body.loc['maven'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
maven,0.515672
java,0.36998
maven-2,0.209954
eclipse,0.139392
spring,0.068042
maven-3,0.057174
maven-plugin,0.042054
tomcat,0.031659
hibernate,0.030398
android,0.028823


In [38]:
pc_body.loc['xcode'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
xcode,0.478749
iphone,0.296004
ios,0.285896
objective-c,0.241802
xcode4,0.091213
osx,0.088507
cocoa,0.050382
c++,0.044094
ipad,0.043537
ios5,0.038443


In [42]:
pc_body.loc['windows'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
windows,0.205882
c#,0.147059
windows-7,0.117647
java,0.117647
windows-xp,0.117647
.net,0.088235
networking,0.088235
css,0.088235
boot,0.058824
jquery,0.058824


In [48]:
pc_body.loc['amazon'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
amazon-ec2,0.270845
amazon-web-services,0.181956
amazon-s3,0.160684
php,0.083001
amazon,0.074264
linux,0.053941
ruby-on-rails,0.047104
mysql,0.045774
java,0.043115
python,0.032669


In [49]:
pc_body.loc['google'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
google,0.176471
java,0.117647
android,0.117647
seo,0.117647
pivot-table,0.117647
javascript,0.117647
scalability,0.117647
dns,0.058824
multiple,0.058824
dvdfab,0.058824


In [50]:
pc_body.loc['index'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
php,0.084115
mysql,0.082552
c#,0.078102
java,0.059893
sql,0.056122
ruby-on-rails,0.048308
javascript,0.044707
jquery,0.04097
python,0.040597
sql-server,0.039577


In [60]:
pc_body.loc['trigger'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
jquery,0.170068
javascript,0.152163
c#,0.07667
triggers,0.070097
android,0.049299
java,0.047091
php,0.041205
sql,0.040224
asp.net,0.040126
wpf,0.040027


In [61]:
pc_body.loc['callback'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
javascript,0.203323
jquery,0.189181
callback,0.105716
c#,0.087718
ajax,0.06784
android,0.06695
c++,0.051424
php,0.045095
ruby-on-rails,0.044699
ios,0.037876


In [75]:
pc_body.loc['outlier'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
r,0.214286
statistics,0.155844
outliers,0.12987
python,0.090909
boxplot,0.064935
performance,0.058442
ggplot2,0.058442
matlab,0.051948
mysql,0.045455
c++,0.045455


In [79]:
pc_body.loc['lenovo'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
windows-7,0.281972
lenovo,0.252696
laptop,0.144838
thinkpad,0.093991
windows,0.087827
ubuntu,0.077042
linux,0.07396
wireless-networking,0.060092
windows-xp,0.060092
drivers,0.053929


In [114]:
pc_body.loc['cocca'].sort_values('pobability', ascending=False).head(10)

Unnamed: 0_level_0,pobability
tag,Unnamed: 1_level_1
ios,1.0
ipad,1.0
uiview,1.0
xcode,1.0


In [115]:
!ls -l

total 14589836
-rw-rw-r-- 1 paperspace paperspace      75325 Nov 20 09:06 Associations.ipynb
-rw-rw-r-- 1 paperspace paperspace 1089032223 Nov 20 08:42 body_associations.csv
-rw-rw-r-- 1 paperspace paperspace  434272642 Nov 20 08:26 body_words.csv
-rw-r--r-- 1 paperspace paperspace 5811257344 Nov 18 03:49 cleaned.db
-rw-rw-r-- 1 paperspace paperspace  191454906 Nov 19 08:46 condCounts.pkl
drwxrwxr-x 2 paperspace paperspace       4096 Nov 17 15:59 dataset
-rw-r--r-- 1 paperspace paperspace 1991868416 Nov 19 05:34 finalTrain.db
-rw-rw-r-- 1 paperspace paperspace     354950 Nov 19 05:37 Processing.ipynb
-rw-r--r-- 1 paperspace paperspace 2260496384 Nov 18 18:57 sampleCleaned.db
-rw-r--r-- 1 paperspace paperspace 2744045568 Nov 18 15:16 samples.db
-rw-rw-r-- 1 paperspace paperspace  244535660 Nov 20 07:09 title_associations.csv
-rw-rw-r-- 1 paperspace paperspace   88239949 Nov 19 11:27 titles.csv
-rw-rw-r-- 1 paperspace paperspace   71695985 Nov 20 07:02 title_words.csv
-rw-r