In [3]:
### Parsing intent,snippet pairs from CoNaLa dataset

import pandas as pd
import json
from nltk.tokenize import RegexpTokenizer

desc = []
snippets = []

def tokenize_code(text):
    "A very basic procedure for tokenizing code strings."
    return RegexpTokenizer(r'\w+').tokenize(text)

with open('/home/larumuga/Desktop/code-text-pairs/conala-corpus/conala-mined.jsonl') as f:
    for line in f:
        data = json.loads(line)
        code_tokens = ' '.join(tokenize_code(data['snippet']))
        desc.append(data['intent'])
        snippets.append(code_tokens)

In [4]:
print(len(desc))
print(len(snippets))

593891
593891


In [5]:
df = pd.DataFrame()

df['intent'] = desc
df['snippets'] = snippets

In [6]:
df.head()

Unnamed: 0,intent,snippets
0,Sort a nested list by two elements,sorted l key lambda x int x 1 x 0
1,converting integer to list in python,int x for x in str num
2,Converting byte string in unicode string,c decode unicode_escape
3,List of arguments with argparse,parser add_argument t dest table help nargs
4,How to convert a Date string to a DateTime obj...,datetime datetime strptime s Y m dT H M SZ


In [10]:
from pathlib import Path

def write_to(df, filename, path='/home/larumuga/Desktop/code-text-pairs/preprocessed_so/'):
    "Helper function to write processed files to disk."
    out = Path(path)
    out.mkdir(exist_ok=True)
    df.intent.to_csv(out/'{}.intent'.format(filename), index=False)
    df.snippets.to_csv(out/'{}.snippets'.format(filename), index=False)

In [11]:
write_to(df, 'train')

In [23]:
### parsing SO data to be fed into training language model 

import pymysql

# Connect to the database
connection = pymysql.connect(host='localhost',
                             user='sotorrent',
                             password='sotorrent',
                             db='sotorrent18_09',
                             charset='utf8mb4')

posts_df = pd.read_sql('select id, `AcceptedAnswerId`, body, `Title` from Posts where body like "%<code>%" and AcceptedAnswerId is not NULL and tags like "%python%" and PostTypeId=1', con=connection)
posts_df.head()

Unnamed: 0,id,AcceptedAnswerId,body,Title
0,683,57833,<p>I don't remember whether I was dreaming or ...,Using 'in' to match an attribute of Python obj...
1,773,7286,<p>I haven't been able to find an understandab...,How do I use Python's itertools.groupby()?
2,1171,28705,<p>I need to be able to manipulate a large (10...,What is the most efficient graph data structur...
3,1476,13107,<p>How do you express an integer as a binary n...,How do you express binary literals in Python?
4,1829,1852,<p>I've got a menu in Python. That part was ea...,How do I make a menu that does not require the...


In [25]:
%%time
ans_df = pd.read_sql('select id, body as answer_body, title as answer_title from Posts where PostTypeId=2 and id in (select `AcceptedAnswerId` from Posts where body like "%<code>%" and AcceptedAnswerId is not NULL and tags like "%python%" and PostTypeId=1)', con=connection)
ans_df.head()


Unnamed: 0,id,answer_body,answer_title
0,57833,<p>Using a list comprehension would build a te...,
1,7286,"<p>As Sebastjan said, <strong>you first have t...",
2,28705,<p>I would strongly advocate you look at <a hr...,
3,13107,<p>For reference&mdash;<em>future</em> Python ...,
4,1852,<p><strong>On Windows:</strong></p>&#xA;&#xA;<...,


In [26]:
new_df = pd.merge(posts_df, ans_df,  how='left', left_on=['AcceptedAnswerId'], right_on = ['id'])
new_df.head()

Unnamed: 0,id_x,AcceptedAnswerId,body,Title,id_y,answer_body,answer_title
0,683,57833,<p>I don't remember whether I was dreaming or ...,Using 'in' to match an attribute of Python obj...,57833.0,<p>Using a list comprehension would build a te...,
1,773,7286,<p>I haven't been able to find an understandab...,How do I use Python's itertools.groupby()?,7286.0,"<p>As Sebastjan said, <strong>you first have t...",
2,1171,28705,<p>I need to be able to manipulate a large (10...,What is the most efficient graph data structur...,28705.0,<p>I would strongly advocate you look at <a hr...,
3,1476,13107,<p>How do you express an integer as a binary n...,How do you express binary literals in Python?,13107.0,<p>For reference&mdash;<em>future</em> Python ...,
4,1829,1852,<p>I've got a menu in Python. That part was ea...,How do I make a menu that does not require the...,1852.0,<p><strong>On Windows:</strong></p>&#xA;&#xA;<...,


In [41]:
import spacy
import nltk

from bs4 import BeautifulSoup
EN = spacy.load('en')

def tokenize_docstring(text):
    "Apply tokenization using spacy to docstrings."
    tokens = EN.tokenizer(text)
    return [token.text.lower() for token in tokens if not token.is_space]


new_df['tokenized_strings'] = new_df.iloc[:,[2,3,5]].apply(lambda x: ' '.join(x.astype(str)), axis=1)
new_df.head()

# raw = BeautifulSoup(new_df.iloc[0]['body']).get_text()
# print(raw)

Unnamed: 0,id_x,AcceptedAnswerId,body,Title,id_y,answer_body,answer_title,tokenized_strings
0,683,57833,<p>I don't remember whether I was dreaming or ...,Using 'in' to match an attribute of Python obj...,57833.0,<p>Using a list comprehension would build a te...,,<p>I don't remember whether I was dreaming or ...
1,773,7286,<p>I haven't been able to find an understandab...,How do I use Python's itertools.groupby()?,7286.0,"<p>As Sebastjan said, <strong>you first have t...",,<p>I haven't been able to find an understandab...
2,1171,28705,<p>I need to be able to manipulate a large (10...,What is the most efficient graph data structur...,28705.0,<p>I would strongly advocate you look at <a hr...,,<p>I need to be able to manipulate a large (10...
3,1476,13107,<p>How do you express an integer as a binary n...,How do you express binary literals in Python?,13107.0,<p>For reference&mdash;<em>future</em> Python ...,,<p>How do you express an integer as a binary n...
4,1829,1852,<p>I've got a menu in Python. That part was ea...,How do I make a menu that does not require the...,1852.0,<p><strong>On Windows:</strong></p>&#xA;&#xA;<...,,<p>I've got a menu in Python. That part was ea...


In [67]:
# new_df['tokenized_strings'] = new_df['tokenized_strings'].apply(lambda x: tokenize_docstring(BeautifulSoup(x).get_text()))
# new_df.head()

path='/home/larumuga/Desktop/code-text-pairs/preprocessed_so/'
new_df.tokenized_strings.to_csv(path+'train.sostrings', index=False)

In [80]:
new_df.shape

(536464, 9)

In [81]:
%%time

def func(a):
    st = BeautifulSoup(a).get_text()
    st = st.replace('\r',' ')
    st = st.replace('\n',' ')
    return st

tokens = new_df.tokenized_strings.map(lambda a: func(a))

CPU times: user 7min 48s, sys: 28 ms, total: 7min 48s
Wall time: 7min 48s


In [82]:
len(tokens)

536464

In [83]:
new_df['tokens'] = tokens
path='/home/larumuga/Desktop/code-text-pairs/preprocessed_so/'
new_df.tokens.to_csv(path+'train_text.sostrings', index=False)

In [84]:
new_df.drop(columns='tokenized_strings1')
new_df.head()

Unnamed: 0,id_x,AcceptedAnswerId,body,Title,id_y,answer_body,answer_title,tokenized_strings,tokenized_strings1,tokens
0,683,57833,<p>I don't remember whether I was dreaming or ...,Using 'in' to match an attribute of Python obj...,57833.0,<p>Using a list comprehension would build a te...,,<p>I don't remember whether I was dreaming or ...,I don't remember whether I was dreaming or not...,I don't remember whether I was dreaming or not...
1,773,7286,<p>I haven't been able to find an understandab...,How do I use Python's itertools.groupby()?,7286.0,"<p>As Sebastjan said, <strong>you first have t...",,<p>I haven't been able to find an understandab...,I haven't been able to find an understandable ...,I haven't been able to find an understandable ...
2,1171,28705,<p>I need to be able to manipulate a large (10...,What is the most efficient graph data structur...,28705.0,<p>I would strongly advocate you look at <a hr...,,<p>I need to be able to manipulate a large (10...,I need to be able to manipulate a large (10^7 ...,I need to be able to manipulate a large (10^7 ...
3,1476,13107,<p>How do you express an integer as a binary n...,How do you express binary literals in Python?,13107.0,<p>For reference&mdash;<em>future</em> Python ...,,<p>How do you express an integer as a binary n...,How do you express an integer as a binary numb...,How do you express an integer as a binary numb...
4,1829,1852,<p>I've got a menu in Python. That part was ea...,How do I make a menu that does not require the...,1852.0,<p><strong>On Windows:</strong></p>&#xA;&#xA;<...,,<p>I've got a menu in Python. That part was ea...,I've got a menu in Python. That part was easy....,I've got a menu in Python. That part was easy....


In [92]:
%%time

##Used tuna to process this in 10mins 64 cores

from multiprocessing import Pool
from tqdm import *

def token_fn(a):
    return ' '.join(tokenize_docstring(a))

def imap_unordered_bar(func, args, n_processes = 64):
    p = Pool(n_processes)
    res_list = []
    with tqdm(total = len(args)) as pbar:
        for i, res in tqdm(enumerate(p.imap_unordered(func, args))):
            pbar.update()
            res_list.append(res)
    pbar.close()
    p.close()
    p.join()
    return res_list
    
tokenized = imap_unordered_bar(token_fn, df[0].tolist())

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs


In [91]:
len(tokenized)

0