In [1]:
import pandas as pd    
import gzip
import os

def getListOfFiles(dirName):
    """Recursively collect the paths of all files under a directory.

    Entries are visited in the order reported by ``os.listdir``; whenever an
    entry is itself a directory, the files found inside it are inserted at
    that position before the remaining entries are processed.

    Arguments:
    dirName (string): path to the directory to walk

    Returns:
    list of strings: one path per file, each built by joining ``dirName``
    with the names of the entries leading to the file
    """
    collected = []
    for name in os.listdir(dirName):
        candidate = os.path.join(dirName, name)
        if not os.path.isdir(candidate):
            # plain file (or anything that is not a directory): record it
            collected.append(candidate)
            continue
        # directory: descend and splice its files into the result
        collected.extend(getListOfFiles(candidate))
    return collected

In [2]:
# start by getting a file path to each language file
all_paths = getListOfFiles('final/jsonl/train')
print(all_paths)
# Keep only the .jsonl shards. The directory walk also returns stray files such
# as macOS's .DS_Store, and os.listdir gives no ordering guarantee, so removing
# a fixed position (previously `del fp_list[2]`) could silently drop a real
# data file and keep the junk one on another machine or after a re-run.
fp_list = [fp for fp in all_paths if fp.endswith('.jsonl')]
fp_list

['final/jsonl/train/python_train_13.jsonl', 'final/jsonl/train/python_train_11.jsonl', 'final/jsonl/train/.DS_Store', 'final/jsonl/train/python_train_8.jsonl', 'final/jsonl/train/python_train_10.jsonl', 'final/jsonl/train/python_train_12.jsonl', 'final/jsonl/train/python_train_9.jsonl', 'final/jsonl/train/python_train_7.jsonl', 'final/jsonl/train/python_train_5.jsonl', 'final/jsonl/train/python_train_1.jsonl', 'final/jsonl/train/python_train_3.jsonl', 'final/jsonl/train/python_train_4.jsonl', 'final/jsonl/train/python_train_6.jsonl', 'final/jsonl/train/python_train_2.jsonl', 'final/jsonl/train/python_train_0.jsonl']


['final/jsonl/train/python_train_13.jsonl',
 'final/jsonl/train/python_train_11.jsonl',
 'final/jsonl/train/python_train_8.jsonl',
 'final/jsonl/train/python_train_10.jsonl',
 'final/jsonl/train/python_train_12.jsonl',
 'final/jsonl/train/python_train_9.jsonl',
 'final/jsonl/train/python_train_7.jsonl',
 'final/jsonl/train/python_train_5.jsonl',
 'final/jsonl/train/python_train_1.jsonl',
 'final/jsonl/train/python_train_3.jsonl',
 'final/jsonl/train/python_train_4.jsonl',
 'final/jsonl/train/python_train_6.jsonl',
 'final/jsonl/train/python_train_2.jsonl',
 'final/jsonl/train/python_train_0.jsonl']

In [3]:
# then we read the data into Dataframes for analysis and preparation
# (one DataFrame per path in fp_list; every line of a file is parsed as its own JSON record)
df_list = [pd.read_json(path_or_buf=fp, lines=True) for fp in fp_list]


In [4]:
# Preview the first rows of the second loaded frame (index 1) to inspect its columns.
df_list[1].head()

Unnamed: 0,repo,path,func_name,original_string,language,code,code_tokens,docstring,docstring_tokens,sha,url,partition
0,ArabellaTech/django-basic-cms,basic_cms/templatetags/pages_tags.py,show_slug_with_level,"def show_slug_with_level(context, page, lang=N...",python,"def show_slug_with_level(context, page, lang=N...","[def, show_slug_with_level, (, context, ,, pag...",Display slug with level by language.,"[Display, slug, with, level, by, language, .]",863f3c6098606f663994930cd8e7723ad0c07caf,https://github.com/ArabellaTech/django-basic-c...,train
1,ArabellaTech/django-basic-cms,basic_cms/templatetags/pages_tags.py,show_revisions,"def show_revisions(context, page, content_type...",python,"def show_revisions(context, page, content_type...","[def, show_revisions, (, context, ,, page, ,, ...",Render the last 10 revisions of a page content...,"[Render, the, last, 10, revisions, of, a, page...",863f3c6098606f663994930cd8e7723ad0c07caf,https://github.com/ArabellaTech/django-basic-c...,train
2,ArabellaTech/django-basic-cms,basic_cms/templatetags/pages_tags.py,do_videoplaceholder,"def do_videoplaceholder(parser, token):\n ""...",python,"def do_videoplaceholder(parser, token):\n ""...","[def, do_videoplaceholder, (, parser, ,, token...",Method that parse the imageplaceholder templat...,"[Method, that, parse, the, imageplaceholder, t...",863f3c6098606f663994930cd8e7723ad0c07caf,https://github.com/ArabellaTech/django-basic-c...,train
3,ArabellaTech/django-basic-cms,basic_cms/templatetags/pages_tags.py,do_get_pages_with_tag,"def do_get_pages_with_tag(parser, token):\n ...",python,"def do_get_pages_with_tag(parser, token):\n ...","[def, do_get_pages_with_tag, (, parser, ,, tok...",Return Pages with given tag\n\n Syntax::\n\...,"[Return, Pages, with, given, tag]",863f3c6098606f663994930cd8e7723ad0c07caf,https://github.com/ArabellaTech/django-basic-c...,train
4,lowandrew/OLCTools,sipprCommon/runMetadata.py,Metadata.parserunstats,"def parserunstats(self):\n """"""Parses th...",python,"def parserunstats(self):\n """"""Parses th...","[def, parserunstats, (, self, ), :, # metadata...",Parses the XML run statistics file (GenerateFA...,"[Parses, the, XML, run, statistics, file, (, G...",88aa90ac85f84d0bbeb03e43c29b0a9d36e4ce2a,https://github.com/lowandrew/OLCTools/blob/88a...,train


In [5]:
# check for duplicates and remove them if necessary

# total number of rows across all frames before de-duplication
number_of_dps = 0

for df in df_list:
    number_of_dps += len(df)
print("number of data points ====> " + str(number_of_dps))
after = 0

# DataFrame.drop_duplicates returns a NEW frame and leaves the original
# untouched, so its result has to be kept. Previously the return value was
# discarded, which is why the "after" count always equalled the "before" count.
# Rows count as duplicates when both "docstring" and "original_string" match.
# Note: duplicates are only looked for within each frame, not across frames.
deduplicated = []
for df in df_list:
    df = df.drop_duplicates(subset=["docstring", "original_string"])
    deduplicated.append(df)
    after += len(df)
df_list = deduplicated
print("number of data points after processing ====> " + str(after))

number of data points ====> 412178
number of data points after processing ====> 412178


In [6]:
# remove all docstrings that are less than five words

# a row survives only when its docstring has at least five tokens
dfs = [
    df[df["docstring_tokens"].map(lambda tokens: len(tokens) >= 5)]
    for df in df_list
]
after = sum(len(df) for df in dfs)
print("number of data points after processing ====> " + str(after))

number of data points after processing ====> 375826


In [7]:
# remove every row whose docstring tokens contain "__init__"
# NOTE(review): the filter looks at docstring_tokens, not at func_name or
# code_tokens, so it drops functions whose docstring mentions __init__ rather
# than functions named __init__ -- confirm this is the intended criterion.

df_list = [
    df[df["docstring_tokens"].map(lambda tokens: "__init__" not in tokens)]
    for df in dfs
]
after = sum(len(df) for df in df_list)
print("number of data points after processing ====> " + str(after))


number of data points after processing ====> 375552


In [24]:
# sort the data in ascending order of function tokens

# Stack all frames, order the rows by the number of code tokens, then drop the
# helper column together with every column listed below (going by the columns
# in the preview above, that leaves "code" and "docstring").
python_df = (
    pd.concat(df_list)
    .assign(len_tok=lambda frame: frame["code_tokens"].map(len))
    .sort_values(by="len_tok")
    .drop(
        columns=[
            "partition",
            "url",
            "language",
            "func_name",
            "repo",
            "path",
            "original_string",
            "code_tokens",
            "docstring_tokens",
            "sha",
            "len_tok",
        ]
    )
)

In [26]:
# Persist the prepared frame first. to_pickle returns None, so leaving it as the
# cell's final statement would mean nothing is displayed.
python_df.to_pickle("python_train.pkl")

# A notebook only renders the value of a cell's last expression, so the preview
# goes last (previously head() ran mid-cell and its result was thrown away).
python_df.head()

In [35]:
from datasets import Dataset


ModuleNotFoundError: No module named 'datasets'