Add the repository root to `sys.path` so that the project modules can be imported.

In [1]:
import sys

# Project root, relative to the kernel's working directory.
ROOT_PATH = ".."

# Make the project's packages importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed again.
if ROOT_PATH not in sys.path:
    sys.path.append(ROOT_PATH)

### 1. Preprocessing

In this step, we first generate the attributes used by our filtering framework:

- ```text```: str, the content of the file
- ```filename```: str, the file name without its extension
- ```lang```: str, the natural language of the file content, as predicted by the fasttext language predictor
- ```ext```: str, the file extension (without the leading dot)
- ```file_size_in_byte```: int, the size of the file in bytes
- ```program_lang```: str, the programming language of the file
- ```doc_type```: str, the document type: a super-category derived from the programming language, one of "code", "data", or "text"

In [2]:
import os
import fasttext
from utils.preprocessing import get_program_lang, get_doc_type
import pandas as pd


# generate a dataframe of attributes for each code file
def generate_attributes_df(raw_code_dir, lang_predictor):
    """Walk ``raw_code_dir`` and build one row of attributes per file.

    Args:
        raw_code_dir: Directory that is searched recursively for files.
        lang_predictor: Object whose ``predict(text)`` result is indexed as
            ``predictions[0][0]`` to obtain a ``__label__xx`` string (the
            notebook passes a loaded fasttext model).

    Returns:
        A pandas DataFrame with the columns ``ext``, ``text``, ``filename``,
        ``lang``, ``file_size_in_byte``, ``program_lang`` and ``doc_type``.
        The columns are present even when no file is found, so downstream
        column lookups keep working on an empty directory.
    """
    columns = [
        'ext',
        'text',
        'filename',
        'lang',
        'file_size_in_byte',
        'program_lang',
        'doc_type',
    ]

    code_list = []
    for root, _dirs, files in os.walk(raw_code_dir):
        for file in files:
            name, ext = os.path.splitext(file)
            # splitext keeps the leading dot; store the bare extension
            ext = ext[1:]

            file_path = os.path.join(root, file)
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform default encoding; undecodable bytes become U+FFFD
            # instead of aborting the whole scan on one bad file.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                text = f.read()

            # The predictor is given a single line: newlines are replaced by
            # spaces and the text is lower-cased before prediction.
            predictions = lang_predictor.predict(text.lower().replace("\n", " "))
            lang = predictions[0][0].replace('__label__', '')

            program_lang = get_program_lang(name, ext)

            code_list.append({
                'ext': ext,
                'text': text,
                # file name without its extension
                'filename': name,
                'lang': lang,
                # size on disk, not the length of the decoded text
                'file_size_in_byte': os.path.getsize(file_path),
                'program_lang': program_lang,
                'doc_type': get_doc_type(program_lang),
            })

    return pd.DataFrame(code_list, columns=columns)


# Inputs for the preprocessing step; both paths are resolved relative to ROOT_PATH.
raw_code_dir = f'{ROOT_PATH}/test_data/raw_code' # your source code directory
lang_predictor = fasttext.load_model(f'{ROOT_PATH}/artifacts/lang_predictor.bin') # fasttext language predictor

# Build the attribute table: one row per file found under raw_code_dir.
code_df = generate_attributes_df(raw_code_dir, lang_predictor)

# Show the (rows, columns) shape, then preview the first rows as the cell output.
print(f"code_df.shape: {code_df.shape}")
code_df.head()

code_df.shape: (120, 7)




Unnamed: 0,ext,text,filename,lang,file_size_in_byte,program_lang,doc_type
0,py,#coding=utf-8\n#上面一句是定义python的编码，必须写在第一句\n\nfr...,Dm015_dataframe_sum,en,648,python,code
1,py,"""""""passlib.crypto._blowfish.base - unoptimized...",base,en,20390,python,code
2,py,"# coding=utf-8\n""""""\ndesc..\n :copyright: (...",app,en,3356,python,code
3,py,# -*- coding: utf-8 -*-\n#\n# Copyright (C) 20...,sysconfig,en,26955,python,code
4,py,/usr/lib/python2.7/copy_reg.py,copy_reg,pt,30,python,code


### 2. Quality Signal Computing

In this step, we compute the quality signal of each file.

In [3]:
from pipeline.compute_quality_signals import ComputeCodeQualitySignal
from tqdm import tqdm
import json


def compute_qs(row, ccqs: ComputeCodeQualitySignal):
    """Run the quality-signal computation for one row of the attribute table.

    The seven attribute fields are read from ``row`` by name and handed to
    ``ccqs.evaluate`` as keyword arguments; whatever ``evaluate`` returns is
    passed back unchanged.
    """
    attribute_names = (
        'text',
        'filename',
        'lang',
        'ext',
        'file_size_in_byte',
        'program_lang',
        'doc_type',
    )
    return ccqs.evaluate(**{field: row[field] for field in attribute_names})

# Score every file: each row of code_df is passed through compute_qs.
ccqs = ComputeCodeQualitySignal()
tqdm.pandas()  # registers DataFrame.progress_apply
qsc_results = code_df.progress_apply(compute_qs, axis=1, args=(ccqs,))

# Each result is parsed with json.loads; only its "quality_signal" entry is kept.
qsc_scores = list(
    map(lambda line: json.loads(line)["quality_signal"], qsc_results.tolist())
)

# Attach the signals as an extra column next to the original attributes.
qsc_score_df = pd.concat(
    [code_df, pd.DataFrame({"quality_signal": qsc_scores})],
    axis=1,
)
qsc_score_df.head(5)

100%|██████████| 120/120 [00:01<00:00, 109.09it/s]


Unnamed: 0,ext,text,filename,lang,file_size_in_byte,program_lang,doc_type,quality_signal
0,py,#coding=utf-8\n#上面一句是定义python的编码，必须写在第一句\n\nfr...,Dm015_dataframe_sum,en,648,python,code,"{""qsc_code_num_words"": 87, ""qsc_code_num_chars..."
1,py,"""""""passlib.crypto._blowfish.base - unoptimized...",base,en,20390,python,code,"{""qsc_code_num_words"": 1661, ""qsc_code_num_cha..."
2,py,"# coding=utf-8\n""""""\ndesc..\n :copyright: (...",app,en,3356,python,code,"{""qsc_code_num_words"": 459, ""qsc_code_num_char..."
3,py,# -*- coding: utf-8 -*-\n#\n# Copyright (C) 20...,sysconfig,en,26955,python,code,"{""qsc_code_num_words"": 3245, ""qsc_code_num_cha..."
4,py,/usr/lib/python2.7/copy_reg.py,copy_reg,pt,30,python,code,"{""qsc_code_num_words"": 7, ""qsc_code_num_chars""..."


### 3. Filtering

In this step, we filter the files based on the quality signals computed for each file.

Result for one file:
- ```effective```: Whether this file is retained or not.
- ```hit_map```: The hit status of each filtering rule.
- ```err_msg```: [optional] The error message, if one occurred.

In [4]:
from pipeline.compute_filtering import CodeFilter

def filter_code(row, code_filter: CodeFilter):
    """Apply the filtering rules to one row of the scored attribute table.

    Reads ``doc_type``, ``lang``, ``program_lang`` and ``quality_signal`` from
    ``row`` and forwards them as keyword arguments to ``code_filter.evaluate``,
    returning its result unchanged.
    """
    filter_inputs = dict(
        doc_type=row['doc_type'],
        lang=row['lang'],
        program_lang=row['program_lang'],
        quality_signal=row['quality_signal'],
    )
    return code_filter.evaluate(**filter_inputs)

code_filter = CodeFilter()
tqdm.pandas()  # registers DataFrame.progress_apply

# Run the filter on every scored row and parse each result with json.loads
# in a single pass, so filtered_results only ever holds the parsed objects.
filtered_results = [
    json.loads(line)
    for line in qsc_score_df.progress_apply(
        filter_code, axis=1, args=(code_filter,)
    ).tolist()
]

100%|██████████| 120/120 [00:00<00:00, 3544.96it/s]


In [5]:
# get the final clean code files and save to the targeted directory

# Put the filter results side by side with the scored attributes
# (both frames are aligned on their row index).
clean_code_df = pd.concat([qsc_score_df, pd.DataFrame(filtered_results)], axis=1)
# Keep only the rows whose `effective` value equals the string '1'.
clean_code_df = clean_code_df[clean_code_df['effective'] == '1']

print(f'clean_code_df.shape: {clean_code_df.shape}')

target_dir = f"{ROOT_PATH}/test_data/clean_code"

# Every remaining row is already a retained file, so no further check on
# `effective` is needed inside the loop.
# NOTE(review): `filename` holds the name without its extension, so files are
# written without one, and files sharing a stem within the same program_lang
# overwrite each other -- confirm whether that is intended.
for _, row in clean_code_df.iterrows():
    cur_dir = f"{target_dir}/{row['program_lang']}"
    os.makedirs(cur_dir, exist_ok=True)
    file_path = os.path.join(cur_dir, row['filename'])
    # Write as UTF-8 explicitly: the texts contain non-ASCII characters and
    # the platform default encoding may not be able to represent them.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(row['text'])

print(f'Successfully saved {clean_code_df.shape[0]} clean code files to path: {target_dir}')

clean_code_df.shape: (70, 10)
Successfully saved 70 clean code files to path: ../test_data/clean_code
