# Data splitting and cleaning

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from re import findall, sub

# Matches one <pre><code>...</code></pre> block and captures the text between the tags.
# Either opening tag may carry attributes (e.g. <pre class="lang-py prettyprint-override">),
# so an optional attribute list is accepted after each tag name. The lazy [\s\S]*? lets a
# block span several lines while stopping at the first closing pair.
STACK_OVERFLOW_CODE_BLOCK_REGULAR_EXPRESSION = r"<pre(?:\s[^>]*)?><code(?:\s[^>]*)?>([\s\S]*?)</code></pre>"


def extract_code_blocks_from_body(body):
    """Collect the contents of every code block found in an HTML body.

    Args:
        body: HTML string to search.

    Returns:
        The inner text of all matched code blocks joined with single spaces,
        or ``None`` when the joined result is the empty string (for example
        when the body has no code block). The text is returned exactly as it
        appears in the HTML; entities such as ``&lt;`` are not decoded.
    """
    code_blocks = findall(STACK_OVERFLOW_CODE_BLOCK_REGULAR_EXPRESSION, body)
    # An empty join result is mapped to None so callers get a missing value, not "".
    return " ".join(code_blocks) or None


def remove_code_blocks_from_body(body):
    """Return ``body`` with every match of the code-block pattern deleted.

    Each match of ``STACK_OVERFLOW_CODE_BLOCK_REGULAR_EXPRESSION`` is replaced
    by the empty string; the rest of the text is left as it is.
    """
    return sub(STACK_OVERFLOW_CODE_BLOCK_REGULAR_EXPRESSION, "", body)


def remove_new_line_symbol(text):
    """Return ``text`` with each line-feed character turned into a single space."""
    segments = text.split("\n")
    return " ".join(segments)


def remove_elements_from_list(*, removing_elements, target_list):
    """Build a new list holding the items of ``target_list`` not found in ``removing_elements``.

    Both arguments are keyword-only. The original order of the surviving
    items is kept and ``target_list`` itself is not modified.
    """
    survivors = []
    for item in target_list:
        if item in removing_elements:
            continue
        survivors.append(item)
    return survivors

In [6]:
# Tags attached to fewer questions than this are discarded
MIN_SAMPLES_PER_TAG = 50

# Load only the columns that are used below and normalise their names
questions = pd.read_csv("./data/Questions.csv", usecols=["Id", "Body", "Title"], encoding="ISO-8859-1").rename(
    columns={"Id": "id", "Body": "body", "Title": "title"}
)
# The dtype has to be keyed on the real column name ("Tag"). keep_default_na=False stops pandas
# from turning tag names such as "null", "nan" or "NA" into missing values.
tags = pd.read_csv(
    "./data/Tags.csv", encoding="ISO-8859-1", dtype={"Tag": "string"}, keep_default_na=False
).rename(columns={"Id": "id", "Tag": "tag"})

# Remove all new line characters
questions["body"] = questions["body"].apply(remove_new_line_symbol)


# Splitting body into code and desc columns
questions["code"] = questions["body"].apply(extract_code_blocks_from_body)
questions["desc"] = questions["body"].apply(remove_code_blocks_from_body)


# Cleaning desc from html elements
# The parser is named explicitly so the result does not depend on which parsers are installed.
questions["desc"] = questions["desc"].apply(lambda body: BeautifulSoup(body, "html.parser").get_text())


# Grouping tags and adding them to each question
tag_lists = tags.groupby("id")["tag"].apply(list).rename("tags")
# A question without any row in the tags table gets NaN from the join; it has no labels, so drop it.
questions = (
    questions.set_index("id").join(tag_lists).dropna(subset=["tags"]).reset_index().drop(columns=["body"])
)


# Finding and throwing away tags that contain less than MIN_SAMPLES_PER_TAG samples
# The counts Series is filtered directly, so this does not depend on how value_counts() names its result.
tag_frequencies = tags["tag"].value_counts()
tags_with_less_than_50_samples = tag_frequencies[tag_frequencies < MIN_SAMPLES_PER_TAG].index
rare_tags = set(tags_with_less_than_50_samples)  # built once; set membership is cheap per element

frequent_tags_per_question = questions["tags"].apply(
    lambda question_tags: remove_elements_from_list(removing_elements=rare_tags, target_list=question_tags)
)

# Every question left without a tag is dropped: the single-rare-tag case as well as questions
# whose several tags are all rare (those would otherwise stay with an empty tag list).
rows_that_only_contain_a_tag_that_has_less_than_50_samples = frequent_tags_per_question.index[
    frequent_tags_per_question.apply(len) == 0
]
questions = (
    questions.drop(index=rows_that_only_contain_a_tag_that_has_less_than_50_samples)
    .drop(columns=["id"])
    .assign(tags=frequent_tags_per_question)  # aligned on the index, so dropped rows stay dropped
)

questions.head(1)

Unnamed: 0,title,code,desc,tags
0,SQLStatement.execute() - multiple queries in o...,Create Table tRole (\n roleID integer Pri...,I've written a database generation script in S...,"[flex, actionscript-3, air]"


In [7]:
# Persist the cleaned samples (title, code, desc, tags) for later steps.
# NOTE(review): no `index` argument is passed, so pandas' default applies — presumably the row
# index is written as an extra unnamed first column; confirm that downstream readers expect it.
questions.to_csv('./data/cleaned_samples.csv')