# Data splitting and cleaning

In [3]:
import pandas as pd
from bs4 import BeautifulSoup
from re import findall, sub

STACK_OVERFLOW_CODE_BLOCK_RE = r"<pre><code>([\s\S]*?)<\/code><\/pre>"


def extract_code_blocks_from_body(body):
    code_block = findall(STACK_OVERFLOW_CODE_BLOCK_RE, body)
    code_block = " ".join(code_block)
    if code_block == "":
        return None
    return code_block


def remove_code_blocks_from_body(body):
    new_body = sub(STACK_OVERFLOW_CODE_BLOCK_RE, "", body)
    return new_body


def remove_new_line(text):
    return text.replace("\n", " ")


## Data splitting

In [67]:
questions_raw = pd.read_csv("./data/Questions.csv", usecols=["Id", "Body", "Title"], encoding="ISO-8859-1").rename(
    columns={"Id": "id", "Body": "body", "Title": "title"}
)
tags_raw = pd.read_csv("./data/Tags.csv", encoding="ISO-8859-1", dtype={"Tags": "string"}).rename(
    columns={"Id": "id", "Tag": "tag"}
)

In [56]:
code_blocks = questions_raw["body"].apply(lambda body: extract_code_blocks_from_body(body)).rename("code")
code_blocks.head(1)

0    Create Table tRole (\n      roleID integer Pri...
Name: code, dtype: object

In [55]:
body_without_code_blocks = questions_raw["body"].apply(lambda body: remove_code_blocks_from_body(body)).rename("desc")
body_without_code_blocks.head(1)

0    <p>I've written a database generation script i...
Name: desc, dtype: object

In [57]:
questions_body_split = questions_raw.drop(columns=["body"]).join(body_without_code_blocks).join(code_blocks)
questions_body_split.head(1)

Unnamed: 0,id,title,desc,code
0,80,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,Create Table tRole (\n roleID integer Pri...


## Data cleaning

In [58]:
questions_cleaned_html = questions_body_split.drop(columns=["desc"]).join(
    questions_body_split["desc"].apply(lambda body: BeautifulSoup(body).get_text())
)

questions_cleaned_html.head(1)

Unnamed: 0,id,title,code,desc
0,80,SQLStatement.execute() - multiple queries in o...,Create Table tRole (\n roleID integer Pri...,I've written a database generation script in S...


In [59]:
questions_cleaned = questions_cleaned_html.drop(columns=["desc"]).join(
    questions_cleaned_html["desc"].apply(lambda body: remove_new_line(body))
)
questions_cleaned.head(1)

Unnamed: 0,id,title,code,desc
0,80,SQLStatement.execute() - multiple queries in o...,Create Table tRole (\n roleID integer Pri...,I've written a database generation script in S...


## Combining tags into a list

In [72]:
tag_lists = tags_raw.groupby("id")["tag"].apply(lambda tags: [tag for tag in tags]).rename('tags')
tag_lists.head(1)

id
80    [flex, actionscript-3, air]
Name: tags, dtype: object

## Combining everything into a X_y_data_frame

In [73]:
X_y_data_frame = questions_cleaned.set_index("id").join(tag_lists).reset_index().drop(columns=["id"])
X_y_data_frame

Unnamed: 0,title,code,desc,tags
0,SQLStatement.execute() - multiple queries in o...,Create Table tRole (\n roleID integer Pri...,I've written a database generation script in S...,"[flex, actionscript-3, air]"
1,Good branching and merging tutorials for Torto...,,Are there any really good tutorials explaining...,"[svn, tortoisesvn, branch, branching-and-merging]"
2,ASP.NET Site Maps,,Has anyone got experience creating SQL-based A...,"[sql, asp.net, sitemap]"
3,Function for creating color wheels,,This is something I've pseudo-solved many time...,"[algorithm, language-agnostic, colors, color-s..."
4,Adding scripting functionality to .NET applica...,ICard Cards[current] = new MyGame.CardLibrary....,I have a little game written in C#. It uses a ...,"[c#, .net, scripting, compiler-construction]"
...,...,...,...,...
1264211,URL routing in PHP (MVC),&lt;IfModule mod_rewrite.c&gt;\n\nRewriteEngin...,I am building a custom MVC project and I have ...,"[php, .htaccess]"
1264212,Bigquery.Jobs.Insert - Resumable Upload?,AbstractInputStreamContent content = new ByteA...,The API docs show that you should be able to m...,[google-bigquery]
1264213,Obfuscating code in android studio,buildTypes {\n release {\n minifyEna...,Under minifyEnabled I changed from false to tr...,"[android, android-studio]"
1264214,How to fire function after v-model change?,var articlesVM = new Vue({\n el: '#search...,I have input which I use to filter my array of...,"[javascript, vue.js]"


In [74]:
X_y_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1264216 entries, 0 to 1264215
Data columns (total 4 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   title   1264216 non-null  object
 1   code    840834 non-null   object
 2   desc    1264216 non-null  object
 3   tags    1264216 non-null  object
dtypes: object(4)
memory usage: 38.6+ MB


## Mutation version

In [8]:
questions = pd.read_csv("./data/Questions.csv", usecols=["Id", "Body", "Title"], encoding="ISO-8859-1").rename(
    columns={"Id": "id", "Body": "body", "Title": "title"}
)
tags = pd.read_csv("./data/Tags.csv", encoding="ISO-8859-1", dtype={"Tags": "string"}).rename(
    columns={"Id": "id", "Tag": "tag"}
)

questions["code"] = questions["body"].apply(lambda body: extract_code_blocks_from_body(body)).rename("code")
questions["desc"] = questions["body"].apply(lambda body: remove_code_blocks_from_body(body)).rename("desc")
questions["desc"] = questions["desc"].apply(lambda body: BeautifulSoup(body).get_text())
questions["desc"] = questions["desc"].apply(lambda body: remove_new_line(body))
tag_lists = tags.groupby("id")["tag"].apply(lambda tags: [tag for tag in tags]).rename('tags')
questions = questions.set_index("id").join(tag_lists).reset_index().drop(columns=["id", "body"])
questions.head(1)

Unnamed: 0,title,code,desc,tags
0,SQLStatement.execute() - multiple queries in o...,Create Table tRole (\n roleID integer Pri...,I've written a database generation script in S...,"[flex, actionscript-3, air]"
