<a href="https://colab.research.google.com/github/JayThibs/gpt-stackoverflow-QA/blob/main/data_preparation/data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparing StackOverflow QA Data

This notebook creates train, validation, and test sets in which each example pairs a StackOverflow question with its corresponding top answer.

In [1]:
# Report which GPU (if any) is attached to this runtime.
!nvidia-smi

Thu Dec  9 00:43:51 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    30W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Installations

In [5]:
!pip install xmltodict wget pyunpack patool --quiet

[?25l[K     |████▎                           | 10 kB 18.5 MB/s eta 0:00:01[K     |████████▌                       | 20 kB 13.4 MB/s eta 0:00:01[K     |████████████▊                   | 30 kB 8.6 MB/s eta 0:00:01[K     |█████████████████               | 40 kB 7.7 MB/s eta 0:00:01[K     |█████████████████████▏          | 51 kB 4.2 MB/s eta 0:00:01[K     |█████████████████████████▍      | 61 kB 4.0 MB/s eta 0:00:01[K     |█████████████████████████████▋  | 71 kB 4.6 MB/s eta 0:00:01[K     |████████████████████████████████| 77 kB 2.8 MB/s 
[?25h

# Imports

In [13]:
import os
import re
import wget
from pyunpack import Archive
import xmltodict
import xml.etree.ElementTree as ET
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
# Do not truncate long text columns when a DataFrame is displayed.
pd.options.display.max_colwidth = None

In [8]:
# Fix NumPy's global random state so that random operations are repeatable.
SEED = 3407
np.random.seed(SEED)

# Getting the Data

In [9]:
# Archive of the ai.stackexchange.com data, hosted on archive.org.
url = 'https://archive.org/download/stackexchange/ai.stackexchange.com.7z'
archive_path = os.path.basename(url)

# Skip the download when the archive is already present, so that re-running
# this cell does not fetch the file a second time.
if not os.path.exists(archive_path):
    wget.download(url, out=archive_path)

archive_path

'ai.stackexchange.com.7z'

In [51]:
# Ensure the target directory exists, then unpack the archive into it only
# when the directory is still empty (i.e. nothing has been extracted yet).
data_dir = './data'
if not os.path.isdir(data_dir):
    os.mkdir(data_dir)

already_extracted = bool(os.listdir(data_dir))
if not already_extracted:
    Archive('ai.stackexchange.com.7z').extractall("./data")

In [27]:
# Parse Posts.xml and keep only its root element under the name `tree`.
tree = ET.parse('./data/Posts.xml').getroot()

In [37]:
# Serialize the root element to UTF-8 encoded XML bytes, then decode to a str.
xmlstr = ET.tostring(
    tree,
    method='xml',
    encoding='utf8',
).decode('utf-8')

In [38]:
# Convert the XML string into nested dictionaries. Each post becomes one entry
# of soDict['posts']['row'], with its XML attributes stored under '@'-prefixed
# keys and string values (see the sample rows displayed below).
soDict = xmltodict.parse(xmlstr)

In [55]:
# Inspect the first row: a post with '@PostTypeId' == '1' that carries a title and tags.
soDict['posts']['row'][0]

OrderedDict([('@AcceptedAnswerId', '3'),
             ('@AnswerCount', '5'),
             ('@Body',
              '<p>What does "backprop" mean? Is the "backprop" term basically the same as "backpropagation" or does it have a different meaning?</p>\n'),
             ('@CommentCount', '0'),
             ('@ContentLicense', 'CC BY-SA 4.0'),
             ('@CreationDate', '2016-08-02T15:39:14.947'),
             ('@FavoriteCount', '1'),
             ('@Id', '1'),
             ('@LastActivityDate', '2021-07-08T10:45:23.250'),
             ('@LastEditDate', '2019-11-16T17:56:22.093'),
             ('@LastEditorUserId', '2444'),
             ('@OwnerUserId', '8'),
             ('@PostTypeId', '1'),
             ('@Score', '10'),
             ('@Tags',
              '<neural-networks><backpropagation><terminology><definitions>'),
             ('@Title', 'What is "backprop"?'),
             ('@ViewCount', '625')])

In [54]:
# Inspect the third row: a post with '@PostTypeId' == '2' whose '@ParentId' points at post '1'.
soDict['posts']['row'][2]

OrderedDict([('@Body',
              '<p>"Backprop" is the same as "backpropagation": it\'s just a shorter way to say it. It is sometimes abbreviated as "BP".</p>\n'),
             ('@CommentCount', '0'),
             ('@ContentLicense', 'CC BY-SA 3.0'),
             ('@CreationDate', '2016-08-02T15:40:24.820'),
             ('@Id', '3'),
             ('@LastActivityDate', '2016-08-02T15:40:24.820'),
             ('@OwnerUserId', '4'),
             ('@ParentId', '1'),
             ('@PostTypeId', '2'),
             ('@Score', '15')])

In [59]:
# Total number of rows (posts of every type) in the parsed dump.
len(soDict['posts']['row'])

21482

In [None]:
# Build a mapping from question id to that question and its best answer:
#
#   soQADict[question_id] = {
#       'Question': <question body>,
#       'QuestionScore': <question score>,
#       'BestAnswer': <body of the highest-scoring answer>,
#       'BestAnswerScore': <score of that answer>,
#   }
#
# Scores are stored as the strings found in the dump; they are converted to
# int only to compare answers with each other.
soQADict = {}

for post in soDict['posts']['row']:
    # xmltodict yields attribute values as strings, so compare against
    # '1' / '2' rather than the integers 1 / 2.
    post_type = post['@PostTypeId']

    if post_type == '1':
        # Row with PostTypeId '1': stored as the question, keyed by its own id.
        entry = soQADict.setdefault(post['@Id'], {})
        entry['Question'] = post['@Body']
        entry['QuestionScore'] = post['@Score']
    elif post_type == '2':
        # Row with PostTypeId '2': an answer, attached to its parent question's id.
        entry = soQADict.setdefault(post['@ParentId'], {})
        current_best = entry.get('BestAnswerScore')
        # Keep only the highest-scoring answer; on a tie the answer seen
        # first is kept.
        if current_best is None or int(post['@Score']) > int(current_best):
            entry['BestAnswer'] = post['@Body']
            entry['BestAnswerScore'] = post['@Score']
    # Rows with any other PostTypeId are ignored.