# Dataset Creation

Corpus from here: [kaggle.com/mauroebordon/creating-a-qa-corpus-from-askreddit/](https://www.kaggle.com/mauroebordon/creating-a-qa-corpus-from-askreddit/)


In [7]:
import pandas as pd
import re
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# English stopword list from NLTK. It is not referenced again in the visible
# code; extract_features() loads the list itself.
stop = stopwords.words('english')


# Load the AskReddit question/answer corpus, using the first CSV column as
# the DataFrame index.
db_df = pd.read_csv("db/ask-reddit-corpus.csv", index_col=0)

## Feature Extraction

For now we only extract tokens, lemmas, and PoS tags, after filtering out stopwords.


In [8]:

def extract_features(df: pd.DataFrame, toks=True, lems=True, pos=True, max_q_len=50):
    """Build token, lemma and PoS features for a question/answer corpus.

    Rows whose question (column ``Q``) has ``max_q_len`` characters or more
    are dropped. English stopwords are then stripped from ``Q`` and ``ANS``
    (the match is case-sensitive: the pattern is compiled without
    ``re.IGNORECASE``) and the requested features are computed from the
    stopword-free text.

    Args:
        df: Corpus containing the columns ``id``, ``Q``, ``Qscore``, ``ANS``
            and ``ANSscore``. It is not modified.
        toks: Include the token columns ``Qtoks`` / ``Atoks``.
        lems: Include the lemma columns ``Qlemmas`` / ``Alemmas`` (tokens
            lemmatized with ``WordNetLemmatizer`` and joined with spaces).
        pos: Include the ``Qpos`` / ``Apos`` columns of ``pos_tag`` output.
        max_q_len: Exclusive upper bound on the question length, in
            characters. Defaults to 50.

    Returns:
        A new DataFrame with ``id``, ``Q``, ``Qscore``, ``Qless``, ``ANS``,
        ``ANSscore``, ``Aless`` and the feature columns that were requested.

    Idea for later: collocations.
    """
    lemmatizer = WordNetLemmatizer()

    # Drop overly long questions. Filter first and copy afterwards so only
    # the surviving rows are duplicated and the caller's frame is untouched.
    short_enough = df.Q.apply(lambda q: len(str(q)) < max_q_len)
    out = df[short_enough].copy()

    # Remove stopwords (together with the whitespace that follows them).
    stop_words = stopwords.words('english')
    pattern = re.compile(
        r'\b(' + r'|'.join(re.escape(w) for w in stop_words) + r')\b\s*'
    )
    # regex=True is required: pandas >= 2.0 defaults to literal matching and
    # raises ValueError when handed a compiled pattern with regex=False.
    # Missing values become empty strings so the tokenizer never sees NaN.
    out["Qless"] = out.Q.fillna("").astype(str).str.replace(pattern, '', regex=True)
    out["Aless"] = out.ANS.fillna("").astype(str).str.replace(pattern, '', regex=True)

    # Lemmas and PoS tags are both derived from the tokens, so tokenize once
    # whenever any feature is requested, independently of the `toks` flag.
    if toks or lems or pos:
        q_tokens = [word_tokenize(text) for text in out["Qless"]]
        a_tokens = [word_tokenize(text) for text in out["Aless"]]

    if toks:
        out["Qtoks"] = q_tokens
        out["Atoks"] = a_tokens

    if lems:
        out["Qlemmas"] = [' '.join(lemmatizer.lemmatize(t) for t in sent) for sent in q_tokens]
        out["Alemmas"] = [' '.join(lemmatizer.lemmatize(t) for t in sent) for sent in a_tokens]

    # (token, PoS tag) pairs
    if pos:
        out["Qpos"] = [pos_tag(sent) for sent in q_tokens]
        out["Apos"] = [pos_tag(sent) for sent in a_tokens]

    # Keep the original column order, but only ask for the feature columns
    # that were actually built; a fixed list raises KeyError when a flag is
    # switched off.
    enabled = {"lemmas": lems, "pos": pos, "toks": toks}
    suffixes = [name for name in ("lemmas", "pos", "toks") if enabled[name]]
    columns = (
        ["id", "Q", "Qscore", "Qless"]
        + ["Q" + name for name in suffixes]
        + ["ANS", "ANSscore", "Aless"]
        + ["A" + name for name in suffixes]
    )

    return out[columns]
    

# Using pickle does not help at all with compressing the result.
df = extract_features(db_df)

## Further Filtering

Since the corpus has roughly 100k entries, we filter it down a bit.

In [None]:

# Keep only the questions with 2 or more upvotes (~32k rows).
has_upvotes = df.Qscore > 1
sdf = df[has_upvotes]

# Drop very short answers (15 characters or fewer); this leaves ~25k rows.
long_answer = sdf.ANS.apply(lambda ans: len(str(ans)) > 15)
sdf = sdf.loc[long_answer].copy()

In [None]:
# Persist the extracted features as CSV.
# NOTE(review): this writes `df` (all extracted rows), not the filtered
# `sdf` built in the previous cell -- confirm which one is meant to be saved.
df.to_csv("db/features.csv")