# Build Datasets

I made the corpus here: [kaggle.com/mauroebordon/creating-a-qa-corpus-from-askreddit/](https://www.kaggle.com/mauroebordon/creating-a-qa-corpus-from-askreddit/)


In [53]:
import pandas as pd
import re
import string
from patterns import stop_pattern, emoji_pattern, nostop_pattern
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

# Load the AskReddit question/answer corpus; the first CSV column is used as the index.
db_df = pd.read_csv("db/ask-reddit-corpus.csv", index_col=0)

## Feature Extraction

Only Tokens, Lemmas, PoS AND stopword filtering for now.


In [48]:

def _strip_stopwords(tokens):
    """Apply ``stop_pattern`` to every token and drop the tokens that end up empty."""
    stripped = (re.sub(stop_pattern, "", tok) for tok in tokens)
    return [tok for tok in stripped if tok != ""]


def _tagged_keys(token_lists, selected_tags):
    """Return one ``"token_TAG token_TAG "`` string per token list, keeping only ``selected_tags``."""
    return [
        "".join(f"{word}_{tag} " for word, tag in pos_tag(tokens) if tag in selected_tags)
        for tokens in token_lists
    ]


def extract_features(df: pd.DataFrame, tags=False):
    """
    Build the text features used by the downstream models from a Q/A corpus.

    Steps, in order: keep rows whose question has between 1 and 49 characters,
    remove emoji matches, lowercase the questions, keep the first row of every
    distinct question, strip URLs, leading ``[...]`` markers and double quotes,
    remove punctuation, tokenize, lemmatize and split tokens into stopwords and
    non-stopwords using ``stop_pattern``.

    Args:
        df: corpus with at least the columns ``id``, ``Q`` (question) and
            ``ANS`` (answer). It is not modified.
        tags: when False (default) ``Qkeys``/``Akeys`` are the lists of tokens
            left after stopword removal. When True they are strings made of
            space-terminated ``token_TAG`` items, one per lemmatized token whose
            NLTK part-of-speech tag belongs to the selected tag set.

    Returns:
        A new DataFrame with the columns ``id, Q, Qclean, Qtoks, Qstp, Qkeys,
        ANS, Aclean, Atoks, Astp, Akeys``.
    """
    lmtzr = WordNetLemmatizer()

    # Penn Treebank tags kept when building the "token_TAG" keys.
    selected_tags = {"DT", "IN", "PRP", "PRP$", "FW", "NN", "NNS", "NNP", "NNPS", "PDT",
                     "RB", "RBR", "RBS", "RP", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
                     "WDT", "WP", "WP$", "WRB"}

    # Drop questions that are empty or too long (50 characters or more).
    q_len = df.Q.apply(lambda x: len(str(x)))
    db_df = df[(q_len > 0) & (q_len < 50)].copy()

    # Remove emojis. regex=True makes pandas apply the pattern inside each
    # string instead of comparing it against the whole cell value.
    # NOTE(review): emoji_pattern comes from the local `patterns` module - confirm
    # it is a regular expression (string or compiled).
    db_df["Q"] = db_df["Q"].replace(emoji_pattern, "", regex=True)

    # Work uncased.
    db_df["Q"] = db_df["Q"].str.lower()

    # People repeat questions: keep the first row of each distinct question.
    db_df = db_df.groupby("Q", as_index=False).first()

    # Remove URLs. The pattern is a raw string and regex=True is explicit because
    # the default of Series.str.replace changed to a literal match in pandas 2.0.
    db_df["Q"] = db_df["Q"].str.replace(r"http\S+", "", regex=True)
    db_df["ANS"] = db_df["ANS"].str.replace(r"http\S+", "", regex=True)

    # Strip the leading markers of flagged questions ("[Serious]", "[NSFW]", ...)
    # and every straight or curly double quote. The marker could be kept as a
    # feature instead of being thrown away.
    db_df["Q"] = db_df["Q"].replace(r"^\[.*\]", "", regex=True)
    db_df["Q"] = db_df["Q"].replace(r"[\"\“\”]", "", regex=True)
    db_df["ANS"] = db_df["ANS"].replace(r"[\"\“\”]", "", regex=True)

    # Remove ASCII punctuation.
    translator = str.maketrans('', '', string.punctuation)

    # The question and the answer go through the same pipeline.
    for source, prefix in (("Q", "Q"), ("ANS", "A")):
        clean = db_df[source].str.translate(translator)
        db_df[f"{prefix}clean"] = clean

        # Tokenize and lemmatize every text.
        token_lists = [[lmtzr.lemmatize(tok) for tok in word_tokenize(text)] for text in clean]
        db_df[f"{prefix}toks"] = token_lists

        # Tokens that survive stopword removal, and the ones that do not.
        content_lists = [_strip_stopwords(tokens) for tokens in token_lists]
        db_df[f"{prefix}stp"] = [
            [tok for tok in tokens if tok not in set(content)]
            for tokens, content in zip(token_lists, content_lists)
        ]

        # Keys: filtered "token_TAG" pairs when requested, plain content tokens otherwise.
        # PoS tags are computed here, on the lemmatized tokens, only when needed.
        db_df[f"{prefix}keys"] = (
            _tagged_keys(token_lists, selected_tags) if tags else content_lists
        )

    db_df = db_df[["id", "Q", "Qclean", "Qtoks", "Qstp", "Qkeys", "ANS", "Aclean", "Atoks", "Astp", "Akeys"]]

    return db_df
    

# Author's note (translated from Spanish): using pickle does not help with compressing the result.
# Run the feature extraction on the whole corpus with the default tags=False and display the result.
df = extract_features(db_df)
df

  db_df["Q"] = db_df["Q"].str.replace("http\S+", "")
  db_df["ANS"] = db_df["ANS"].str.replace("http\S+", "")


Unnamed: 0,id,Q,Qclean,Qtoks,Qstp,Qkeys,ANS,Aclean,Atoks,Astp,Akeys
0,ojjgvy,"people that eat spicy food, why?",people that eat spicy food why,"[people, that, eat, spicy, food, why]","[that, why]","[people, eat, spicy, food]","I never liked spicy food, it just adds a pain ...",I never liked spicy food it just adds a pain e...,"[I, never, liked, spicy, food, it, just, add, ...","[it, just, a, that, i]","[I, never, liked, spicy, food, add, pain, elem..."
1,bc60pk,"donut or doughnut, & where are you from?",donut or doughnut where are you from,"[donut, or, doughnut, where, are, you, from]","[or, where, are, you, from]","[donut, doughnut]","donut, southern usa",donut southern usa,"[donut, southern, usa]",[],"[donut, southern, usa]"
2,289mmm,help finding a song from a video description?,help finding a song from a video description,"[help, finding, a, song, from, a, video, descr...","[a, from, a]","[help, finding, song, video, description]",/r/tipofmytongue,rtipofmytongue,[rtipofmytongue],[],[rtipofmytongue]
3,rufpr/,"i was rapedno, we had sex",i was rapedno we had sex,"[i, wa, rapedno, we, had, sex]","[i, we, had]","[wa, rapedno, sex]","if its not a yes, its no.",if its not a yes its no,"[if, it, not, a, yes, it, no]","[if, it, not, a, it, no]",[yes]
4,24vtw3,jesus h. christ what's his middle name?,jesus h christ whats his middle name,"[jesus, h, christ, whats, his, middle, name]",[his],"[jesus, h, christ, whats, middle, name]",umm... Holy. As in Holy Christ. Isn't this com...,umm Holy As in Holy Christ Isnt this common kn...,"[umm, Holy, As, in, Holy, Christ, Isnt, this, ...","[in, this]","[umm, Holy, As, Holy, Christ, Isnt, common, kn..."
...,...,...,...,...,...,...,...,...,...,...,...
90149,oqcloa,¿what easy work is really difficult?,¿what easy work is really difficult,"[¿what, easy, work, is, really, difficult]","[¿what, is]","[¿, easy, work, really, difficult]",None by virtue of it being easy in the first p...,None by virtue of it being easy in the first p...,"[None, by, virtue, of, it, being, easy, in, th...","[by, of, it, being, in, the]","[None, virtue, easy, first, place]"
90150,ooswef,daddy what does nsfw mean?,daddy what does nsfw mean,"[daddy, what, doe, nsfw, mean]",[what],"[daddy, doe, nsfw, mean]",not safe for work,not safe for work,"[not, safe, for, work]","[not, for]","[safe, work]"
90151,orgpc4,"quiet people, what are your strengths?",quiet people what are your strengths,"[quiet, people, what, are, your, strength]","[what, are, your]","[quiet, people, strength]","Observing, attention, memory,",Observing attention memory,"[Observing, attention, memory]",[],"[Observing, attention, memory]"
90152,og7m7k,what is the best indian movie?,what is the best indian movie,"[what, is, the, best, indian, movie]","[what, is, the]","[best, indian, movie]",Why is this a quote?,Why is this a quote,"[Why, is, this, a, quote]","[is, this, a]","[Why, quote]"


In [50]:
# Add "QA-keys": the question keys followed by the answer keys of the same row.
# assign() returns a new frame, so the previous one is not modified in place.
df = df.assign(**{"QA-keys": df["Qkeys"] + df["Akeys"]})

# Persist the features for the following sections.
df.to_csv("db/features.csv")

# Building the Test Dataset for Clustering Comparison

Small manual clusters to get a rough sense of whether clustering is working.

Esto quizás es muy, muy tonto, pero solamente ordenar alfabéticamente las preguntas es una buena forma de encontrar similitud.


In [51]:
import pandas as pd

df = pd.read_csv("db/features.csv", index_col=0)

n_sam = 100  # questions sampled per cluster
seed = 0     # fixed seed so the sampled test set is the same on every run

# One hand-picked pattern per manual cluster; these are rough, conceptual
# "types" of question. The number of clusters follows the list, so adding a
# pattern is the only change needed to add a cluster.
cluster_patterns = [r'favorite movie', r'ever seen', r'advice', r'history', r'book']
n_clus = len(cluster_patterns)

df = df.groupby("Q", as_index=False).first()

# Sample n_sam question ids per pattern and label each id with its cluster
# number directly, instead of deriving the label from the list position.
# sample() raises ValueError when a pattern matches fewer than n_sam questions.
q_ids = []
for cluster, pattern in enumerate(cluster_patterns):
    matches = df[df.Q.str.contains(pattern)]
    sampled_ids = matches.id.sample(n_sam, random_state=seed)
    q_ids += [(q_id, cluster) for q_id in sampled_ids]

# Optional preview of the sampled questions (left disabled, not run here):
# questions_by_id = df.set_index("id").Q
# for j in range(n_clus):
#     print(f"\ncluster {j}")
#     for i in range(n_sam):
#         print(f"   {questions_by_id[q_ids[j * n_sam + i][0]]}")

total = len(q_ids)

# Every unordered pair of sampled questions, flagged True when both questions
# were drawn for the same cluster.
test_db = [
    [este[0], otro[0], este[1] == otro[1]]
    for i, este in enumerate(q_ids)
    for otro in q_ids[i + 1:]
]

test_db = pd.DataFrame(test_db)
# A question matching two patterns can be sampled twice; drop the pairs that
# would compare a question with itself.
test_db = test_db[test_db[0] != test_db[1]]


In [44]:
# Display the pair table: column 0 and 1 are question ids, column 2 is the same-cluster flag.
test_db

Unnamed: 0,0,1,2
0,opzuxo,2gzw73,True
1,opzuxo,2oyved,True
2,opzuxo,oha2b6,True
3,opzuxo,olnvc7,True
4,opzuxo,onkxtm,True
...,...,...,...
124745,gl67e/,1zfgll,True
124746,gl67e/,ouu0nd,True
124747,oh82r7,1zfgll,True
124748,oh82r7,ouu0nd,True


In [52]:
# Save the labelled question pairs to disk.
test_db.to_csv("db/test_db.csv")

In [46]:
# Look at the first 30 questions of the frame.
df.Q.head(30)

0          29 year old male stuck in cum mode!
1             active community for source sdk?
2          alcoholic brother claims cirrhosis?
3             an end to no sexual topics week.
4                   announcing our newest tag!
5           any advice for a new kitten owner?
6                 any cool but good nicknames?
7              anyone traveling from nm to ut?
8                  are we as a species doomed?
9         are you pro-choice or pro-life? why?
10     are you proud of yourself? if not, why?
11                  askreddit gets a facelift!
12                    askreddit rules reminder
13       askreddit, what do you think of this?
14               average movie theater salary?
15                   back to school megathread
16             bronies/furries of reddit… why?
17                 can you jump start a prius?
18      cheaters on reddit, why did you cheat?
19                       cl 63 amg or c63 amg?
20                      college or trade? why?
21           