# Build Datasets

I made the corpus here: [kaggle.com/mauroebordon/creating-a-qa-corpus-from-askreddit/](https://www.kaggle.com/mauroebordon/creating-a-qa-corpus-from-askreddit/)


In [1]:
import pandas as pd
import re
import string
from patterns import stop_pattern, emoji_pattern, nostop_pattern
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

# Load the AskReddit Q&A corpus; the first CSV column is used as the index.
db_df = pd.read_csv("db/ask-reddit-corpus.csv", index_col=0)

## Feature Extraction

Only tokens, lemmas, PoS tags, and stopword filtering for now.


In [2]:

def extract_features(df: pd.DataFrame, tags: bool = False) -> pd.DataFrame:
    """
    Extract the text features used to build the different models.

    Questions are kept only when they are non-null, shorter than 50
    characters and have ``Qscore`` above 1. Emojis are removed from the
    questions, which are then lower-cased and de-duplicated (first row of
    each distinct question wins). URLs, a leading ``[...]`` marker (questions
    only), double quotes and punctuation are stripped, and both question and
    answer are tokenised, lemmatised and split into stopword and
    non-stopword tokens.

    Parameters:
        df: corpus with at least the columns ``id``, ``Q``, ``Qscore`` and
            ``ANS``. It is not modified.
        tags: when False (default) ``Qkeys``/``Akeys`` are the lists of
            non-stopword lemmas. When True they are strings made of
            ``lemma_TAG `` items, one per lemma whose NLTK PoS tag is in the
            selected tag set.

    Returns:
        A new DataFrame with the columns ``id``, ``Q``, ``Qclean``,
        ``Qtoks``, ``Qstp``, ``Qkeys``, ``ANS``, ``Aclean``, ``Atoks``,
        ``Astp`` and ``Akeys``.
    """

    lmtzr = WordNetLemmatizer()

    selected_tags = {"DT", "IN", "PRP", "PRP$", "FW","NN","NNS","NNP","NNPS","PDT","RB","RBR","RBS","RP","VB","VBD","VBG","VBN","VBP","VBZ", "WDT", "WP", "WP$", "WRB"}

    def lemmatize_text(text):
        # Tokenise the text and lemmatise every token.
        return [lmtzr.lemmatize(tok) for tok in word_tokenize(text)]

    def strip_stopwords(tokens):
        # Apply the stopword pattern once per token and drop emptied tokens.
        stripped = (re.sub(stop_pattern, "", tok) for tok in tokens)
        return [tok for tok in stripped if tok != ""]

    def removed_tokens(tokens, kept):
        # Tokens of `tokens` that did not survive stopword stripping.
        kept_set = set(kept)
        return [tok for tok in tokens if tok not in kept_set]

    def tagged_keys(token_lists):
        # One "lemma_TAG " string per row, restricted to the selected tags.
        # The PoS tagger runs on the lemmatised tokens stored in Qtoks/Atoks.
        return [
            "".join(
                f"{word}_{tag} "
                for word, tag in pos_tag(tokens)
                if tag in selected_tags
            )
            for tokens in token_lists
        ]

    # Drop missing, empty and overly long questions, and low-scored ones.
    # Missing questions are excluded explicitly: str(NaN) is "nan", which
    # would otherwise pass the length checks.
    q_len = df.Q.apply(lambda q: len(str(q)))
    keep = df.Q.notna() & (q_len > 0) & (q_len < 50) & (df.Qscore > 1)
    db_df = df[keep].copy()

    # Remove emojis; regex=True so the pattern matches inside the text
    # instead of being compared against the whole cell value.
    db_df["Q"] = db_df["Q"].replace(emoji_pattern, "", regex=True)

    # Work uncased.
    db_df["Q"] = db_df["Q"].str.lower()

    # People are not that original and repeat questions: keep one row each.
    db_df = db_df.groupby("Q", as_index=False).first()

    # Rows without an answer cannot be tokenised, so they are dropped here.
    db_df = db_df[db_df.ANS.notna()].reset_index(drop=True)

    # Remove URLs.
    db_df["Q"] = db_df["Q"].str.replace(r"http\S+", "", regex=True)
    db_df["ANS"] = db_df["ANS"].str.replace(r"http\S+", "", regex=True)

    # Drop the leading marker of flagged questions ("[Serious]", "[NSFW]",
    # ...) and every double quote; the marker could be stored instead.
    db_df["Q"] = db_df["Q"].replace(r"^\[.*\]", "", regex=True)
    db_df["Q"] = db_df["Q"].replace(r"[\"\“\”]", "", regex=True)
    db_df["ANS"] = db_df["ANS"].replace(r"[\"\“\”]", "", regex=True)

    # Remove punctuation.
    translator = str.maketrans('', '', string.punctuation)
    db_df["Qclean"] = db_df.Q.str.translate(translator)
    db_df["Aclean"] = db_df.ANS.str.translate(translator)

    # Lemmatised tokens.
    db_df["Qtoks"] = [lemmatize_text(text) for text in db_df["Qclean"]]
    db_df["Atoks"] = [lemmatize_text(text) for text in db_df["Aclean"]]

    # Non-stopword tokens, and the complementary stopword tokens.
    db_df["Qnostp"] = [strip_stopwords(toks) for toks in db_df["Qtoks"]]
    db_df["Anostp"] = [strip_stopwords(toks) for toks in db_df["Atoks"]]

    db_df["Qstp"] = [
        removed_tokens(toks, kept)
        for toks, kept in zip(db_df["Qtoks"], db_df["Qnostp"])
    ]
    db_df["Astp"] = [
        removed_tokens(toks, kept)
        for toks, kept in zip(db_df["Atoks"], db_df["Anostp"])
    ]

    # Keys default to the non-stopword lemmas ...
    db_df["Qkeys"] = db_df["Qnostp"]
    db_df["Akeys"] = db_df["Anostp"]

    # ... or, on request, to the filtered lemma/PoS pairs.
    if tags:
        db_df["Qkeys"] = tagged_keys(db_df["Qtoks"])
        db_df["Akeys"] = tagged_keys(db_df["Atoks"])

    db_df = db_df[["id", "Q", "Qclean", "Qtoks", "Qstp", "Qkeys", "ANS", "Aclean", "Atoks", "Astp", "Akeys"]]

    return db_df
    

# NOTE: using pickle does not help compress anything.
df = extract_features(db_df)
df

Unnamed: 0,id,Q,Qclean,Qtoks,Qstp,Qkeys,ANS,Aclean,Atoks,Astp,Akeys
0,bc60pk,"donut or doughnut, & where are you from?",donut or doughnut where are you from,"[donut, or, doughnut, where, are, you, from]","[or, where, are, you, from]","[donut, doughnut]","donut, southern usa",donut southern usa,"[donut, southern, usa]",[],"[donut, southern, usa]"
1,289mmm,help finding a song from a video description?,help finding a song from a video description,"[help, finding, a, song, from, a, video, descr...","[a, from, a]","[help, finding, song, video, description]",/r/tipofmytongue,rtipofmytongue,[rtipofmytongue],[],[rtipofmytongue]
2,rufpr/,"i was rapedno, we had sex",i was rapedno we had sex,"[i, wa, rapedno, we, had, sex]","[i, we, had]","[wa, rapedno, sex]","if its not a yes, its no.",if its not a yes its no,"[if, it, not, a, yes, it, no]","[if, it, not, a, it, no]",[yes]
3,24vtw3,jesus h. christ what's his middle name?,jesus h christ whats his middle name,"[jesus, h, christ, whats, his, middle, name]",[his],"[jesus, h, christ, whats, middle, name]",umm... Holy. As in Holy Christ. Isn't this com...,umm Holy As in Holy Christ Isnt this common kn...,"[umm, Holy, As, in, Holy, Christ, Isnt, this, ...","[in, this]","[umm, Holy, As, Holy, Christ, Isnt, common, kn..."
4,jg8san,runaways why did you run?,runaways why did you run,"[runaway, why, did, you, run]","[why, did, you]","[runaway, run]","That's how a race works. I mean, I lost, but s...",Thats how a race works I mean I lost but still,"[Thats, how, a, race, work, I, mean, I, lost, ...","[how, a, but]","[Thats, race, work, I, mean, I, lost, still]"
...,...,...,...,...,...,...,...,...,...,...,...
30600,tx1ae/,zombie puns needed,zombie puns needed,"[zombie, pun, needed]",[],"[zombie, pun, needed]",Why did the Zombie cross the road? BRAINS! W...,Why did the Zombie cross the road BRAINS Wha...,"[Why, did, the, Zombie, cross, the, road, BRAI...","[did, the, the, do, we, do, we, it]","[Why, Zombie, cross, road, BRAINS, What, want,..."
30601,ds6tpc,"zoo employees, what was the uh oh moment?",zoo employees what was the uh oh moment,"[zoo, employee, what, wa, the, uh, oh, moment]","[what, the]","[zoo, employee, wa, uh, oh, moment]",uh oh s t i n k y,uh oh s t i n k y,"[uh, oh, s, t, i, n, k, y]","[s, t, i, y]","[uh, oh, n, k]"
30602,i53cd6,zoomers how it is to meet the boomer?,zoomers how it is to meet the boomer,"[zoomers, how, it, is, to, meet, the, boomer]","[how, it, is, to, the]","[zoomers, meet, boomer]",there a lot like millenials i'll be honest,there a lot like millenials ill be honest,"[there, a, lot, like, millenials, ill, be, hon...","[there, a, be]","[lot, like, millenials, ill, honest]"
30603,dzepth,"zoomers of reddit, where are we zooming to?",zoomers of reddit where are we zooming to,"[zoomers, of, reddit, where, are, we, zooming,...","[of, where, are, we, to]","[zoomers, reddit, zooming]",the one pixel on a square trying to see if i s...,the one pixel on a square trying to see if i s...,"[the, one, pixel, on, a, square, trying, to, s...","[the, on, a, to, if, i, should, the, or, not]","[one, pixel, square, trying, see, select, square]"


In [3]:
df = df.copy()

# Question keys followed by answer keys, one combined list per row.
df["QA-keys"] = df["Qkeys"] + df["Akeys"]

# Space-joined string versions of the key lists. Each string is built from
# its own source column: question keys, question+answer keys, and the
# question stopwords.
df["Q-kstr"] = df["Qkeys"].apply(" ".join)
df["QA-kstr"] = df["QA-keys"].apply(" ".join)
df["Q-stpstr"] = df["Qstp"].apply(" ".join)

df.to_csv("db/features.csv")

# Building the Test Dataset for Clustering Comparison

Small manual clusters to get a rough sense of whether the clustering is working.

Esto quizás es muy, muy tonto, pero solamente ordenar alfabéticamente las preguntas es una buena forma de encontrar similitud.


In [4]:
import pandas as pd
from itertools import combinations

df = pd.read_csv("db/features.csv", index_col=0)

# One hand-made cluster per pattern: questions whose text matches it.
# These are rough, conceptual "types" of questions used as a sanity check.
patterns = [
    r'favorite movie',
    r'ever seen',
    r'advice',
    r'history',
    r'book',
]
n_clus = len(patterns)  # derived, so adding a pattern adds a cluster
n_sam = 50
seed = 0  # fixed so the generated test set is reproducible

df = df.groupby("Q", as_index=False).first()

# Sample up to n_sam question ids per pattern and label each id with the
# index of its cluster. Capping the sample size avoids the ValueError that
# sample() raises when a pattern has fewer than n_sam matches.
q_ids = []
for cluster, pattern in enumerate(patterns):
    matches = df[df.Q.str.contains(pattern)].id
    picked = matches.sample(min(n_sam, len(matches)), random_state=seed)
    q_ids += [(q_id, cluster) for q_id in picked]

total = len(q_ids)

# Every unordered pair of sampled questions, flagged True when both come
# from the same cluster.
test_db = [
    [this_id, other_id, this_cluster == other_cluster]
    for (this_id, this_cluster), (other_id, other_cluster) in combinations(q_ids, 2)
]

# Explicit column labels keep the frame usable even when no pair exists.
test_db = pd.DataFrame(test_db, columns=[0, 1, 2])

# A question sampled for two patterns would otherwise be paired with itself.
test_db = test_db[test_db[0] != test_db[1]]


In [5]:
# Display the pair table: columns 0 and 1 are question ids, column 2 is the same-cluster flag.
test_db

Unnamed: 0,0,1,2
0,f1p1cm,2lorca,True
1,f1p1cm,2jvsu8,True
2,f1p1cm,jmv8f8,True
3,f1p1cm,nhuoi/,True
4,f1p1cm,ht7ag8,True
...,...,...,...
31120,34m5n6,k27rtt,True
31121,34m5n6,cclwl4,True
31122,2l88r4,k27rtt,True
31123,2l88r4,cclwl4,True


In [6]:
# Persist the pairwise test set to disk.
test_db.to_csv("db/test_db.csv")

In [15]:
# Show the first 30 questions of the frame.
df.Q.head(30)

0             active community for source sdk?
1          alcoholic brother claims cirrhosis?
2             an end to no sexual topics week.
3                   announcing our newest tag!
4                 any cool but good nicknames?
5              anyone traveling from nm to ut?
6                  are we as a species doomed?
7         are you pro-choice or pro-life? why?
8                   askreddit gets a facelift!
9                     askreddit rules reminder
10       askreddit, what do you think of this?
11                   back to school megathread
12                 can you jump start a prius?
13                      college or trade? why?
14            compulsive liars of reddit, why?
15                        discount beef jerky!
16                      do you like your name?
17                     does hypnotherapy work?
18        ex-religous people, why'd you leave?
19      fat people, what is it like to be fat?
20                   fav music artist and why?
21           