# Build Datasets

I made the corpus here: [kaggle.com/mauroebordon/creating-a-qa-corpus-from-askreddit/](https://www.kaggle.com/mauroebordon/creating-a-qa-corpus-from-askreddit/)


In [1]:
import pandas as pd
import re
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop = stopwords.words('english')

db_df = pd.read_csv("db/ask-reddit-corpus.csv", index_col=0)

## Feature Extraction

Only Tokens, Lemmas, PoS AND stopword filtering for now.


In [47]:

def extract_features(df: pd.DataFrame, toks=True, lems=True, pos=True, tags=True):

    """
    Extrae los datos necesarios que vamos a utilizar para hacer los diferentes modelos a clasificar.
    """

    #Elimintamos los emojis

    lmtzr = WordNetLemmatizer()

    selected_tags = {"FW","NN","NNS","NNP","NNPS","PDT","RB","RBR","RBS","RP","VB","VBD","VBG","VBN","VBP","VBZ"}

    question_fts = {"DT", "IN", "PRP", "PRP$", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"}
   
    #Filtramos las preguntas demasiado extensas
    db_df = df.copy()[df.Q.apply(lambda x: len(str(x)) < 50)]

    #sdf = df[df.Qscore > 1]

    #sdf = sdf.copy()[sdf.ANS.apply(lambda x: len(str(x)) > 15)]

    #la gente no es tan original y repite preguntas
    db_df = db_df.groupby("Q", as_index=False).first()

    #Borramos los tags de las preguntas que estan marcadas  "[Serious], [NSFW], etc" 
    #se puede guardar esta información.
    db_df["Q"] = db_df["Q"].replace(r"^\[.*\]", "", regex=True)
    db_df["Q"] = db_df["Q"].replace(r"[\"\“\”]", "", regex=True)
    db_df["Q"] = db_df["ANS"].replace(r"[\"\“\”]", "", regex=True)

    #eliminando los emojis
    
    db_df["Q"] = db_df["Q"].replace(emoji_pattern, "")


    #remove stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')        
    db_df["Qless"] = db_df.Q.str.replace(pattern, '')
    db_df["Aless"] = db_df.ANS.str.replace(pattern, '')


        
    #Obtenemos los tokens
    if toks:
        db_df["Qtoks"] = [word_tokenize(w) for w in db_df["Qless"]]
        db_df["Atoks"] = [word_tokenize(w) for w in db_df["Aless"]]

    #Obtenemos los lemmas
    if lems:
        db_df["Qlemmas"] = [' '.join(lmtzr.lemmatize(t) for t in qes) for qes in db_df["Qtoks"]]
        db_df["Alemmas"] = [' '.join(lmtzr.lemmatize(t) for t in qes) for qes in db_df["Atoks"]]


    # Par Tok & POS
    if pos:
        db_df["Qpos"] = [pos_tag(word_tokenize(w)) for w in db_df["Qlemmas"]]
        db_df["Apos"] = [pos_tag(word_tokenize(w)) for w in db_df["Alemmas"]]
        
    # Topic centric tags
    if tags:
        qtags = []
        for ws in list(db_df["Qpos"]):
            this_tags = ""
            for w in ws:
                if w[1] in selected_tags:
                   this_tags += f"{w[0]}_{w[1]} "
            qtags.append(this_tags)
        db_df["Qkeys"] = qtags

        atags = []
        for ws in list(db_df["Apos"]):
            this_tags = ""
            for w in ws:
                if w[1] in selected_tags:
                   this_tags += f"{w[0]}_{w[1]} "
            atags.append(this_tags)
        db_df["Akeys"] = atags

    # question type centric tags
    if tags:
        qtags = []
        for ws in list(db_df["Qpos"]):
            this_tags = ""
            for w in ws:
                if w[1] in question_fts:
                   this_tags += f"{w[0]}_{w[1]} "
            qtags.append(this_tags)
        db_df["Qtags"] = qtags


    db_df = db_df[["id", "Qscore", "Q", "Qless", "Qlemmas", "Qkeys", "Qtags", "ANSscore", "ANS", "Aless", "Alemmas", "Akeys"]]

    return db_df
    

#usar pickle no sirve para comprimir naranja
df = extract_features(db_df)
df

Unnamed: 0,id,Qscore,Q,Qless,Qlemmas,Qkeys,Qtags,ANSscore,ANS,Aless,Alemmas,Akeys
0,5auv3g,34751,http://i.imgur.com/QEQ0nzI.gifv,http://.imgur.com/QEQ0nzI.gifv,http : //.imgur.com/QEQ0nzI.gifv,http_NN //.imgur.com/QEQ0nzI.gifv_NN,,8208.0,http://i.imgur.com/QEQ0nzI.gifv,http://.imgur.com/QEQ0nzI.gifv,http : //.imgur.com/QEQ0nzI.gifv,http_NN //.imgur.com/QEQ0nzI.gifv_NN
1,2dfyrt,17,When I was 14 it made me the cool rebellious t...,When I 14 made cool rebellious type something ...,When I 14 made cool rebellious type something ...,made_VBD cool_NN type_NN something_NN known_VB...,When_WRB I_PRP made_VBD known_VBN smoke_VB dri...,5.0,When I was 14 it made me the cool rebellious t...,When I 14 made cool rebellious type something ...,When I 14 made cool rebellious type something ...,made_VBD cool_NN type_NN something_NN known_VB...
2,oz9zz2,1,So 16 days ago I entered cum mode by flicking ...,So 16 days ago I entered cum mode flicking swi...,So 16 day ago I entered cum mode flicking swit...,So_RB day_NN entered_VBD cum_NN mode_NN flicki...,ago_IN I_PRP entered_VBD flicking_VBG I_PRP ’_...,1.0,So 16 days ago I entered cum mode by flicking ...,So 16 days ago I entered cum mode flicking swi...,So 16 day ago I entered cum mode flicking swit...,So_RB day_NN entered_VBD cum_NN mode_NN flicki...
3,4edhjp,2,It mainly works on a by game basis nowadays. T...,It mainly works game basis nowadays. Tf2 prett...,It mainly work game basis nowadays . Tf2 prett...,mainly_RB work_VBD game_NN basis_NN nowadays_R...,It_PRP work_VBD mapping_VBG searching_VBG brin...,2.0,It mainly works on a by game basis nowadays. T...,It mainly works game basis nowadays. Tf2 prett...,It mainly work game basis nowadays . Tf2 prett...,mainly_RB work_VBD game_NN basis_NN nowadays_R...
4,1a3bjq,2,Has he been diagnosed by a doctor? That is the...,Has diagnosed doctor? That quickest way settle...,Has diagnosed doctor ? That quickest way settl...,Has_NNP diagnosed_VBN doctor_NN way_NN settle_...,diagnosed_VBN That_DT settle_VB whether_IN,3.0,Has he been diagnosed by a doctor? That is the...,Has diagnosed doctor? That quickest way settle...,Has diagnosed doctor ? That quickest way settl...,Has_NNP diagnosed_VBN doctor_NN way_NN settle_...
...,...,...,...,...,...,...,...,...,...,...,...,...
91514,orgpc4,1,"Observing, attention, memory,","Observing, attention, memory,","Observing , attention , memory ,",Observing_NN attention_NN memory_NN,,4.0,"Observing, attention, memory,","Observing, attention, memory,","Observing , attention , memory ,",Observing_NN attention_NN memory_NN
91515,og7m7k,1,Why is this a quote?,Why quote?,Why quote ?,quote_NN,Why_WRB,2.0,Why is this a quote?,Why quote?,Why quote ?,quote_NN
91516,olnost,1,whyd you put your question in quotes?,whyd put question quotes?,whyd put question quote ?,whyd_NN put_VBD question_NN quote_NN,put_VBD,1.0,whyd you put your question in quotes?,whyd put question quotes?,whyd put question quote ?,whyd_NN put_VBD question_NN quote_NN
91517,ova17m,1,Dm me,Dm,Dm,Dm_NN,,1.0,Dm me,Dm,Dm,Dm_NN


In [48]:
df.to_csv("db/features.csv")

# Building the Test Dataset for Clustering Comparition

Small manual clusters to get a little sence if clustering is working

Esto quizas es muy muy tonto, pero solamente ordenar alfabeticamente las preguntas es una buena forma de encontrar similaridad.


In [29]:
import pandas as pd

df = pd.read_csv("db/features.csv", index_col=0)

n_clus = 5 #no cambiar sin agregar otro
n_sam = 100

df = df.groupby("Q", as_index=False).first()

#testeo de algunos "tipos" de preguntas precatios y de concepto
q_ids = []
q_ids += list(df[df.Q.str.contains(r'favorite movie')].id.sample(n_sam))
q_ids += list(df[df.Q.str.contains(r'ever seen')].id.sample(n_sam))
q_ids += list(df[df.Q.str.contains(r'advice')].id.sample(n_sam))
q_ids += list(df[df.Q.str.contains(r'history')].id.sample(n_sam))
q_ids += list(df[df.Q.str.contains(r'book')].id.sample(n_sam))

# Mostramos un poquito como queda
# for j in range(n_clus):
#    print(f"\ncluster {j}")
#    for i in range(n_sam):
#       print(f"   {df.loc[q_ids[j*n_clus+i]].Q}")


q_ids = [(x, int(i/n_sam)) for i, x in enumerate(q_ids)] 

total = n_clus*n_sam

test_db = []
for i in range(total):
    for j in range(1, total-i):
        este = q_ids[i]
        otro = q_ids[j+i]
        test_db += [[este[0], otro[0], este[1] == otro[1]]]


test_db = pd.DataFrame(test_db)
test_db = test_db[test_db[0] != test_db[1]]


In [30]:
test_db

Unnamed: 0,0,1,2
0,ot7072,or2de9,True
1,ot7072,19cphd,True
2,ot7072,axdit/,True
3,ot7072,okem4v,True
4,ot7072,jrbzaq,True
...,...,...,...
124745,g9dsti,owgzad,True
124746,g9dsti,oov8sa,True
124747,fwm66y,owgzad,True
124748,fwm66y,oov8sa,True


In [31]:
test_db.to_csv("db/test_db.csv")

In [42]:
emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002500-\U00002BEF"  # chinese char
                            u"\U00002702-\U000027B0"
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            u"\U0001f926-\U0001f937"
                            u"\U00010000-\U0010ffff"
                            u"\u2640-\u2642"
                            u"\u2600-\u2B55"
                            u"\u200d"
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\ufe0f"  # dingbats
                            u"\u3030"
                            "]+", flags=re.UNICODE)