# Build Datasets

I made the corpus here: [kaggle.com/mauroebordon/creating-a-qa-corpus-from-askreddit/](https://www.kaggle.com/mauroebordon/creating-a-qa-corpus-from-askreddit/)


In [4]:
import pandas as pd
import re
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# English stopword list from NLTK. Note that extract_features below does not
# read this variable; it calls stopwords.words('english') itself.
stop = stopwords.words('english')

# Load the AskReddit Q/A corpus; the first CSV column becomes the row index.
db_df = pd.read_csv("db/ask-reddit-corpus.csv", index_col=0)

## Feature Extraction

Only Tokens, Lemmas, PoS AND stopword filtering for now.


In [5]:

def _topic_keys(tagged, selected_tags):
    """Join the ``word_TAG`` pairs of *tagged* whose tag is in *selected_tags*.

    Each kept pair is followed by one space, so a non-empty result ends with
    a trailing space.
    """
    return "".join(
        f"{word}_{tag} " for word, tag in tagged if tag in selected_tags
    )


def extract_features(df: pd.DataFrame, toks=True, lems=True, pos=True, tags=True) -> pd.DataFrame:
    """Clean the Q/A corpus and derive the text features used by the models.

    Steps, in order:

    1. keep questions whose string form is 1-49 characters long;
    2. collapse repeated questions (first non-null value per column wins);
    3. strip URLs, leading ``[Tag]`` markers, double quotes and emoji;
    4. remove English stopwords (case-sensitive) into ``Qless`` / ``Aless``;
    5. tokenize, lemmatize and build ``word_TAG`` key strings.

    Args:
        df: Corpus with at least the columns ``id``, ``Q`` and ``ANS``.
        toks: Include the token-list columns ``Qtoks`` / ``Atoks``.
        lems: Include the lemma-string columns ``Qlemmas`` / ``Alemmas``.
        pos: Accepted for backward compatibility. POS pairs are only an
            intermediate for the key columns and are never part of the
            returned frame, so this flag has no effect on its own.
        tags: Include the key columns ``Qkeys`` / ``Akeys`` (space-separated
            ``word_TAG`` pairs restricted to a fixed set of POS tags).

    Returns:
        A new frame with ``id``, ``Q``, ``Qless``, ``ANS``, ``Aless`` plus
        the optional columns enabled by the flags. With the default flags
        the columns are, in order: ``id, Q, Qless, Qlemmas, Qkeys, Qtoks,
        ANS, Aless, Alemmas, Akeys, Atoks``.

    Raises:
        KeyError: If ``df`` lacks one of the required columns.

    Note:
        Relies on the module-level ``emoji_pattern`` compiled regex, which
        must be defined before this function is called.
    """
    lmtzr = WordNetLemmatizer()

    selected_tags = {"DT", "IN", "PRP", "PRP$", "FW","NN","NNS","NNP","NNPS","PDT","RB","RBR","RBS","RP","VB","VBD","VBG","VBN","VBP","VBZ", "WDT", "WP", "WP$", "WRB"}

    # Drop questions that are empty or too long (50+ characters).
    q_len = df["Q"].astype(str).str.len()
    db_df = df[(q_len > 0) & (q_len < 50)].copy()

    # People are not that original and repeat questions: keep one row each.
    db_df = db_df.groupby("Q", as_index=False).first()

    # Remove URLs. regex=True is explicit because recent pandas treats the
    # pattern as a literal string by default.
    for col in ("Q", "ANS"):
        db_df[col] = db_df[col].str.replace(r"http\S+", "", regex=True)

    # Remove leading markers such as "[Serious]" or "[NSFW]" (this
    # information could be kept instead). Each bracket group is matched
    # individually so a later "[...]" inside the question is left alone.
    db_df["Q"] = db_df["Q"].str.replace(r"^(?:\[[^\]]*\]\s*)+", "", regex=True)

    # Remove straight and curly double quotes from questions and answers.
    for col in ("Q", "ANS"):
        db_df[col] = db_df[col].str.replace(r"[\"\“\”]", "", regex=True)

    # Remove emoji from the questions.
    db_df["Q"] = db_df["Q"].str.replace(emoji_pattern, "", regex=True)

    # Remove stopwords (plus the whitespace that follows each one).
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    db_df["Qless"] = db_df["Q"].str.replace(pattern, '', regex=True)
    db_df["Aless"] = db_df["ANS"].str.replace(pattern, '', regex=True)

    # Lemmas are derived from tokens and keys from lemmas, so compute every
    # prerequisite that is needed even when its own column was not requested.
    need_lemmas = lems or tags
    need_tokens = toks or need_lemmas

    columns = ["id"]
    for prefix, text_col in (("Q", "Q"), ("A", "ANS")):
        columns += [text_col, f"{prefix}less"]

        token_lists = []
        if need_tokens:
            token_lists = [word_tokenize(text) for text in db_df[f"{prefix}less"]]

        lemma_strings = []
        if need_lemmas:
            lemma_strings = [
                ' '.join(lmtzr.lemmatize(tok) for tok in sent)
                for sent in token_lists
            ]

        if lems:
            db_df[f"{prefix}lemmas"] = lemma_strings
            columns.append(f"{prefix}lemmas")

        if tags:
            # POS-tag the re-tokenized lemma string and keep the selected tags.
            db_df[f"{prefix}keys"] = [
                _topic_keys(pos_tag(word_tokenize(sent)), selected_tags)
                for sent in lemma_strings
            ]
            columns.append(f"{prefix}keys")

        if toks:
            db_df[f"{prefix}toks"] = token_lists
            columns.append(f"{prefix}toks")

    return db_df[columns]
    

# NOTE (translated from Spanish): using pickle is of no use for compressing this.
# Run the feature extraction on the loaded corpus and display the result.
df = extract_features(db_df)
df

  db_df["Q"] = db_df["Q"].str.replace("http\S+", "")
  db_df["ANS"] = db_df["ANS"].str.replace("http\S+", "")


Unnamed: 0,id,Q,Qless,Qlemmas,Qkeys,Qtoks,ANS,Aless,Alemmas,Akeys,Atoks
0,bc60pk,"Donut or doughnut, & where are you from?","Donut doughnut, & ?","Donut doughnut , & ?",Donut_NNP doughnut_NN,"[Donut, doughnut, ,, &, ?]","donut, southern usa","donut, southern usa","donut , southern usa",donut_NN usa_NN,"[donut, ,, southern, usa]"
1,289mmm,Help finding a song from a video description?,Help finding song video description?,Help finding song video description ?,Help_NNP finding_NN song_NN video_NN descripti...,"[Help, finding, song, video, description, ?]",/r/tipofmytongue,/r/tipofmytongue,/r/tipofmytongue,/r/tipofmytongue_NN,[/r/tipofmytongue]
2,rufpr/,"I was rapedNo, we had sex","I rapedNo, sex","I rapedNo , sex",I_PRP rapedNo_VBP sex_NN,"[I, rapedNo, ,, sex]","if its not a yes, its no.","yes, .","yes , .",yes_RB,"[yes, ,, .]"
3,24vtw3,Jesus H. Christ what's his middle name?,Jesus H. Christ 'middle name?,Jesus H. Christ 'middle name ?,Jesus_NNP H._NNP Christ_NNP name_NN,"[Jesus, H., Christ, 'middle, name, ?]",umm... Holy. As in Holy Christ. Isn't this com...,umm... Holy. As Holy Christ. Isn'common knowle...,umm ... Holy . As Holy Christ . Isn'common kno...,umm_NN Holy_NNP As_IN Holy_NNP Christ_NNP Isn'...,"[umm, ..., Holy, ., As, Holy, Christ, ., Isn'c..."
4,sv054/,Judith from hotmail sent me a message..... ),Judith hotmail sent message..... ),Judith hotmail sent message ..... ),Judith_NNP hotmail_NN sent_VBD message_NN .......,"[Judith, hotmail, sent, message, ....., )]",sounds like a great idea!,sounds like great idea!,sound like great idea !,sound_NN like_IN idea_NN,"[sounds, like, great, idea, !]"
...,...,...,...,...,...,...,...,...,...,...,...
90873,orgpc4,"Quiet people, what are your strengths?","Quiet people, strengths?","Quiet people , strength ?",people_NNS strength_NN,"[Quiet, people, ,, strengths, ?]","Observing, attention, memory,","Observing, attention, memory,","Observing , attention , memory ,",Observing_NN attention_NN memory_NN,"[Observing, ,, attention, ,, memory, ,]"
90874,og7m7k,What is the best indian movie?,What best indian movie?,What best indian movie ?,What_WP movie_NN,"[What, best, indian, movie, ?]",Why is this a quote?,Why quote?,Why quote ?,Why_WRB quote_NN,"[Why, quote, ?]"
90875,olnost,"Who's the best MC's, Biggie, Jay-Z, and Nas?","Who'best MC', Biggie, Jay-Z, Nas?","Who'best MC ' , Biggie , Jay-Z , Nas ?",Who'best_NNP MC_NNP Biggie_NNP Jay-Z_NNP Nas_NNP,"[Who'best, MC, ', ,, Biggie, ,, Jay-Z, ,, Nas, ?]",whyd you put your question in quotes?,whyd put question quotes?,whyd put question quote ?,whyd_NN put_VBD question_NN quote_NN,"[whyd, put, question, quotes, ?]"
90876,ova17m,what color next?,color next?,color next ?,color_NN next_IN,"[color, next, ?]",Dm me,Dm,Dm,Dm_NN,[Dm]


In [6]:
# Persist the extracted features. Columns that hold Python lists (Qtoks, Atoks)
# are written as their string representation, so they are read back as strings.
df.to_csv("db/features.csv")

# Building the Test Dataset for Clustering Comparison

Small manual clusters to get a rough sense of whether clustering is working.

Esto quizas es muy muy tonto, pero solamente ordenar alfabeticamente las preguntas es una buena forma de encontrar similaridad.


In [16]:
import pandas as pd

from itertools import combinations

df = pd.read_csv("db/features.csv", index_col=0)

# Number of question ids sampled for each hand-picked cluster.
n_sam = 100

# One pattern per hand-picked "type" of question; a question belongs to a
# cluster when its text contains the pattern. The cluster count is derived
# from this list, so adding or removing a pattern is the only change needed.
cluster_patterns = [
    r'favorite movie',
    r'ever seen',
    r'advice',
    r'history',
    r'book',
]
n_clus = len(cluster_patterns)

# Keep a single row per distinct question text.
df = df.groupby("Q", as_index=False).first()

# Sample n_sam ids per pattern, in pattern order. sample() raises ValueError
# when a pattern matches fewer than n_sam questions.
# NOTE: a question that matches two patterns can be drawn into both clusters.
q_ids = []
for cluster_pattern in cluster_patterns:
    q_ids += list(df[df.Q.str.contains(cluster_pattern)].id.sample(n_sam))

# Quick preview of the sampled clusters:
# for j in range(n_clus):
#     print(f"\ncluster {j}")
#     for qid in q_ids[j * n_sam:(j + 1) * n_sam]:
#         print(f"   {df.loc[df.id == qid, 'Q'].iloc[0]}")

# Attach the cluster number to every id: positions 0..n_sam-1 are cluster 0,
# the next n_sam are cluster 1, and so on.
q_ids = [(x, i // n_sam) for i, x in enumerate(q_ids)]

total = n_clus*n_sam

# One row per unordered pair of sampled ids:
# (first id, second id, True when both come from the same cluster).
test_db = pd.DataFrame(
    [[este[0], otro[0], este[1] == otro[1]] for este, otro in combinations(q_ids, 2)]
)

# Drop pairs that compare an id with itself (possible when the same question
# was sampled for two clusters).
test_db = test_db[test_db[0] != test_db[1]]


In [17]:
test_db

Unnamed: 0,0,1,2
0,oynje9,om5jlt,True
1,oynje9,ojx9n9,True
2,oynje9,oq72gi,True
3,oynje9,ow05dx,True
4,oynje9,oikfa5,True
...,...,...,...
124745,dfgxx/,okua7s,True
124746,dfgxx/,onb2lj,True
124747,dp658/,okua7s,True
124748,dp658/,onb2lj,True


In [18]:
# Write the labelled pair table to disk; the row index is written as the
# first CSV column.
test_db.to_csv("db/test_db.csv")

In [3]:
# Compiled character class matching one or more "emoji-like" characters.
# extract_features reads this module-level name, so this cell has to be run
# before that function is called.
#
# NOTE(review): the class is much broader than emoji. The U+24C2-U+1F251 and
# U+10000-U+10FFFF ranges together cover every code point from U+24C2 upward,
# which includes CJK, Hangul and other non-Latin scripts. Narrow those two
# ranges if non-emoji text above U+24C2 must be preserved.
emoji_pattern = re.compile("["
                            "\U0001F600-\U0001F64F"  # emoticons
                            "\U0001F300-\U0001F5FF"  # symbols & pictographs
                            "\U0001F680-\U0001F6FF"  # transport & map symbols
                            "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
                            "\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows
                            "\U00002702-\U000027B0"  # dingbats
                            "\U000024C2-\U0001F251"  # very wide span, see NOTE above
                            "\U0001f926-\U0001f937"  # gesture emoji
                            "\U00010000-\U0010ffff"  # all supplementary planes
                            "\u2640-\u2642"  # gender signs
                            "\u2600-\u2B55"  # misc symbols .. U+2B55
                            "\u200d"  # zero-width joiner
                            "\u23cf"  # eject symbol
                            "\u23e9"  # fast-forward symbol
                            "\u231a"  # watch
                            "\ufe0f"  # variation selector-16
                            "\u3030"  # wavy dash
                            "]+", flags=re.UNICODE)

In [19]:
df.Q

0                      29 year old male stuck in cum mode!
1                         Active community for Source SDK?
2                      Alcoholic brother claims cirrhosis?
3                         An end to No Sexual Topics week.
4                               Announcing our newest tag!
                               ...                        
90779    your username is how you’ll die how will you die?
90780        your username is your username. what happens?
90781      youtubers of reddit.... what is your MCN story?
90782                        ¿Lo que muele tus engranajes?
90783                 ¿What easy work is really difficult?
Name: Q, Length: 90784, dtype: object