In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
import json
import re
import string
from collections import Counter

from nltk.corpus import stopwords

In [28]:
def to_dict(string):
    if string != "[]":
        string = json.loads(string.replace("'", "\""))
        return ",".join([s["screen_name"] for s in string])
    return ""

def to_list(list_):
    if list_ != "[]":
        list_ = list_[1:-1]
        list_ = list_.split(",")
        return ",".join([s.strip().strip("'") for s in list_])
    return ""

def normalize(s):
    replacements = (("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u"))
    for a, b in replacements:
        s = s.lower()
        s = s.replace(a, b)
    return s

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0000270D"
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r"", text)

def cleanTxt(text):
    text = re.sub(r"@[a-zA-Z0-9]+", "", text) #Removes @mentions
    text = re.sub(r"#", "", text) #Removing the "#" symbol
    text = re.sub(r"RT[\s]+", "", text) #Removing RT
    text = re.sub(r"https?:\/\/\S+", "", text) #Remove the hyperlink
    return text

def replace_punct(s):
    for i in string.punctuation:
        if i in s:
            s = s.replace(i, "").strip()
    return s

def replace_num(s):
    for i in ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]:
        s = s.replace(i, "")
    return s

def preprocessor(text):
    text = re.sub(r"[\W]+", "", text.lower()) 
    return text

def tokenizador(text):
    important_words = []
    for word in text.split(" "):
        if word not in stopwords.words("spanish"):
            if word != "":
                important_words.append(word)
    return " ".join(important_words).strip()

def foo(text):
    forbidden = ("?", "¿", "¡", "!", ",", ".", ";", ":", "-", "'", "+", "$", "/", "*",'«','»', "~", "(", ")")
    aux = ""
    for v in text:
        if not v in forbidden:
            aux += v
    return aux

def quita_palabras_pequeñas(text):
    return " ".join([word for word in text.split(" ") if len(word) > 2])            

In [29]:
%%time
df = pd.read_csv("24_notrending.csv")
df.drop(df.columns[0], axis = 1, inplace = True)

Wall time: 6.33 s


In [57]:
columns_to_drop = ["conversation_id", "cashtags", "timezone", "user_id", "name", "near", "geo", "source",
                   "user_rt_id", "user_rt", "retweet_id", "retweet_date", "translate", "trans_src",
                   "trans_dest", "place", "quote_url", "thumbnail", "created_at", "id", "link"]

df.drop(columns_to_drop, axis = 1, inplace = True)

df = df[df.language == "es"]

df.drop("language", axis = 1, inplace = True)

df = df.reset_index().drop("index", axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [31]:
reply_to_rows = []
for num, row in enumerate(df.reply_to):
    try:
        to_dict(row)
    except:
        reply_to_rows.append(num)
        
df.drop(reply_to_rows, inplace = True)

df.reply_to = df.reply_to.apply(to_dict)

df = df.reset_index().drop("index", axis = 1)

In [32]:
mention_rows = []
for num, row in enumerate(df.mentions):
    try:
        to_dict(row)
    except:
        mention_rows.append(num)
        
df.drop(mention_rows, inplace = True)

df.mentions = df.mentions.apply(to_dict)

df = df.reset_index().drop("index", axis = 1)

In [33]:
hashtags_rows = []
for num, row in enumerate(df.hashtags):
    try:
        to_list(row)
    except:
        hashtags_rows.append(num)
        
df.drop(hashtags_rows, inplace = True)

df.hashtags = df.hashtags.apply(to_list)

df = df.reset_index().drop("index", axis = 1)

In [34]:
df.photos = df.photos.apply(lambda x : 1 if x != "[]" else 0)
df.retweet = df.retweet.apply(lambda x : 1 if x == "True" else 0)
df.urls = df.urls.apply(lambda x : 1 if x != "[]" else 0)

In [35]:
df["month"] = df.date.apply(lambda x : x[5 : 7])
df["day"] = df.date.apply(lambda x : x[-2:])

df["hour"] = df.time.apply(lambda x : x[:2])
df["minute"] = df.time.apply(lambda x : x[3:5])
df["second"] = df.time.apply(lambda x : x[6:])

In [36]:
df["mentions_count"] = [len(mention.split(",")) if type(mention) == str else 0 for mention in df.mentions]

df["reply_to_count"] = [len(reply.split(",")) if type(reply) == str else 0 for reply in df.reply_to]

df["hashtags_count"] =  [len(hashtag.split(",")) if type(hashtag) == str else 0 for hashtag in df.hashtags]

df["interaccion"] = [rt + re + lk for rt, re, lk in zip(df.retweets_count, df.replies_count, df.likes_count)]

In [37]:
indices_todrop = list()
for num, time in enumerate(df.time):
    if type(time) != str:
        indices_todrop.append(num)
        
df.drop(indices_todrop, inplace = True)

df = df.reset_index()
df.drop(df.columns[0], axis = 1, inplace = True)

In [38]:
# Filtro por el dia 24 o 25

FECHA = '24'

df = df[df.day == FECHA]
df = df.reset_index()
df.drop(df.columns[0], axis = 1, inplace = True)

print(df.shape)

(813204, 46)


In [39]:
%%time

# Eliminio las filas que no tengan texto en el tweet

tweet_na = []

for num, tweet in enumerate(df.tweet):
    if type(tweet) != str:
        tweet_na.append(num)


df.drop(tweet_na, inplace = True)

df = df.reset_index()
df.drop(df.columns[0], axis = 1, inplace = True)

Wall time: 1.44 s


In [47]:
# Cargo las tendencias de ese dia

tendencias = []
with open("dia 24 tendencias.txt", "r") as f:
    tendencias.extend(f.readlines())
    
tendencias = [t[:-1].strip("/t") for num, t in enumerate(tendencias) if num != len(tendencias) - 1]

df_tendencias = pd.DataFrame(tendencias, columns = ["trends"])
df_tendencias = df_tendencias.trends.unique()
df_tendencias = pd.DataFrame(df_tendencias, columns = ["trends"])
solo_tendencias = list(df_tendencias.trends.unique())

In [48]:
# Lista de palabras tendencias y hashtags tendencias

hashtags_tendencias = [t for t in solo_tendencias if t[0] == "#"]
palabras_tendencias = [t.strip("\t") for t in solo_tendencias if t[0] != "#"]
hashtags_tendencias_sin_numeral = [t.strip("#").lower() for t in solo_tendencias if t[0] == "#"]
palabras_tendencias_lower = [t.strip("\t").lower() for t in solo_tendencias if t[0] != "#"]

print("hashtags_tendencias:", len(hashtags_tendencias))
print("palabras_tendencias:", len(palabras_tendencias))

print("hashtags_tendencias_sin_numeral:", len(hashtags_tendencias_sin_numeral))
print("palabras_tendencias_lower:", len(palabras_tendencias_lower))

hashtags_tendencias: 148
palabras_tendencias: 159
hashtags_tendencias_sin_numeral: 148
palabras_tendencias_lower: 159


# HASHTAGS NO-TENDENCIA MAS REPETIDAS

In [49]:
%%time

# Cuento cuantos hashtags hay en el df y me quedo con los mas repetidos

hashtags_no_tendencias = list()
for h in df.hashtags:
    for hashtag in h.split(","):
        if hashtag not in hashtags_tendencias and hashtag != "nan":
            hashtags_no_tendencias.append(hashtag)
                
hashtags_no_tendencias = Counter(hashtags_no_tendencias).most_common()[:len(hashtags_tendencias_sin_numeral)]
hashtags_no_tendencias = {h[0] : h[1] for h in hashtags_no_tendencias}

print("Numero de hashtasg no tendencia:", len(hashtags_no_tendencias))

Numero de hashtasg no tendencia: 148
Wall time: 1.63 s


In [50]:
%%time

# Saco los indices de las filas que tengan hashtags tendencias

hashtags_indices = []
for num, h in enumerate(df.hashtags):
    for hashtag in h.split(","):
        if hashtag.lower() in hashtags_tendencias_sin_numeral:
            hashtags_indices.append(num)
                
print("Cantidad de tweets con hashtags tendencias:", len(hashtags_indices))

df.drop(hashtags_indices, inplace = True)

df = df.reset_index()
df.drop(df.columns[0], axis = 1, inplace = True)

Cantidad de tweets con hashtags tendencias: 11134
Wall time: 2.59 s


# PALABRAS CLAVES NO-TENDENCIA MAS REPETIDAS

In [51]:
%%time

# Voy a quitar los tweets que tengan palabras claves tendencias

palabras_indices = []

for num, tweet in enumerate(df.tweet):
    for palabra in palabras_tendencias_lower:
        if tweet.lower().find(palabra) != -1:
            palabras_indices.append(num)

print(len(palabras_indices))
            
df.drop(palabras_indices, inplace=True)

df = df.reset_index().drop(df.columns[0], axis = 1)

109505
Wall time: 2min 16s


In [None]:
%%time

# Ahora voy a limpiar los tweets, para poder ver que palabras claves no tendencia se repiten mas

df.tweet = df.tweet.apply(normalize)
df.tweet = df.tweet.apply(deEmojify)
df.tweet = df.tweet.apply(cleanTxt)
df.tweet = df.tweet.apply(replace_punct)
df.tweet = df.tweet.apply(replace_num)
df.tweet = df.tweet.apply(quita_palabras_pequeñas)

df.tweet = df.tweet.apply(tokenizador)
df.tweet = df.tweet.apply(foo)

In [53]:
%%time
# Dropeo las filas de tweets que tengan texto ""

tweet_vacios = []

for num, tweet in enumerate(df.tweet):
    if tweet == "":
        tweet_vacios.append(num)

print(len(tweet_vacios))        

df.drop(tweet_vacios, inplace = True)

df = df.reset_index()
df.drop(df.columns[0], axis = 1, inplace = True)

0
Wall time: 1.19 s


In [54]:
%%time

# Cuanto cuantos palabras hay en el df y me quedo con los mas repetidos

palabras_no_tendencias = list()
for p in df.tweet:
    for palabra in p.split(" "):
        palabras_no_tendencias.append(palabra)
            
palabras_no_tendencias = Counter(palabras_no_tendencias).most_common()[:len(palabras_tendencias)]
palabras_no_tendencias = {h[0] : h[1] for h in palabras_no_tendencias}

print(len(palabras_no_tendencias))

159
Wall time: 7.25 s


In [62]:
df.hashtags = df.hashtags.apply(str)

Unnamed: 0,level_0,date,time,username,tweet,mentions,urls,photos,replies_count,retweets_count,...,reply_to,month,day,hour,minute,second,mentions_count,reply_to_count,hashtags_count,interaccion
0,0,2020-02-24,23:59:59,germanlorenz21,"@diegojokas Que hdp que sos Jokas,jajaja.Gol e...",,0,0,0.0,0.0,...,diegojokas,02,24,23,59,59,1,1,1,0.0
1,2,2020-02-24,23:59:59,ruddypujolscasa,@MeltonPinedaF Y el pueblo le entró al PLD que...,,0,0,0.0,0.0,...,MeltonPinedaF,02,24,23,59,59,1,1,1,2.0
2,3,2020-02-24,23:59:59,carlosh29,@CalleLaJeta_ llegan y los sacan en carro de ...,,0,0,1.0,0.0,...,CalleLaJeta_,02,24,23,59,59,1,1,1,4.0
3,4,2020-02-24,23:59:59,fer_ortiz17,Sabían que se debe llamar realmente mejor amig...,,0,0,1.0,0.0,...,,02,24,23,59,59,1,1,1,5.0
4,6,2020-02-24,23:59:59,matmanchile,@unmishiprr @marianelanelaff @vilm_valle @coni...,,0,0,0.0,0.0,...,"marianelanelaff,vilm_valle,conilewin",02,24,23,59,59,1,3,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655688,701798,2020-02-24,01:00:17,milenialeyva,Esta noche estoy más enganchada a los comentar...,,0,0,5.0,1.0,...,,02,24,01,00,17,1,1,1,69.0
655689,701799,2020-02-24,01:00:13,alejandroxx__,A ver si le entiende alguien d una vezzzz,,0,0,0.0,0.0,...,,02,24,01,00,13,1,1,1,0.0
655690,701801,2020-02-24,01:00:10,pantxisko,@sheepols @mnicolas83 chaval es víctima de bul...,,0,0,0.0,0.0,...,mnicolas83,02,24,01,00,10,1,1,1,0.0
655691,701804,2020-02-24,01:00:03,relojjarando,🕐 Talón,,0,0,1.0,0.0,...,,02,24,01,00,03,1,1,1,1.0


# AHORA SEPARAMOS EL DF: LOS QUE TIENE HASHTAGS Y CON LOS QUE NO

# DF_H (df de los hashtags)

In [64]:
df_h = df[df.hashtags != ""]

In [65]:
df_h.drop(df_h.columns[0], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [66]:
df_h = df_h.reset_index()
df_h.drop(df_h.columns[0], axis = 1, inplace = True)

In [67]:
df_h["trends"] = [[h if h in hashtags_no_tendencias else 0 for h in hashtag.split(",")] for hashtag in df_h.hashtags]

In [80]:
indices_drop = list()
for num, t in enumerate(df_h.trends):
    if 0 in t:
        indices_drop.append(num)
        
df_h.drop(indices_drop, inplace = True)

df_h = df_h.reset_index()
df_h.drop(df_h.columns[0], axis = 1, inplace = True)

In [81]:
indices_para_clonar = list()
for num, t in enumerate(df_h.trends):
    if len(t) > 1:
        indices_para_clonar.append(num)

In [82]:
dic_indices = {indice : [len(trends), trends] for indice, trends in zip(indices_para_clonar, df_h.loc[indices_para_clonar].trends)}

df_v = pd.DataFrame(columns = df_h.columns)

for key in dic_indices.keys():
    for time in range(dic_indices[key][0]):
        df_d = pd.DataFrame(df_h.loc[key]).T
        df_d.drop(df_d.columns[-1], axis = 1, inplace = True)
        df_d["trends"] = dic_indices[key][1][time]
        df_v = pd.concat([df_v, df_d])

In [83]:
df_h.drop(indices_para_clonar, inplace = True)

df_h = df_h.reset_index()
df_h.drop(df_h.columns[0], axis = 1, inplace = True)

In [84]:
df_h.trends = df_h.trends.apply(lambda x : x[0]) 

In [85]:
df_h = pd.concat([df_h, df_v])
df_h

Unnamed: 0,date,time,username,tweet,mentions,urls,photos,replies_count,retweets_count,likes_count,...,month,day,hour,minute,second,mentions_count,reply_to_count,hashtags_count,interaccion,trends
0,2020-02-24,23:59:53,enfermera_soy,Me pregunto tu interés numérico sobre las muer...,,0,0,0,8,14,...,02,24,23,59,53,1,1,1,22,coronavirus
1,2020-02-24,23:59:52,dralo_,@2010MisterChip #YoSoyDeLos3MillonesDeMisterCh...,twelvemer12,0,0,0,0,0,...,02,24,23,59,52,1,1,1,0,yosoydelos3millonesdemisterchip
2,2020-02-24,23:59:37,marcelo_urbanor,"Llevamos 5 fechas, y ya estamos a 12 puntos de...",,0,0,0,0,0,...,02,24,23,59,37,1,1,1,0,vamoscolocolo
3,2020-02-24,23:59:36,marcelovic96,#VamosColoColo él es el hombre https://t.co/A...,,0,1,3,0,1,...,02,24,23,59,36,1,1,1,4,vamoscolocolo
4,2020-02-24,23:59:29,jmolinapelayo,Cada partido de #ColoColo se juega peor que el...,,0,0,0,0,0,...,02,24,23,59,29,1,1,1,0,colocolo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12498,2020-02-24,09:02:11,equo,"España, líder europeo en infracciones ambienta...",,1,0,0,35,28,...,02,24,09,02,11,1,1,2,63,barcelona
12504,2020-02-24,04:35:21,fashionbysutton,#madrid Algunos consejos de seguridad contra e...,,1,0,0,0,0,...,02,24,04,35,21,1,1,2,0,madrid
12504,2020-02-24,04:35:21,fashionbysutton,#madrid Algunos consejos de seguridad contra e...,,1,0,0,0,0,...,02,24,04,35,21,1,1,2,0,coronavirus
12506,2020-02-24,02:51:46,geniotweets,"No hay que ser "" Pitágoras "" para saber que Pa...",,0,0,2,4,4,...,02,24,02,51,46,1,1,2,10,coronavirus


# DF_P (df de las palabras claves)

In [87]:
df_p = df[df.hashtags == ""]
df_p.drop(df_p.columns[0], axis = 1, inplace = True) 

In [88]:
df_p = df_p.reset_index()
df_p.drop(df_p.columns[0], axis = 1, inplace = True)

In [89]:
df_p["trends"] = [[p for p in palabra.split(" ") if p in palabras_no_tendencias] for palabra in df_p.tweet]

In [90]:
indices_drop = list()
for num, trend in enumerate(df_p.trends):
    if trend == []:
        indices_drop.append(num)
        
df_p.drop(indices_drop, inplace = True)

df_p = df_p.reset_index()
df_p.drop(df_p.columns[0], axis = 1, inplace = True)

In [91]:
df_p

Unnamed: 0,date,time,username,tweet,mentions,urls,photos,replies_count,retweets_count,likes_count,...,month,day,hour,minute,second,mentions_count,reply_to_count,hashtags_count,interaccion,trends
0,2020-02-24,23:59:59,germanlorenz21,"@diegojokas Que hdp que sos Jokas,jajaja.Gol e...",,0,0,0.0,0.0,0.0,...,02,24,23,59,59,1,1,1,0.0,"[Que, que, en, , el, no, un]"
1,2020-02-24,23:59:59,ruddypujolscasa,@MeltonPinedaF Y el pueblo le entró al PLD que...,,0,0,0.0,0.0,2.0,...,02,24,23,59,59,1,1,1,2.0,"[Y, el, le, al, que, e, que, van]"
2,2020-02-24,23:59:59,carlosh29,@CalleLaJeta_ llegan y los sacan en carro de ...,,0,0,1.0,0.0,3.0,...,02,24,23,59,59,1,1,1,4.0,"[y, los, , en, de, y, , de, , desde, el]"
3,2020-02-24,23:59:59,fer_ortiz17,Sabían que se debe llamar realmente mejor amig...,,0,0,1.0,0.0,4.0,...,02,24,23,59,59,1,1,1,5.0,"[que, se, mejor, a, quien, a, sin, y, a, , no,..."
4,2020-02-24,23:59:59,matmanchile,@unmishiprr @marianelanelaff @vilm_valle @coni...,,0,0,0.0,0.0,1.0,...,02,24,23,59,59,1,3,1,1.0,"[de, al, para, la, a, la, a, bien, con]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569963,2020-02-24,01:00:26,martianthen,Yo creo que es una estrategia para que la gent...,,0,0,0.0,0.0,0.0,...,02,24,01,00,26,1,1,1,0.0,"[Yo, creo, que, es, una, para, que, la, gente,..."
569964,2020-02-24,01:00:13,alejandroxx__,A ver si le entiende alguien d una vezzzz,,0,0,0.0,0.0,0.0,...,02,24,01,00,13,1,1,1,0.0,"[A, ver, si, le, una]"
569965,2020-02-24,01:00:10,pantxisko,@sheepols @mnicolas83 chaval es víctima de bul...,,0,0,0.0,0.0,0.0,...,02,24,01,00,10,1,1,1,0.0,"[es, de, y, así, de, una, No, esa, de, al, me,..."
569966,2020-02-24,01:00:03,relojjarando,🕐 Talón,,0,0,1.0,0.0,0.0,...,02,24,01,00,03,1,1,1,1.0,[]


In [92]:
indices_para_clonar = list()
for num, t in enumerate(df_p.trends):
    if len(t) > 1:
        indices_para_clonar.append(num)

In [None]:
dic_indices = {indice : [len(trends), trends] for indice, trends in zip(indices_para_clonar, df_p.loc[indices_para_clonar].trends)}

df_v = pd.DataFrame(columns = df_p.columns)

for key in dic_indices.keys():
    for time in range(dic_indices[key][0]):
        df_d = pd.DataFrame(df_p.loc[key]).T
        df_d.drop(df_d.columns[-1], axis = 1, inplace = True)
        df_d["trends"] = dic_indices[key][1][time]
        df_v = pd.concat([df_v, df_d])

In [None]:
df_p.drop(indices_para_clonar, inplace = True)

df_p = df_p.reset_index()
df_p.drop(df_p.columns[0], axis = 1, inplace = True)

In [None]:
df_p.trends = df_p.trends.apply(lambda x : x[0]) 

In [None]:
df_p = pd.concat([df_p, df_v])
df_p

In [None]:
df_procesado = pd.concat([df_h, df_p])
df_procesado.to_csv("tweet_{}_notendencia_preprocesado.csv".format(FECHA), sep = ";", index = False)