In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
import json
import re
import string
from collections import Counter

from nltk.corpus import stopwords

In [2]:
def normalize(s):
    replacements = (("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u"))
    for a, b in replacements:
        s = s.lower()
        s = s.replace(a, b)
    return s

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0000270D"
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r"", text)

def cleanTxt(text):
    text = re.sub(r"@[a-zA-Z0-9]+", "", text) #Removes @mentions
    text = re.sub(r"#", "", text) #Removing the "#" symbol
    text = re.sub(r"RT[\s]+", "", text) #Removing RT
    text = re.sub(r"https?:\/\/\S+", "", text) #Remove the hyperlink
    return text

def replace_punct(s):
    for i in string.punctuation:
        if i in s:
            s = s.replace(i, "").strip()
    return s

def replace_num(s):
    for i in ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]:
        s = s.replace(i, "")
    return s

def preprocessor(text):
    text = re.sub(r"[\W]+", "", text.lower()) 
    return text

def tokenizador(text):
    important_words = []
    for word in text.split(" "):
        if word not in stopwords.words("spanish"):
            if word != "":
                important_words.append(word)
    return " ".join(important_words).strip()

def foo(text):
    forbidden = ("?", "¿", "¡", "!", ",", ".", ";", ":", "-", "'", "+", "$", "/", "*",'«','»', "~", "(", ")")
    aux = ""
    for v in text:
        if not v in forbidden:
            aux += v
    return aux

def quita_palabras_pequeñas(text):
    return " ".join([word for word in text.split(" ") if len(word) > 2])            

In [3]:
%%time
df = pd.read_csv("C:/Users/Daniel/Desktop/csv/no_trending.csv")
df.drop(df.columns[0], axis = 1, inplace = True)



Wall time: 35.9 s


In [4]:
# Filtro por el dia 24 o 25

FECHA = 24

df = df[df.day == FECHA]
df = df.reset_index()
df.drop(df.columns[0], axis = 1, inplace = True)
print(df.shape)

(17332, 17)


In [5]:
# Convierto la columna hashtags a str
df.hashtags = df.hashtags.apply(str)

In [6]:
%%time

# Eliminio las filas que no tengan texto en el tweet

tweet_na = []

for num, tweet in enumerate(df.tweet):
    if type(tweet) != str:
        tweet_na.append(num)


df.drop(tweet_na, inplace = True)

df = df.reset_index()
df.drop(df.columns[0], axis = 1, inplace = True)

Wall time: 107 ms


In [7]:
# Cargo las tendencias de ese dia

tendencias = []
with open("C:/Users/Daniel/Desktop/csv/dia 24/trends/dia 24 tendencias.txt", "r") as f:
    tendencias.extend(f.readlines())
    
tendencias = [t[:-1].strip("/t") for num, t in enumerate(tendencias) if num != len(tendencias) - 1][-50:]

df_tendencias = pd.DataFrame(tendencias, columns = ["trends"])
df_tendencias = df_tendencias.trends.unique()
df_tendencias = pd.DataFrame(df_tendencias, columns = ["trends"])
solo_tendencias = list(df_tendencias.trends.unique())

In [8]:
# Lista de palabras tendencias y hashtags tendencias

hashtags_tendencias = [t for t in solo_tendencias if t[0] == "#"]
palabras_tendencias = [t.strip("\t") for t in solo_tendencias if t[0] != "#"]
hashtags_tendencias_sin_numeral = [t.strip("#").lower() for t in solo_tendencias if t[0] == "#"]
palabras_tendencias_lower = [t.strip("\t").lower() for t in solo_tendencias if t[0] != "#"]

print("hashtags_tendencias:", len(hashtags_tendencias))
print("palabras_tendencias:", len(palabras_tendencias))
print("hashtags_tendencias_sin_numeral:", len(hashtags_tendencias_sin_numeral))
print("palabras_tendencias_lower:", len(palabras_tendencias_lower))

hashtags_tendencias: 26
palabras_tendencias: 24
hashtags_tendencias_sin_numeral: 26
palabras_tendencias_lower: 24


# HASHTAGS NO TENDENCIA MAS REPETIDAS

In [9]:
%%time

# Cuanto cuantos hashtags hay en el df y me quedo con los mas repetidos

hashtags_no_tendencias = list()
for h in df.hashtags:
    for hashtag in h.split(","):
        if hashtag not in hashtags_tendencias and hashtag != "nan":
            hashtags_no_tendencias.append(hashtag)
                
hashtags_no_tendencias = Counter(hashtags_no_tendencias).most_common()[:len(hashtags_tendencias_sin_numeral)]
hashtags_no_tendencias = {h[0] : h[1] for h in hashtags_no_tendencias}

print("Numero de hashtasg no tendencia:", len(hashtags_no_tendencias))

Numero de hashtasg no tendencia: 26
Wall time: 85 ms


In [10]:
%%time

# Saco los indices de las filas que tengan hashtags tendencias

hashtags_indices = []
for num, h in enumerate(df.hashtags):
    for hashtag in h.split(","):
        if hashtag.lower() in hashtags_tendencias_sin_numeral:
            hashtags_indices.append(num)
                
print("Cantidad de tweets con hashtags tendencias:", len(hashtags_indices))

df.drop(hashtags_indices, inplace = True)

df = df.reset_index()
df.drop(df.columns[0], axis = 1, inplace = True)

Cantidad de tweets con hashtags tendencias: 40
Wall time: 151 ms


# PALABRAS CLAVES NO TENDENCIA MAS REPETIDAS

In [11]:
%%time

# Voy a quitar los tweets que tengan palabras claves tendencias

palabras_indices = []

for num, tweet in enumerate(df.tweet):
    for palabra in palabras_tendencias_lower:
        if tweet.lower().find(palabra) != -1:
            palabras_indices.append(num)

print(len(palabras_indices))
            
df.drop(palabras_indices, inplace=True)

df = df.reset_index().drop(df.columns[0], axis = 1)

60
Wall time: 1.72 s


In [12]:
%%time

# Ahora voy a limpiar los tweets, para poder ver que palabras claves no tendencia se repiten mas

df.tweet = df.tweet.apply(normalize)
df.tweet = df.tweet.apply(deEmojify)
df.tweet = df.tweet.apply(cleanTxt)
df.tweet = df.tweet.apply(replace_punct)
df.tweet = df.tweet.apply(replace_num)
df.tweet = df.tweet.apply(quita_palabras_pequeñas)

df.tweet = df.tweet.apply(tokenizador)
df.tweet = df.tweet.apply(foo)

Wall time: 5min 26s


In [13]:
%%time
# Dropeo las filas de tweets que tengan texto ""

tweet_vacios = []

for num, tweet in enumerate(df.tweet):
    if tweet == "":
        tweet_vacios.append(num)

print(len(tweet_vacios))        

df.drop(tweet_vacios, inplace = True)

df = df.reset_index()
df.drop(df.columns[0], axis = 1, inplace = True)

965
Wall time: 104 ms


In [14]:
%%time

# Cuanto cuantos palabras hay en el df y me quedo con los mas repetidos

palabras_no_tendencias = list()
for p in df.tweet:
    for palabra in p.split(" "):
        palabras_no_tendencias.append(palabra)
            
palabras_no_tendencias = Counter(palabras_no_tendencias).most_common()[:len(palabras_tendencias)]
palabras_no_tendencias = {h[0] : h[1] for h in palabras_no_tendencias}

print(len(palabras_no_tendencias))

24
Wall time: 276 ms


# AHORA SEPARAMOS EL DF: LOS QUE TIENE HASHTAGS Y CON LOS QUE NO

# DF_H (df de los hashtags)

In [15]:
df_h = df[df.hashtags != "nan"]

In [16]:
df_h.drop(df_h.columns[0], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
df_h = df_h.reset_index()
df_h.drop(df_h.columns[0], axis = 1, inplace = True)

In [18]:
df_h["trends"] = [[h if h in hashtags_no_tendencias else 0 for h in hashtag.split(",")] for hashtag in df_h.hashtags]

In [19]:
indices_drop = list()
for num, t in enumerate(df_h.trends):
    if 0 in t:
        indices_drop.append(num)
        
df_h.drop(indices_drop, inplace = True)

df_h = df_h.reset_index()
df_h.drop(df_h.columns[0], axis = 1, inplace = True)

In [20]:
indices_para_clonar = list()
for num, t in enumerate(df_h.trends):
    if len(t) > 1:
        indices_para_clonar.append(num)

In [21]:
dic_indices = {indice : [len(trends), trends] for indice, trends in zip(indices_para_clonar, df_h.loc[indices_para_clonar].trends)}

df_v = pd.DataFrame(columns = df_h.columns)

for key in dic_indices.keys():
    for time in range(dic_indices[key][0]):
        df_d = pd.DataFrame(df_h.loc[key]).T
        df_d.drop(df_d.columns[-1], axis = 1, inplace = True)
        df_d["trends"] = dic_indices[key][1][time]
        df_v = pd.concat([df_v, df_d])

In [22]:
df_h.drop(indices_para_clonar, inplace = True)

df_h = df_h.reset_index()
df_h.drop(df_h.columns[0], axis = 1, inplace = True)

In [23]:
df_h.trends = df_h.trends.apply(lambda x : x[0]) 

In [24]:
df_h = pd.concat([df_h, df_v])
df_h

Unnamed: 0,time,username,tweet,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,retweet,video,reply_to,hours,day,month,trends
0,23:50:56,nanoflores75,yosoydelosmillonesdemisterchip datos puros dur...,,0,0,0,0,0,yosoydelos3millonesdemisterchip,0,0,,23,24,2,yosoydelos3millonesdemisterchip
1,23:50:48,miguelsvl,seguir viendo datos reirme contestaciones das ...,,0,0,0,0,0,yosoydelos3millonesdemisterchip,0,0,,23,24,2,yosoydelos3millonesdemisterchip
2,23:50:02,justseero,datos yosoydelosmillonesdemisterchip,,0,0,0,0,0,yosoydelos3millonesdemisterchip,0,0,2010MisterChip,23,24,2,yosoydelos3millonesdemisterchip
3,23:48:01,pisanijavier,yosoydelosmillonesdemisterchip informacion pod...,,0,0,0,0,0,yosoydelos3millonesdemisterchip,0,0,,23,24,2,yosoydelos3millonesdemisterchip
4,23:46:38,fcolopezm_,coronavirus españa positivo canarias cuatro ca...,,1,0,0,0,0,coronavirus,0,0,,23,24,2,coronavirus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,01:39:06,ostwaldguillen,maialen favorita maialen favorita maialen favo...,,0,1,2,33,139,"otgala6,otchat6",0,1,,1,24,2,otchat6
217,01:34:27,saramatarile,parece frivolo dejeis darle abrazo intimidad p...,"soyanneot2020,ot_oficial",0,0,0,1,2,"otgala6,otchat6",0,0,,1,24,2,otgala6
217,01:34:27,saramatarile,parece frivolo dejeis darle abrazo intimidad p...,"soyanneot2020,ot_oficial",0,0,0,1,2,"otgala6,otchat6",0,0,,1,24,2,otchat6
219,01:28:51,carmen_abcdd,gerard anne voy olvidar cuanto salga primero v...,,0,0,1,6,20,"otgala6,otchat6",0,0,,1,24,2,otgala6


# DF_P (df de las palabras claves)

In [25]:
df_p = df[df.hashtags == "nan"]
df_p.drop(df_p.columns[0], axis = 1, inplace = True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [26]:
df_p = df_p.reset_index()
df_p.drop(df_p.columns[0], axis = 1, inplace = True)

In [27]:
df_p["trends"] = [[p for p in palabra.split(" ") if p in palabras_no_tendencias] for palabra in df_p.tweet]

In [28]:
indices_drop = list()
for num, trend in enumerate(df_p.trends):
    if trend == []:
        indices_drop.append(num)
        
df_p.drop(indices_drop, inplace = True)

df_p = df_p.reset_index()
df_p.drop(df_p.columns[0], axis = 1, inplace = True)

In [29]:
df_p

Unnamed: 0,time,username,tweet,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,retweet,video,reply_to,hours,day,month,trends
0,23:59:56,nuriafr16,lightmyfire ver,,0.0,0,0.0,0.0,1.0,,0.0,0.0,__Lightmyfire,23.0,24.0,2.0,[ver]
1,23:59:52,inigojouron,dios infectemos madrid mueran volvemos llevarn...,,0.0,0,1.0,0.0,1.0,,0.0,0.0,MMissingx,23.0,24.0,2.0,[madrid]
2,23:59:31,diecinuevelu,quiero mas ser exploto jodernos vida nuevo mal...,,0.0,0,1.0,1.0,1.0,,0.0,0.0,,23.0,24.0,2.0,"[mas, ser]"
3,23:59:28,old_atarian,viendo serie “sabrina” gratamente sorprendido ...,netflixes,0.0,0,0.0,0.0,1.0,,0.0,0.0,,23.0,24.0,2.0,[tambien]
4,23:59:27,_airwin_,siempre placer volver auditorio navional mucha...,fundscherzo,0.0,1,0.0,1.0,4.0,,0.0,1.0,,23.0,24.0,2.0,"[siempre, gracias]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4781,01:00:45,mrhearoh,noemi terrorista coño haces salva alguien dios...,,0.0,0,1.0,0.0,1.0,,0.0,0.0,,1.0,24.0,2.0,[hace]
4782,01:00:44,yatzirylunaa,doing homework while studying abroad such slap...,,0.0,0,1.0,1.0,13.0,,0.0,0.0,,1.0,24.0,2.0,[the]
4783,01:00:26,martianthen,creo estrategia gente vote mas,,0.0,0,0.0,0.0,0.0,,0.0,0.0,,1.0,24.0,2.0,"[gente, mas]"
4784,01:00:13,alejandroxx__,ver entiende alguien vezzzz,,0.0,0,0.0,0.0,0.0,,0.0,0.0,,1.0,24.0,2.0,[ver]


In [30]:
indices_para_clonar = list()
for num, t in enumerate(df_p.trends):
    if len(t) > 1:
        indices_para_clonar.append(num)

In [None]:
dic_indices = {indice : [len(trends), trends] for indice, trends in zip(indices_para_clonar, df_p.loc[indices_para_clonar].trends)}

df_v = pd.DataFrame(columns = df_p.columns)

for key in dic_indices.keys():
    for time in range(dic_indices[key][0]):
        df_d = pd.DataFrame(df_p.loc[key]).T
        df_d.drop(df_d.columns[-1], axis = 1, inplace = True)
        df_d["trends"] = dic_indices[key][1][time]
        df_v = pd.concat([df_v, df_d])

In [None]:
df_p.drop(indices_para_clonar, inplace = True)

df_p = df_p.reset_index()
df_p.drop(df_p.columns[0], axis = 1, inplace = True)

In [None]:
df_p.trends = df_p.trends.apply(lambda x : x[0]) 

In [None]:
df_p = pd.concat([df_p, df_v])
df_p

In [None]:
df_procesado = pd.concat([df_h, df_p])
df_procesado.to_csv("tweet_{}_notendencia_preprocesado.csv".format(FECHA), sep = ";", index = False)