In [1]:
import numpy as np
import pandas as pd
import json
import re
import string

from nltk.corpus import stopwords

In [2]:
def to_dict(string):
    if string != "[]":
        string = json.loads(string.replace("'", "\""))
        return ",".join([s["screen_name"] for s in string])
    return ""

def to_list(list_):
    if list_ != "[]":
        list_ = list_[1:-1]
        list_ = list_.split(",")
        return ",".join([s.strip().strip("'") for s in list_])
    return ""

def normalize(s):
    replacements = (("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u"))
    for a, b in replacements:
        s = s.lower()
        s = s.replace(a, b)
    return s

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r"", text)

def cleanTxt(text):
    text = re.sub(r"@[a-zA-Z0-9]+", "", text) #Removes @mentions
    text = re.sub(r"#", "", text) #Removing the "#" symbol
    text = re.sub(r"RT[\s]+", "", text) #Removing RT
    text = re.sub(r"https?:\/\/\S+", "", text) #Remove the hyperlink
    return text

def replace_punct(s):
    for i in string.punctuation:
        if i in s:
            s = s.replace(i, "").strip()
    return s

def replace_num(s):
    for i in ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]:
        s = s.replace(i, "")
    return s

def tokenizador(text):
    important_words = []
    for word in text.split(" "):
        if word not in stopwords.words("spanish"):
            if word != "":
                important_words.append(word)
    return " ".join(important_words).strip()

def foo(text):
    forbidden = ("?", "¿", "¡", "!", ",", ".", ";", ":", "-", "'", "+", "$", "/", "*",'«','»', "~", "(", ")")
    aux = ""
    for v in text:
        if not v in forbidden:
            aux += v
    return aux

def quita_palabras_pequeñas(text):
    return " ".join([word for word in text.split(" ") if len(word) > 2])  

In [3]:
%%time
df = pd.read_csv("C:/Users/Daniel/Desktop/csv/dia 24/trends/tweets_tendencias_24.csv")
df.head()



Wall time: 53.8 s


Unnamed: 0.2,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,...,reply_to,retweet_date,translate,trans_src,trans_dest,trend,Unnamed: 0.1,hours,day,month
0,0,1.231919e+18,1.231919e+18,2020-02-24 13:28:15 Romance Standard Time,2020-02-24,13:28:15,100.0,548719700.0,djurekbl,Marko Djurek,...,[],,,,,#AtletiVillareal,,,,
1,1,1.231871e+18,1.231871e+18,2020-02-24 10:18:31 Romance Standard Time,2020-02-24,10:18:31,100.0,7.882786e+17,indioscaledonia,Indios de Caledonia,...,[],,,,,#AtletiVillareal,,,,
2,2,1.232062e+18,1.16248e+18,2020-02-24 22:56:17 Romance Standard Time,2020-02-24,22:56:17,100.0,8.602287e+17,dataafaok,Data Afa (desde 🏡),...,[],,,,,#AtletiVillarreal,,,,
3,3,1.232044e+18,1.232044e+18,2020-02-24 21:46:20 Romance Standard Time,2020-02-24,21:46:20,100.0,8.434539e+17,mas_que_pelotas,Más Que Pelotas®️,...,[],,,,,#AtletiVillarreal,,,,
4,4,1.232044e+18,1.232044e+18,2020-02-24 21:45:00 Romance Standard Time,2020-02-24,21:45:00,100.0,8.434539e+17,mas_que_pelotas,Más Que Pelotas®️,...,[],,,,,#AtletiVillarreal,,,,


In [4]:
df.drop(df.columns[0], axis = 1, inplace = True)
df.drop(df.columns[-4], axis = 1, inplace = True)
df.drop("hours", axis = 1, inplace = True)

In [5]:
columns_to_drop = ["conversation_id", "cashtags", "timezone", "user_id", "name", "near", "geo", "source",
                   "user_rt_id", "user_rt", "retweet_id", "retweet_date", "translate", "trans_src",
                   "trans_dest", "place", "quote_url", "thumbnail", "created_at", "id", "link"]

df.drop(columns_to_drop, axis = 1, inplace = True)

df = df[df.language == "es"]

df.drop("language", axis = 1, inplace = True)

df = df.reset_index().drop("index", axis = 1)

#con esto borramos algunas más que se quedan vacías
#df.drop(df.columns[df.isna().any()].tolist(), axis = 1, inplace = True)

In [6]:
reply_to_rows = []
for num, row in enumerate(df.reply_to):
    try:
        to_dict(row)
    except:
        reply_to_rows.append(num)
        
df.drop(reply_to_rows, inplace = True)

df.reply_to = df.reply_to.apply(to_dict)

df = df.reset_index().drop("index", axis = 1)

In [7]:
mention_rows = []
for num, row in enumerate(df.mentions):
    try:
        to_dict(row)
    except:
        mention_rows.append(num)
        
df.drop(mention_rows, inplace = True)

df.mentions = df.mentions.apply(to_dict)

df = df.reset_index().drop("index", axis = 1)

In [8]:
hashtags_rows = []
for num, row in enumerate(df.hashtags):
    try:
        to_list(row)
    except:
        hashtags_rows.append(num)
        
df.drop(hashtags_rows, inplace = True)

df.hashtags = df.hashtags.apply(to_list)

df = df.reset_index().drop("index", axis = 1)

In [9]:
df.photos = df.photos.apply(lambda x : 1 if x != "[]" else 0)
df.retweet = df.retweet.apply(lambda x : 1 if x == "True" else 0)
df.urls = df.urls.apply(lambda x : 1 if x != "[]" else 0)

In [10]:
#%%time
#df.date = df.date.apply(lambda x : datetime.strptime(x, "%Y-%m-%d"))

In [11]:
#%%time
#df.time = df.time.apply(lambda x : datetime.strptime(x, "%H:%M:%S"))

In [12]:
%%time
df.tweet = df.tweet.apply(normalize)
df.tweet = df.tweet.apply(deEmojify)
df.tweet = df.tweet.apply(cleanTxt)
df.tweet = df.tweet.apply(replace_punct)
df.tweet = df.tweet.apply(replace_num)

df.tweet = df.tweet.apply(tokenizador)
df.tweet = df.tweet.apply(foo)
df.tweet = df.tweet.apply(quita_palabras_pequeñas)

Wall time: 2h 26min 17s


In [13]:
df["month"] = df.date.apply(lambda x : x[5 : 7])
df["day"] = df.date.apply(lambda x : x[-2:])

df["hour"] = df.time.apply(lambda x : x[:2])
df["minute"] = df.time.apply(lambda x : x[3:5])
df["second"] = df.time.apply(lambda x : x[6:])

In [14]:
df["mentions_count"] = [len(mention.split(",")) if type(mention) == str else 0 for mention in df.mentions]

df["reply_to_count"] = [len(reply.split(",")) if type(reply) == str else 0 for reply in df.reply_to]

df["hashtags_count"] =  [len(hashtag.split(",")) if type(hashtag) == str else 0 for hashtag in df.hashtags]

df["interaccion"] = [rt + re + lk for rt, re, lk in zip(df.retweets_count, df.replies_count, df.likes_count)]

In [16]:
df.head()

Unnamed: 0,date,time,username,tweet,mentions,urls,photos,replies_count,retweets_count,likes_count,...,trend,day,month,hour,minute,second,mentions_count,reply_to_count,hashtags_count,interaccion
0,2020-02-24,21:46:20,mas_que_pelotas,”vi joaõ muchisimas ganas ojala vuelva tope” a...,juanma6sanchez,1,1,0,1,4,...,#AtletiVillarreal,24,2,21,46,20,1,1,1,5
1,2020-02-24,21:45:00,mas_que_pelotas,”correa siendo clave buen rendimiento ultimame...,juanma6sanchez,1,1,0,1,6,...,#AtletiVillarreal,24,2,21,45,0,1,1,1,7
2,2020-02-24,21:44:15,mas_que_pelotas,”por fin siento identificado equipo volvimos p...,juanma6sanchez,1,1,0,1,1,...,#AtletiVillarreal,24,2,21,44,15,1,1,1,2
3,2020-02-24,17:40:01,atletico_md,thomas iguala simeone partidos jugados atletic...,,1,1,0,6,107,...,#AtletiVillarreal,24,2,17,40,1,1,1,3,113
4,2020-02-24,15:30:00,laligatvbar,atletivillarreal remonta golazo suficiente sub...,"atleti,metropolitano,paco93alcacer",1,1,0,0,5,...,#AtletiVillarreal,24,2,15,30,0,3,1,1,5


In [17]:
#df.to_csv("tweets_24_tendencia_preprocesado.csv", sep = ";", index = False)