In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
import json
import re
import string
from collections import Counter

from nltk.corpus import stopwords

In [2]:
def to_dict(string):
    if string != "[]":
        string = json.loads(string.replace("'", "\""))
        return ",".join([s["screen_name"] for s in string])
    return ""

def to_list(list_):
    if list_ != "[]":
        list_ = list_[1:-1]
        list_ = list_.split(",")
        return ",".join([s.strip().strip("'") for s in list_])
    return ""

def normalize(s):
    replacements = (("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u"))
    for a, b in replacements:
        s = s.lower()
        s = s.replace(a, b)
    return s

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0000270D"
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r"", text)

def cleanTxt(text):
    text = re.sub(r"@[a-zA-Z0-9]+", "", text) #Removes @mentions
    text = re.sub(r"#", "", text) #Removing the "#" symbol
    text = re.sub(r"RT[\s]+", "", text) #Removing RT
    text = re.sub(r"https?:\/\/\S+", "", text) #Remove the hyperlink
    return text

def replace_punct(s):
    for i in string.punctuation:
        if i in s:
            s = s.replace(i, "").strip()
    return s

def replace_num(s):
    for i in ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]:
        s = s.replace(i, "")
    return s

def preprocessor(text):
    text = re.sub(r"[\W]+", "", text.lower()) 
    return text

def tokenizador(text):
    important_words = []
    for word in text.split(" "):
        if word not in stopwords.words("spanish"):
            if word != "":
                important_words.append(word)
    return " ".join(important_words).strip()

def foo(text):
    forbidden = ("?", "¿", "¡", "!", ",", ".", ";", ":", "-", "'", "+", "$", "/", "*",'«','»', "~", "(", ")")
    aux = ""
    for v in text:
        if not v in forbidden:
            aux += v
    return aux

def quitar_abreviaciones(text):
    abreviaciones = {"ue" : "union europea", 
                     "pp" : "partido popular", 
                     "tc" : "tribunal constitucional", 
                     "no" : "no",
                     "si" : "si", 
                     "iu" : "izquierda unida", 
                     "cs" : "ciudadanos"}
    aux = ""
    for word in text.split(" "):
        if word in abreviaciones.keys():
            aux += abreviaciones[word] + " "
        else: aux += word + " "
    return aux.strip()

In [3]:
df = pd.read_csv("C:/Users/Daniel/Desktop/proyecto/notebooks/tweets_periodicos.csv")
df.drop(df.columns[0], axis = 1, inplace = True)

In [4]:
columns_to_drop = ["conversation_id", "cashtags", "timezone", "user_id", "name", "thumbnail", "created_at", "id", "link"]

df.drop(columns_to_drop, axis = 1, inplace = True)

df = df[df.language == "es"]

df.drop("language", axis = 1, inplace = True)

df = df.reset_index().drop("index", axis = 1)

In [5]:
%%time
df.tweet = df.tweet.apply(normalize)
df.tweet = df.tweet.apply(deEmojify)
df.tweet = df.tweet.apply(cleanTxt)
df.tweet = df.tweet.apply(replace_punct)
df.tweet = df.tweet.apply(replace_num)

df.tweet = df.tweet.apply(tokenizador)
df.tweet = df.tweet.apply(foo)
df.tweet = df.tweet.apply(quitar_abreviaciones)

KeyboardInterrupt: 

In [6]:
df.shape()

TypeError: 'tuple' object is not callable

In [None]:
df.head()

In [None]:
df.to_csv("tweets_periodicos_procesados.csv", sep = ";", index = False)