In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
import json
import re
import string
from collections import Counter

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
#y_pred = cross_val_predict(clf, x, y, cv=10)
#conf_mat = confusion_matrix(y, y_pred)

In [2]:
def to_dict(string):
    """Return the ``screen_name`` values of a stringified list of dicts, comma-joined.

    Parameters
    ----------
    string : str
        Text form of a list of dicts that each have a ``"screen_name"`` key,
        e.g. ``"[{'screen_name': 'a', 'name': 'b'}]"``. The literal ``"[]"``
        means "no entries".

    Returns
    -------
    str
        The screen names joined with ``","``, or ``""`` for ``"[]"``.

    Raises
    ------
    ValueError
        If the text is neither a Python literal nor (after swapping single
        quotes for double quotes) valid JSON.
    KeyError
        If an entry has no ``"screen_name"`` key.
    """
    import ast  # local import so this cell does not depend on the import cell changing

    if string == "[]":
        return ""
    try:
        # The text is normally a Python repr; literal_eval copes with values
        # that contain apostrophes (e.g. "O'Brien"), which a blind ' -> "
        # replacement turns into invalid JSON.
        users = ast.literal_eval(string)
    except (ValueError, SyntaxError):
        # Previous behaviour, kept as a fallback for JSON-flavoured input
        # (true/false/null are not Python literals).
        users = json.loads(string.replace("'", "\""))
    return ",".join([user["screen_name"] for user in users])

def to_list(list_):
    """Flatten a stringified list such as ``"['a', 'b']"`` into ``"a,b"``.

    The first and last characters (the brackets) are removed, the rest is
    split on every comma, and each piece is stripped of surrounding
    whitespace and then of surrounding single quotes. ``"[]"`` yields ``""``.
    Items that themselves contain a comma are therefore split as well.
    """
    if list_ == "[]":
        return ""
    pieces = list_[1:-1].split(",")
    return ",".join(piece.strip().strip("'") for piece in pieces)

def normalize(s):
    """Lower-case ``s`` and replace á, é, í, ó, ú with their unaccented vowels.

    Other characters (for example ``ñ`` or ``ü``) are left as they are.
    """
    accent_table = str.maketrans("áéíóú", "aeiou")
    return s.lower().translate(accent_table)

def deEmojify(text):
    """Remove every run of characters that falls in the emoji ranges listed below."""
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0000270D"
    )
    # Same character class as a single pattern: "[" + ranges + "]+".
    return re.sub("[" + emoji_ranges + "]+", r"", text, flags = re.UNICODE)

def cleanTxt(text):
    """Strip Twitter markup from ``text``.

    Removes, in this order: ``@mentions``, the ``#`` sign (the hashtag word is
    kept), the retweet marker ``RT`` together with the whitespace after it,
    and ``http``/``https`` links.

    Parameters
    ----------
    text : str
        Raw or already lower-cased tweet text.

    Returns
    -------
    str
        The text without those elements; surrounding whitespace is not trimmed.
    """
    # Handles may contain underscores; without "_" in the class "@el_mundo"
    # would leave "_mundo" behind.
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)
    text = re.sub(r"#", "", text)
    # \b keeps words that merely end in "rt" intact ("SMART tv"), and
    # IGNORECASE lets the marker match after the text has been lower-cased.
    text = re.sub(r"\bRT\s+", "", text, flags = re.IGNORECASE)
    text = re.sub(r"https?:\/\/\S+", "", text)
    return text

def replace_punct(s):
    """Delete all ``string.punctuation`` characters from ``s``.

    When at least one such character is present the result is also stripped
    of leading/trailing whitespace; a string without punctuation is returned
    untouched (not stripped).
    """
    if not any(mark in s for mark in string.punctuation):
        return s
    without_marks = s.translate(str.maketrans("", "", string.punctuation))
    return without_marks.strip()

def replace_num(s):
    """Return ``s`` with the ASCII digits 0-9 removed."""
    digits = "0123456789"
    return "".join(char for char in s if char not in digits)

def preprocessor(text):
    """Lower-case ``text`` and delete every run of non-word characters.

    Whitespace counts as non-word, so separate words end up glued together.
    """
    lowered = text.lower()
    non_word = re.compile(r"[\W]+")
    return non_word.sub("", lowered)

# Stop-word sets already loaded, keyed by language name.
_STOPWORD_CACHE = {}


def _stopword_set(language):
    """Return the NLTK stop words for ``language`` as a frozenset, loading them only once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def tokenizador(text):
    """Drop Spanish stop words and empty tokens from ``text``.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by single spaces.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, stripped of
        leading/trailing whitespace.
    """
    # The previous version called stopwords.words("spanish") for every word and
    # searched the resulting list linearly; the cached set gives the same
    # membership answers without rebuilding the list each time.
    spanish_stopwords = _stopword_set("spanish")
    important_words = [
        word
        for word in text.split(" ")
        if word != "" and word not in spanish_stopwords
    ]
    return " ".join(important_words).strip()

def foo(text):
    """Return ``text`` without any of the punctuation characters in ``forbidden``."""
    forbidden = ("?", "¿", "¡", "!", ",", ".", ";", ":", "-", "'", "+", "$", "/", "*",'«','»', "~", "(", ")")
    return "".join(char for char in text if char not in forbidden)

def quitar_abreviaciones(text):
    """Expand known abbreviations in ``text``, token by token.

    Tokens are obtained by splitting on single spaces; a token that is a key
    of ``abreviaciones`` is replaced by its value, every other token is kept.
    The result is re-joined with single spaces and stripped.
    """
    abreviaciones = {"ue" : "union europea",
                     "pp" : "partido popular",
                     "tc" : "tribunal constitucional",
                     "no" : "no",
                     "si" : "si",
                     "iu" : "izquierda unida",
                     "cs" : "ciudadanos"}
    expanded = [abreviaciones.get(token, token) for token in text.split(" ")]
    return " ".join(expanded).strip()

In [3]:
# Read the label table and discard its first column
# (presumably a saved index column — confirm against the CSV).
df_moda = pd.read_csv(
    "C:/Users/Daniel/Documents/GitHub/SuperProyecto/clasificacion/clasificador_moda.csv"
)
df_moda = df_moda.drop(columns = df_moda.columns[0])
#df_moda

In [4]:
# Load the CSV whose `tweet` column is cleaned and vectorised in the cells below.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
df = pd.read_csv("C:/Users/Daniel/Documents/GitHub/SuperProyecto/clasificacion/elmundo2.csv")
#df.shape

In [5]:
# Columns removed before any further processing.
columns_to_drop = ["conversation_id", "cashtags", "timezone", "user_id", "name", "near", "geo", "source",
                   "user_rt_id", "user_rt", "retweet_id", "retweet_date", "translate", "trans_src",
                   "trans_dest", "place", "quote_url", "thumbnail", "created_at", "id", "link"]

# Keep only rows whose language is "es", then drop the now-constant language
# column and renumber the rows 0..n-1. Built as one chain so nothing is
# modified in place on a filtered slice, and the old index is discarded
# directly instead of being turned into an "index" column and dropped again.
df = (
    df.drop(columns = columns_to_drop)
      .loc[lambda frame: frame["language"] == "es"]
      .drop(columns = "language")
      .reset_index(drop = True)
)

In [6]:
# Turn three raw columns into 0/1 flags.
# photos / urls: 1 unless the cell is the literal string "[]".
df["photos"] = df["photos"].ne("[]").astype("int64")
# retweet: compare on the string form so the flag is right whether read_csv
# delivered the column as real booleans or as the text "True"/"False"
# (a plain `x == "True"` is False for the boolean True).
df["retweet"] = df["retweet"].astype(str).eq("True").astype("int64")
df["urls"] = df["urls"].ne("[]").astype("int64")

In [7]:
# Append the last column of df_moda to df as a new column.
# pd.concat(axis = 1) pairs rows by index label, not by position; labels present
# in only one of the two frames produce NaN in the other's columns.
# NOTE(review): assumes df_moda's index labels refer to the same rows as df's — confirm.
df = pd.concat([df, df_moda.iloc[:, -1]], axis = 1)

In [8]:
# Drop every row with a missing value and renumber the remaining rows 0..n-1.
# Later cells build frames indexed by range(df.shape[0]) and join them to df
# column-wise by index label; without the reset, any gap left by dropna would
# misalign those joins.
df = df.dropna().reset_index(drop = True)

In [9]:
%%time
df.tweet = df.tweet.apply(normalize)
df.tweet = df.tweet.apply(deEmojify)
df.tweet = df.tweet.apply(cleanTxt)
df.tweet = df.tweet.apply(replace_punct)
df.tweet = df.tweet.apply(replace_num)

df.tweet = df.tweet.apply(tokenizador)
df.tweet = df.tweet.apply(foo)
df.tweet = df.tweet.apply(quitar_abreviaciones)

Wall time: 1min 20s


In [10]:
# Extra stop words: every distinct token of at most two characters found in
# the cleaned tweets (the empty token included), except "si" and "no".
mas_stopwords = [
    token
    for token in set(" ".join(df.tweet).split(" "))
    if len(token) <= 2 and token not in ("si", "no")
]

def quitar_mas_stopwords(text):
    """Remove the tokens listed in ``mas_stopwords`` from ``text``.

    Tokens are split on single spaces; the survivors are joined with single
    spaces and the result is stripped.
    """
    kept = [token for token in text.split(" ") if token not in mas_stopwords]
    return " ".join(kept).strip()

df.tweet = df.tweet.apply(quitar_mas_stopwords)

In [11]:
# Vocabulary template: every distinct token in the cleaned tweets mapped to 0.
palabras_unicas = dict.fromkeys(
    {palabra for tweet in df.tweet for palabra in tweet.split(" ")},
    0,
)

In [12]:
# One dict per tweet: the all-zero vocabulary template overlaid with that
# tweet's own token counts.
todos = [
    {**palabras_unicas, **Counter(tweet.split(" "))}
    for tweet in df.tweet
]

In [13]:
# Count matrix: one row per tweet, one column per vocabulary token.
# Rows are labelled 0..len(df)-1 by position, independently of df's own index.
todos_df = pd.DataFrame(todos, index = range(df.shape[0]))

In [14]:
# Re-weight the raw counts with TF-IDF (default TfidfTransformer settings).
tfidf = TfidfTransformer()
# fit_transform returns a sparse matrix; .toarray() makes it dense.
# Note that todos_df is rebound here from a DataFrame to a NumPy array.
todos_df = tfidf.fit_transform(todos_df).toarray()

In [15]:
# Wrap the dense TF-IDF array in a DataFrame; rows are labelled 0..len(df)-1
# and the columns get default integer labels.
matrix = pd.DataFrame(todos_df, index = range(df.shape[0]))

In [16]:
# Everything in df except these raw text / identifier columns;
# the tweet text itself is represented by `matrix` instead.
df2 = df.drop(["date", "time", "username", "tweet", "mentions", "hashtags", "reply_to"], axis = 1)

In [17]:
# Put the TF-IDF columns in front of the remaining columns of df2.
# The join is by index label: `matrix` is labelled 0..len(df)-1, so rows only
# pair up correctly if df2's index is that same range.
# NOTE(review): re-running this cell concatenates `matrix` onto df2 a second time.
df2 = pd.concat([matrix, df2], axis = 1)
#df2

In [18]:
# Features: every column except the label. The label is selected by name as
# well, so the split does not silently depend on "moda" being the last column.
X = df2.drop(columns = "moda")
y = df2["moda"]

In [19]:
# Sanity check: X and y must have the same number of rows.
X.shape, y.shape

((2000, 6618), (2000,))

In [20]:
#log_model         = LogisticRegression(solver = "liblinear")
#nearest_centroid  = NearestCentroid()
#knn_classifier    = KNeighborsClassifier(11)
#radius_classifier = RadiusNeighborsClassifier(1)
#tree_classifier   = DecisionTreeClassifier() 
#forest_classifier = RandomForestClassifier()

In [21]:
#folds = 5

In [22]:
#%%time
#log     = cross_validate(log_model,         X, y, cv = folds, scoring = make_scorer(accuracy_score))
#nearest = cross_validate(nearest_centroid,  X, y, cv = folds, scoring = make_scorer(accuracy_score)) 
#knn     = cross_validate(knn_classifier,    X, y, cv = folds, scoring = make_scorer(accuracy_score))
#radius  = cross_validate(radius_classifier, X, y, cv = folds, scoring = make_scorer(accuracy_score))
#tree    = cross_validate(tree_classifier,   X, y, cv = folds, scoring = make_scorer(accuracy_score))
#forest  = cross_validate(forest_classifier, X, y, cv = folds, scoring = make_scorer(accuracy_score))

#models_scores_table = pd.DataFrame({"Logistic Regression" : [log["test_accuracy"].mean()],
#                                    "Nearest centroid"    : [nearest["test_accuracy"].mean()],
#                                    "KNN Classifier"      : [knn["test_accuracy"].mean()],
#                                    "Radius Classifier"   : [radius["test_accuracy"].mean()],
#                                    "Decision Tree"       : [tree["test_accuracy"].mean()],
#                                    "Random Forest"       : [forest["test_accuracy"].mean()]},
#                                    index = ["Accuracy"])

In [23]:
#models_scores_table

In [24]:
# Class balance of the label, in percent. normalize = True divides by the
# actual number of labelled rows instead of a hard-coded 2000.
y.value_counts(normalize = True) * 100

0.0    47.1
1.0    35.6
2.0    10.0
3.0     7.3
Name: moda, dtype: float64

In [26]:
# Random forest with scikit-learn's default hyper-parameters.
# NOTE(review): no random_state is passed, so fitted results will differ between runs.
forest_classifier = RandomForestClassifier()

In [29]:
# Show the hyper-parameters the (still unfitted) classifier was created with.
forest_classifier.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}