# Clasificación del sentimiento en Tweets de Donald Trump

## Análisis descriptivo de los datos

In [None]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import import_ipynb
from sklearn.externals import joblib

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
# Expected column names of the tweet archive.
# NOTE(review): this list is not passed to the read_csv call in the next cell, and `cols` is
# reassigned further down when the classified frame is reordered - confirm whether it is still needed.
cols = ['source','text','created_at','retweet_count','favorite_count','is_retweet','id_str']

In [None]:
# Load the raw archive: first row is the header, fields are ';'-separated, file decoded as latin-1.
df_Trump_Initial = pd.read_csv("./DataSheet_..trumptwitterarchive_com_archive.csv",header=0,delimiter=';', encoding='latin-1')

In [None]:
# Preview the first rows of the freshly loaded frame.
df_Trump_Initial.head()

In [None]:
# Column names, dtypes and non-null counts before any clean-up.
df_Trump_Initial.info()

In [None]:
# Remove the four 'Unnamed: N' columns from the loaded frame.
# drop(..., errors='ignore') keeps the cell idempotent: the original `del` statements raised
# KeyError when the cell was executed a second time on the same kernel.
columnas_sobrantes = ['Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10']
df_Trump_Initial = df_Trump_Initial.drop(columns=columnas_sobrantes, errors='ignore')
df_Trump_Initial.info()

In [None]:
# Keep a handle on the creation timestamps; they are re-attached to the classified frame later.
df_Trump_Initial_Date = df_Trump_Initial["created_at"]
df_Trump_Initial_Date

In [None]:
# Working frame: the raw tweet text plus an (initially empty) sentiment column.
columnas_iniciales = {'mensaje': df_Trump_Initial['text'], 'sentimiento': ""}
df_Trump_ToClean = pd.DataFrame(columnas_iniciales)
df_Trump_ToClean

In [None]:
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()
import re as patternToDelete
from bs4 import BeautifulSoup

In [None]:
import unicodedata
from unidecode import unidecode

def deEmojify(inputString):
    """Return ``inputString`` with every non-ASCII character removed.

    Characters that cannot be encoded as ASCII (emoji, accented letters, BOM,
    replacement characters, ...) are dropped; all other characters are kept in
    their original order.
    """
    # Encoding with the "ignore" handler silently discards anything outside ASCII;
    # decoding the remaining bytes gives back a plain str.
    solo_ascii = inputString.encode("ascii", "ignore")
    return solo_ascii.decode("ascii")

In [None]:
# Regular expressions and lookup tables used by limpieza_tweets.
# (`patternToDelete` is the alias under which the `re` module is imported in this notebook.)

# Twitter @mentions.
patternToDelete1 = r'@[A-Za-z0-9_]+'
# http/https URLs up to the next space.
patternToDelete2 = r'https?://[^ ]+'
# Stand-alone retweet marker "rt" followed by spaces.
# The previous pattern r'rt? +' had no word boundary, so it also stripped the final "r"/"rt"
# of every word ("for president" -> "fo president", "support us" -> "suppo us").
# NOTE(review): if the loaded classifier was trained on text cleaned with the old pattern,
# the training data should be re-cleaned with this one - confirm against the training notebook.
patternToDelete3 = r'\brt\b +'
# Mentions and URLs are removed in a single pass.
patron = r'|'.join((patternToDelete1, patternToDelete2))
# Bare "www." links; the dot is escaped so that it only matches a literal '.'.
www_pat = r'www\.[^ ]+'
# Contractions expanded before punctuation is stripped, so the negation is not lost.
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
patronNegacion = patternToDelete.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')
def limpieza_tweets(mensaje):
    """Normalise one raw tweet into a lowercase string of space-separated words.

    Steps, in order: strip http(s) URLs, strip HTML markup/entities, drop
    non-ASCII characters, remove @mentions / remaining URLs / www links,
    lowercase, expand negative contractions, keep letters only, apply the
    retweet-marker pattern, and finally re-tokenise to collapse whitespace
    and discard one-character tokens.

    Parameters
    ----------
    mensaje : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas produces for
        an empty CSV cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text; ``""`` when nothing usable remains.
    """
    # Missing values arrive as float NaN; re.sub would raise TypeError on them.
    if not isinstance(mensaje, str):
        return ""

    sin_urls = patternToDelete.sub('https?://[A-Za-z0-9./]+', '', mensaje)
    texto = BeautifulSoup(sin_urls, 'lxml').get_text()

    # The former `.decode("utf-8-sig")` step was a Python 2 leftover: on Python 3 a str has
    # no .decode, so it always raised and a bare `except` silently skipped it. BOM and U+FFFD
    # characters are non-ASCII and are removed by deEmojify below.
    texto = deEmojify(texto)
    texto = patternToDelete.sub(patron, '', texto)
    texto = patternToDelete.sub(www_pat, '', texto)
    texto = texto.lower()
    # Expand contractions while the apostrophes are still present.
    texto = patronNegacion.sub(lambda m: negations_dic[m.group()], texto)
    texto = patternToDelete.sub("[^a-zA-Z]", " ", texto)
    texto = patternToDelete.sub(patternToDelete3, " ", texto)
    # The substitutions above leave runs of spaces; tokenising and re-joining collapses them.
    palabras = [token for token in tok.tokenize(texto) if len(token) > 1]
    return " ".join(palabras).strip()


In [None]:
%%time
print ("Limpiando y parseando todos los tweets...\n")
print(df_Trump_ToClean)
tweetsLimpios = []
i=0
while i < len(df_Trump_ToClean):
    if( (i+1)%1000 == 0 ):
        print ("Tweets número %d de %d han sido procesados" % ( i+1, len(df_Trump_ToClean) ))                                                                    
    tweetsLimpios.append(limpieza_tweets(df_Trump_ToClean['mensaje'][i]))
    
    i += 1

In [None]:
# Show the cleaned tweets (displays the whole list).
tweetsLimpios

Procedo a la carga del modelo que mejores resultados me ha suministrado

In [None]:
# Load the persisted model from the "Modelos Entrenados" folder.
# Forward slashes are used because the previous '.\Modelos ...\modelo...' literal contained the
# invalid escape sequences '\M' and '\m' and only resolved on Windows.
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code - only load trusted files.
modelo_t_tfidf=joblib.load('./Modelos Entrenados/modelo_entrenado_t_tfidf.pkl')

In [None]:
# Predict a sentiment label for every cleaned tweet with the loaded model.
# The model is given the raw cleaned strings, so it presumably bundles its own vectoriser - TODO confirm.
modelo=modelo_t_tfidf
y_pred_Trump = modelo.predict(tweetsLimpios)

In [None]:
# Inspect the predicted labels.
y_pred_Trump

In [None]:
# Pair each cleaned tweet with its predicted sentiment label.
df_Trump_Classified=pd.DataFrame({'mensaje': tweetsLimpios, 'sentimiento': y_pred_Trump})



In [None]:
# Dtypes and non-null counts of the classified frame.
df_Trump_Classified.info()

Añado la columna fecha y reorganizo el dataframe.

In [None]:
# Attach the original creation timestamps (aligned on the row index) and put the
# columns in the order message / date / sentiment.
df_Trump_Classified["fecha"] = df_Trump_Initial_Date
# The former `cols = df_Trump_Classified.columns.tolist()` was dead code: it was
# overwritten by this literal on the very next line.
cols = ['mensaje','fecha','sentimiento']
df_Trump_Classified = df_Trump_Classified[cols]

In [None]:
# Display the reordered frame.
df_Trump_Classified

In [None]:
# Discard tweets whose text became empty after cleaning.
tiene_texto = df_Trump_Classified['mensaje'] != ''
df_Trump_Classified = df_Trump_Classified[tiene_texto]

In [None]:
# Persist the classified tweets (the index is written as the first CSV column).
df_Trump_Classified.to_csv("trumpAUX.csv")

In [None]:
# Reload the saved file, restoring the first CSV column as the index.
# NOTE(review): read_csv applies its default NA parsing, so a cleaned tweet consisting only of
# a token such as "nan" or "null" would presumably come back as NaN - confirm if that matters.
df_Trump_Classified = pd.read_csv("trumpAUX.csv",index_col=0)

## Word Cloud

In [None]:

# Concatenate every tweet labelled 0 (negative) into one space-separated string for the word cloud.
neg_tweets = df_Trump_Classified[df_Trump_Classified['sentimiento'] == 0]
neg_string = pd.Series(list(neg_tweets['mensaje'])).str.cat(sep=' ')

In [None]:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from matplotlib.colors import LinearSegmentedColormap

%matplotlib inline
# Default figure size for the plots that follow.
matplotlib.rcParams['figure.figsize'] = (16.0, 9.0)

# The image is loaded as an array and used as the word-cloud mask.
mask = np.array(Image.open("trump.jpg"))
# Two-colour gradient used to colour the words.
colors = ["#BF0A30", "#002868"]
cmap = LinearSegmentedColormap.from_list("mycmap", colors)

# Create WordCloud Object
# NOTE(review): the wordcloud documentation states width/height are ignored when a mask is given - confirm.
wc = WordCloud(background_color="white",
                 width=853, height=506, mask=mask, colormap=cmap)
wc.generate(neg_string)


# Render the cloud built from the negative tweets, without axes.
plt.figure()
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")

In [None]:
# Concatenate every tweet labelled 1 (positive) into one space-separated string for the word cloud.
pos_tweets = df_Trump_Classified[df_Trump_Classified['sentimiento'] == 1]
pos_string = pd.Series(list(pos_tweets['mensaje'])).str.cat(sep=' ')

In [None]:
# Same two-colour gradient as the negative cloud (re-created here).
colors = ["#BF0A30", "#002868"]
cmap = LinearSegmentedColormap.from_list("mycmap", colors)

# Create WordCloud Object
# `mask` is the image array loaded in the negative word-cloud cell.
wc = WordCloud(background_color="white",
                 width=853, height=506, mask=mask, colormap=cmap)
wc.generate(pos_string)


# Render the cloud built from the positive tweets, without axes.
plt.figure()
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")

## Visualización de datos

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Convert a collection of text documents to a matrix of token counts.
# English stop words are excluded; the vocabulary is learned from all classified tweets.
cvec = CountVectorizer(stop_words='english')
cvec.fit(df_Trump_Classified.mensaje)

In [None]:
# Size of the learned vocabulary.
len(cvec.get_feature_names())

CountVectorizer ha extraído 16457 palabras del corpus.

In [None]:
# List the learned vocabulary (displays every token).
cvec.get_feature_names()

In [None]:
# Token-count matrices, one for the tweets labelled 0 and one for those labelled 1.
es_negativo = df_Trump_Classified['sentimiento'] == 0
es_positivo = df_Trump_Classified['sentimiento'] == 1
neg_doc_matrix = cvec.transform(df_Trump_Classified.loc[es_negativo, 'mensaje'])
pos_doc_matrix = cvec.transform(df_Trump_Classified.loc[es_positivo, 'mensaje'])


In [None]:
# Column-wise totals: how many times each token occurs within each sentiment class.
neg_tf = neg_doc_matrix.sum(axis=0)
pos_tf = pos_doc_matrix.sum(axis=0)

In [None]:
# squeeze() removes the single-dimensional axes left over from the column sums,
# giving one flat count vector per class.
neg = np.asarray(neg_tf).squeeze()
pos = np.asarray(pos_tf).squeeze()
# One row per token after transposing; the two columns are still named 0 and 1 here.
term_freq_df = pd.DataFrame([neg, pos], columns=cvec.get_feature_names()).T
type(term_freq_df)

In [None]:
# Inspect the current column labels of the term-frequency frame.
names = term_freq_df.columns.tolist()
names

In [None]:
# Give the two count columns readable names.
# rename() is a no-op for labels that are already renamed, so the cell can be re-run safely;
# the previous `names.index(0)` lookup raised ValueError on a second execution.
term_freq_df = term_freq_df.rename(columns={0: 'negativo', 1: 'positivo'})
names = term_freq_df.columns.tolist()

In [None]:
# Total occurrences per token across both classes.
# Summing only the two class columns keeps the result stable on re-run; a plain
# `term_freq_df.sum(axis=1)` would also add an already existing 'Total' column.
term_freq_df["Total"] = term_freq_df[['negativo', 'positivo']].sum(axis=1)
term_freq_df

In [None]:
# Persist the term-frequency table as UTF-8 CSV.
term_freq_df.to_csv('term_freq_df.csv',encoding='utf-8')

In [None]:
# Tokens ordered from most to least frequent (display only; the frame itself is not modified).
term_freq_df.sort_values(by='Total', ascending=False)

In [None]:
# Frequencies of the 500 most common tokens compared with the curve
# max_frequency / rank**s (s = 1) as a Zipf-style reference.
y_pos = np.arange(500)
plt.figure(figsize=(10,8))
s = 1
# Sort once and reuse. The index holds the tokens (strings), so positions must be
# selected with .iloc: integer keys in `series[0]` on a non-integer index rely on a
# deprecated positional fallback.
totales_ordenados = term_freq_df.sort_values(by='Total', ascending=False)['Total']
frecuencia_maxima = totales_ordenados.iloc[0]
expected_zipf = [frecuencia_maxima/(rank+1)**s for rank in y_pos]
plt.bar(y_pos, totales_ordenados.iloc[:500], align='center', alpha=0.5)
plt.plot(y_pos, expected_zipf, color='r', linestyle='--',linewidth=2,alpha=0.5)
plt.ylabel('Frequency')
plt.title('Top 500 tokens in tweets')

Para la visualización de los tokens en los tweets de Trump voy a proceder a eliminar las stopwords, ya que restan significado al análisis que quiero mostrar.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Re-create the vectoriser with at most 10000 features, again without English stop words.
# This rebinds `cvec`, so later get_feature_names() calls refer to this vectoriser.
cvec = CountVectorizer(stop_words='english',max_features=10000)
cvec.fit(df_Trump_Classified.mensaje)

In [None]:
# Token-count matrix for all classified tweets; rows follow the order of df_Trump_Classified.
document_matrix = cvec.transform(df_Trump_Classified.mensaje)

In [None]:
%%time
neg_batches = np.linspace(0,len(df_Trump_Classified)/2,10).astype(int)
i=0
neg_tf = []
while i < len(neg_batches)-1:
    batch_result = np.sum(document_matrix[neg_batches[i]:neg_batches[i+1]].toarray(),axis=0)
    neg_tf.append(batch_result)
    print (neg_batches[i+1],"entries' term freuquency calculated")
    i += 1

In [None]:
%%time
pos_batches = np.linspace(len(df_Trump_Classified)/2,len(df_Trump_Classified),10).astype(int)
i=0
pos_tf = []
while i < len(pos_batches)-1:
    batch_result = np.sum(document_matrix[pos_batches[i]:pos_batches[i+1]].toarray(),axis=0)
    pos_tf.append(batch_result)
    print (pos_batches[i+1],"entries' term freuquency calculated")
    i += 1

In [None]:
# Combine the per-batch sums into one count vector per class and build the
# term-frequency table for the 10000-feature vocabulary.
neg = np.sum(neg_tf,axis=0)
pos = np.sum(pos_tf,axis=0)
term_freq_df2 = pd.DataFrame([neg,pos],columns=cvec.get_feature_names()).transpose()
term_freq_df2.columns = ['negativo', 'positivo']
# Both addends must come from term_freq_df2; the previous code added
# term_freq_df['positivo'], a column of the other (full-vocabulary) table.
term_freq_df2['Total'] = term_freq_df2['negativo'] + term_freq_df2['positivo']
term_freq_df2.sort_values(by='Total', ascending=False).iloc[:10]

Las 40 palabras más negativas

In [None]:
# Bar chart of the 40 tokens with the highest count among negative tweets.
y_pos = np.arange(40)
plt.figure(figsize=(12,10))
top_negativos = term_freq_df.sort_values(by='negativo', ascending=False)['negativo'][:40]
plt.bar(y_pos, top_negativos, align='center', alpha=0.5)
# Token names (the Series index) become the x tick labels.
plt.xticks(y_pos, top_negativos.index, rotation='vertical')
plt.ylabel('Frequencia')
plt.xlabel('Los 40 tokens más negativos')
plt.title('Los 40 tokens más negativos en tweets')

In [None]:
# Bar chart of the 40 tokens with the highest count among positive tweets.
y_pos = np.arange(40)
plt.figure(figsize=(12,10))
top_positivos = term_freq_df.sort_values(by='positivo', ascending=False)['positivo'][:40]
plt.bar(y_pos, top_positivos, align='center', alpha=0.5)
# Token names (the Series index) become the x tick labels.
plt.xticks(y_pos, top_positivos.index, rotation='vertical')
plt.ylabel('Frequencia')
plt.xlabel('Los 40 tokens más positivos')
plt.title('Los 40 tokens más positivos en tweets')

In [None]:
# Export the classified tweets as UTF-8 CSV.
df_Trump_Classified.to_csv('Trump_SA_For_LSTM.csv',encoding='utf-8')

In [None]:
# Display the exported frame.
df_Trump_Classified

In [None]:
# Number of tweets per predicted label; the result is a Series indexed by label value.
df = df_Trump_Classified['sentimiento'].value_counts()

In [None]:
# Share of positive (label 1) and negative (label 0) tweets, as percentages of all classified tweets.
numero_tweets = len(df_Trump_Classified)
porcentaje_positivos = df[1]*100/numero_tweets
porcentaje_negativos = df[0]*100/numero_tweets
print("Porcentaje de tweets positivos: {}%".format(porcentaje_positivos))
print("Porcentaje de tweets negativos: {}%".format(porcentaje_negativos))

In [None]:
import plotly.graph_objects as go
# Donut chart of the positive / negative percentages.
numero_tweets = len(df_Trump_Classified)
labels = ["Porcentaje de tweets positivos", "Porcentaje de tweets negativos"]
# Label 1 (positive) first, then label 0 (negative), matching the order of `labels`.
values = [df[etiqueta]*100/numero_tweets for etiqueta in (1, 0)]

# `hole` turns the pie into a donut-like chart.
donut = go.Pie(labels=labels, values=values, hole=.3)
fig = go.Figure(data=[donut])
fig.show()