<img src='https://encrypted-tbn0.gstatic.com/images?q=tbn%3AANd9GcQ-VfNtOyJbsaxu43Kztf_cv1mgBG6ZIQZEVw&usqp=CAU'>

# Procesamiento de Lenguaje Natural

## Taller #11: Twitter
`Fabián Castro`

###  `[25 pts]` Punto 1: Extraer Tuits
Extraer mínimo 100 tuits de Twitter usando la API

In [209]:
#misc
import re
import json
from collections import Counter

#dataframe handling
import pandas as pd
import numpy as np

#data adquisition
import tweepy

#pre-processing
from nltk.corpus import stopwords

#feature extraction
import scipy.cluster.hierarchy as sch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

stopwords_sp = stopwords.words('spanish')

#visualization
import plotly.io as pio
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import matplotlib.pyplot as plt
%matplotlib inline
init_notebook_mode(connected=True)

In [8]:
# Read the API credentials from a JSON file kept outside the notebook.
with open('../Access/client_secret.json') as creds_file:
    CREDS = json.loads(creds_file.read())

In [108]:
# Auth v1: build the OAuth handler from the loaded credentials, attach the
# access token pair, then create the API client used by the cells below.
auth = tweepy.OAuthHandler(
    CREDS['api_key'],
    CREDS['api_secret_key'],
)
auth.set_access_token(
    CREDS['access_token'],
    CREDS['access_token_secret'],
)
api = tweepy.API(auth, wait_on_rate_limit=True)

In [146]:
# Search query: any of the COVID terms together with "Colombia".
# NOTE(review): the trailing -"RT @" term is presumably there to drop retweets.
query_text = (
    '((COVID-19 OR Coronavirus OR COVID) Colombia) '
    '-"RT @"'
)

In [190]:
# Keyword arguments forwarded to api.search_full_archive through the Cursor.
params = {
    'environment_name': 'DevEnvironTest',
    'query': query_text,
    'fromDate': '202012010000',
    'toDate': '202012310000',
    'maxResults': 100,
}

# Cursor: iterator over the search results, capped at 200 items.
tweets = tweepy.Cursor(api.search_full_archive, **params).items(200)

In [191]:
# Flatten every status into a row: [text, date, user location, id, link].
t_processed = []

for tweet in tweets:
    # Truncated statuses keep their complete text under `extended_tweet`.
    full_text = tweet.extended_tweet['full_text'] if tweet.truncated else tweet.text
    t_processed.append([
        full_text,
        tweet.created_at,
        tweet.user.location,
        tweet.id,
        f"https://twitter.com/i/web/status/{tweet.id}",
    ])

In [193]:
# Build the frame with its column names in a single step. Passing `columns=`
# to the constructor also works when `t_processed` is empty, whereas assigning
# `data.columns` afterwards raises a length-mismatch error on a frame that
# has no columns yet.
data = pd.DataFrame(
    t_processed,
    columns=['tweet', 'date', 'user_location', 'id', 'link'],
)
display(data.head())
data.shape

Unnamed: 0,tweet,date,user_location,id,link
0,@CaroGC @NoticiasCaracol Te comparto la logíst...,2020-12-30 23:59:39,Medellín,1344432998536654849,https://twitter.com/i/web/status/1344432998536...
1,🚨🦠 Colombia de aproxima a los 43.000 fallecido...,2020-12-30 23:59:09,Riohacha,1344432871021441033,https://twitter.com/i/web/status/1344432871021...
2,Los orígenes de la aviación en fotos \n \nhttp...,2020-12-30 23:59:00,🇪🇨🇺🇸,1344432832219738118,https://twitter.com/i/web/status/1344432832219...
3,Coronavirus | En Colombia se confirman 1.626.4...,2020-12-30 23:58:34,"Cartagena, Colombia",1344432723230928896,https://twitter.com/i/web/status/1344432723230...
4,El presidente Iván Duque anunció que #Colombia...,2020-12-30 23:58:32,Bucaramanga,1344432715421143040,https://twitter.com/i/web/status/1344432715421...


(200, 5)

In [199]:
# Bodies of two named regex groups; text_clean wraps each one in parentheses
# and joins them with `|`. Raw strings keep `\s` from being read as an
# (invalid) string escape, and the dots of the host name are escaped so they
# only match a literal ".".
pattern1 = r'?P<pic>pic\.twitter\.com/[^\s]+'
pattern2 = r'?P<url>https?://[^\s]+'

def remove_words_by_start_char(text, character):
    """Strip every whitespace-delimited word that starts with ``character``.

    Typical use is pulling hashtags (``'#'``) or mentions (``'@'``) out of a
    tweet.

    Args:
        text: String to clean.
        character: Prefix marking the words to remove; the prefix is removed
            together with the rest of the word.

    Returns:
        A ``(text, spec_words)`` tuple: the text with the matching words
        deleted (surrounding whitespace is left untouched) and the list of
        removed words in order of appearance, duplicates included.
    """
    # Match whole tokens only: the lookbehind requires start of text or
    # whitespace before the token, the lookahead requires a non-blank first
    # character. Deleting by match position (instead of ``str.replace`` on
    # each word) keeps a short tag such as "#Col" from chopping the front off
    # "#Colombia", and leaves an embedded "foo#bar" alone.
    token_re = re.compile(r'(?<!\S)(?=\S)' + re.escape(character) + r'\S*')
    spec_words = token_re.findall(text)
    return token_re.sub('', text), spec_words
    
def text_clean(row):
    """Split a tweet into plain text, links, hashtags and mentions.

    Args:
        row: Mapping with a ``'tweet'`` key holding the raw tweet text
            (a DataFrame row when called through ``data.apply(..., axis=1)``).

    Returns:
        A 4-tuple ``(text, links, hashtags, mentions)``: the tweet text with
        links, hashtags and mentions removed, followed by the three lists of
        removed items in order of appearance.
    """
    text = row['tweet']

    # Collect and delete links by match position. Replacing each link string
    # with ``str.replace`` would also cut it out of any longer link that
    # merely starts with it, leaving that link's tail behind in the text.
    link_re = re.compile(f"({pattern1})|({pattern2})")
    links = [match.group(0) for match in link_re.finditer(text)]
    text = link_re.sub("", text)

    text, hashtags = remove_words_by_start_char(text, '#')
    text, mentions = remove_words_by_start_char(text, '@')

    return text, links, hashtags, mentions

In [200]:
# Run text_clean on every row and spread its 4-tuple over four new columns.
new_columns = ['clean', 'links', 'hashtags', 'mentions']
data[new_columns] = data.apply(text_clean, axis=1, result_type='expand')

data.head()

Unnamed: 0,tweet,date,user_location,id,link,clean,links,hashtags,mentions
0,@CaroGC @NoticiasCaracol Te comparto la logíst...,2020-12-30 23:59:39,Medellín,1344432998536654849,https://twitter.com/i/web/status/1344432998536...,"Te comparto la logística 😘 de la ""dictadura""...",[https://t.co/QtQHRcI4dH],[],"[@CaroGC, @NoticiasCaracol]"
1,🚨🦠 Colombia de aproxima a los 43.000 fallecido...,2020-12-30 23:59:09,Riohacha,1344432871021441033,https://twitter.com/i/web/status/1344432871021...,🚨🦠 Colombia de aproxima a los 43.000 fallecido...,"[https://t.co/xyKFFJ2m5u, https://t.co/PpvGe4Z...",[],[]
2,Los orígenes de la aviación en fotos \n \nhttp...,2020-12-30 23:59:00,🇪🇨🇺🇸,1344432832219738118,https://twitter.com/i/web/status/1344432832219...,Los orígenes de la aviación en fotos \n \n \n\...,"[https://t.co/SQybmE5jWq, https://t.co/UUFyWHs...","[#planespotting, #avion, #coronavirus, #Ecuado...",[]
3,Coronavirus | En Colombia se confirman 1.626.4...,2020-12-30 23:58:34,"Cartagena, Colombia",1344432723230928896,https://twitter.com/i/web/status/1344432723230...,Coronavirus | En Colombia se confirman 1.626.4...,[https://t.co/g6chO6nVbF],[],[]
4,El presidente Iván Duque anunció que #Colombia...,2020-12-30 23:58:32,Bucaramanga,1344432715421143040,https://twitter.com/i/web/status/1344432715421...,El presidente Iván Duque anunció que cerró un...,"[https://t.co/3hIKqqhwOG, https://t.co/tUkh1tS...",[#Colombia],[]


###  `[25 pts]` Punto 2: Hacer una visualización sobre esos tuits

### Similarity

In [206]:
def pre_procesado(texto):
    """Normalize a text before vectorization.

    Lower-cases the text, collapses every run of non-word characters, digits
    and underscores into a single space, and drops the words listed in
    ``stopwords_sp``.

    Args:
        texto: Raw text.

    Returns:
        The remaining words joined by single spaces.
    """
    solo_letras = re.sub(r"[\W\d_]+", " ", texto.lower())
    palabras_utiles = (p for p in solo_letras.split() if p not in stopwords_sp)
    return " ".join(palabras_utiles)

# TF-IDF representation of the cleaned tweets, using the custom preprocessor.
tfidf_vect = TfidfVectorizer(preprocessor=pre_procesado)
tfidf = tfidf_vect.fit_transform(data['clean'].values)

# Vocabulary terms listed in feature-index order.
voc = sorted(tfidf_vect.vocabulary_, key=tfidf_vect.vocabulary_.get)

# Tweet-by-tweet cosine similarity of the dense TF-IDF rows, as a DataFrame
# with positional row/column labels.
temp = pd.DataFrame(cosine_similarity(tfidf.toarray()))
# temp.columns = data.tweet
# temp.index = data.tweet
temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,1.000000,0.015844,0.0,0.019124,0.064435,0.005408,0.0,0.000000,0.000000,0.005319,...,0.006582,0.004003,0.023408,0.000000,0.000000,0.000000,0.002938,0.009192,0.000000,0.000000
1,0.015844,1.000000,0.0,0.184500,0.000000,0.103383,0.0,0.000000,0.000000,0.010463,...,0.128372,0.007874,0.225829,0.068091,0.000000,0.000000,0.005779,0.018080,0.000000,0.000000
2,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.019124,0.184500,0.0,1.000000,0.000000,0.012841,0.0,0.000000,0.000000,0.012630,...,0.161525,0.009504,0.445528,0.442523,0.000000,0.000000,0.006976,0.089733,0.000000,0.000000
4,0.064435,0.000000,0.0,0.000000,1.000000,0.123150,0.0,0.000000,0.000000,0.005924,...,0.000000,0.004457,0.000000,0.000000,0.021416,0.009127,0.003272,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.000000,0.000000,0.0,0.000000,0.009127,0.007067,0.0,0.000000,0.000000,0.006951,...,0.000000,0.005231,0.000000,0.000000,0.025132,1.000000,0.003839,0.000000,0.000000,0.000000
196,0.002938,0.005779,0.0,0.006976,0.003272,0.004506,0.0,0.000000,0.000000,0.004432,...,0.002401,0.249570,0.008539,0.000000,0.009009,0.003839,1.000000,0.003353,0.000000,0.000000
197,0.009192,0.018080,0.0,0.089733,0.000000,0.057251,0.0,0.000000,0.000000,0.006070,...,0.099121,0.004568,0.109834,0.089361,0.181638,0.000000,0.003353,1.000000,0.000000,0.000000
198,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.194452,0.039055,0.000000,...,0.116152,0.000000,0.243009,0.064402,0.139711,0.000000,0.000000,0.000000,1.000000,0.097414


In [207]:
# Heatmap of the tweet-to-tweet similarity matrix.
layout = go.Layout(
    title='🔥 Mapa de calor entre tweets 🔥',
    width=800,
    height=800,
)

trace = go.Heatmap(
    z=temp.values.tolist(),
    x=temp.index.values,
    y=temp.columns.values,
    colorscale='Oranges',
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [210]:
# Hierarchically cluster the rows of the similarity matrix (Ward linkage on
# the condensed distances returned by pdist) and cut the tree at half of the
# largest pairwise distance.
pairwise_distances = sch.distance.pdist(temp)
linkage = sch.linkage(pairwise_distances, method='ward')
idx_to_cluster_array = sch.fcluster(
    linkage,
    pairwise_distances.max() * 0.5,
    criterion='distance',
)
my_idx = idx_to_cluster_array

# Row positions ordered by cluster label, so same-cluster tweets sit together.
idx = np.argsort(idx_to_cluster_array)
temp = temp.copy()

# Apply the same permutation to the rows and to the columns.
temp2 = temp.iloc[idx].T.iloc[idx]

# Label both axes with the original tweet positions.
temp2.index = [f'tweet #{i}' for i in temp2.columns]
temp2.columns = temp2.index

#visual
layout = go.Layout(
    title='🔥 Mapa de calor entre tweets (ordenado) 🔥',
    width=800,
    height=800,
)

trace = go.Heatmap(
    z=temp2.values.tolist(),
    x=temp2.index.values,
    y=temp2.columns.values,
    colorscale='Oranges',
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)