In [1]:
#Import libraries for NLTK
import spacy
import nltk
from nltk.corpus import stopwords
import re

In [2]:
#loading pre-define language model from english
nlp = spacy.load('en_core_web_sm')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
#Import libraries for Twitter API
import time
import webbrowser
import tweepy

## Setting up API Keys

## Authenticating - Getting User-pin

In [9]:
#Finalizing authentication
api = tweepy.API(auth)

# **Accesing Twitter API**


---



## 1) Initializing list to store raw tweets

In [10]:
tweets = []

## 2) Setting up Listener Function to extract tweets

In [11]:
class MyStreamListener(tweepy.StreamListener):
    def __init__(self, time_limit=1):
        self.start_time = time.time()
        self.limit = time_limit
        super(MyStreamListener, self).__init__()

    def on_status(self, status):
        tweets.append(status.text)

        if (time.time() - self.start_time) < self.limit:
            return True
        else:
            return False

## 3) Initializing Streamer with topic:

In [12]:
myStream = tweepy.Stream(auth=api.auth, listener=MyStreamListener(time_limit=30))
myStream.filter(track=['Donald Trump'])

## 4) Visualizing Raw Tweets

In [13]:
tweets

['RT @CaucasianJames: i’m so much cooler than donald trump',
 "That's that's the best way",
 'RT @PinkNews: Biden supporters celebrate in the street singing YMCA, reclaiming the gay anthem from soon-to-be former president Trump https…',
 'WOW THE CRIES OF DESPERATION! cnn KNOW THEIR GOING DOWN WITH mafiosa joe!!!',
 '@tattoosandbones @mschlapp 18 has always been the voting age. President Donald J Trump voted in person. Investigate… https://t.co/hmrPWptJvn',
 'sounds a bit fake newsy for me...dunno, is this in the Enquirer? must be true then 😅\n\nMelania to divorce Donald Tru… https://t.co/tWTWiKa6rJ',
 "RT @Mediavenir: 🇺🇸 ALERTE - Donald #Trump s'est de nouveau mis en colère sur #Twitter ce soir : « Depuis quand les médias de masse désignen…",
 '📊 Se tu avessi votato alle elezioni americane del 4 Novembre 2020, per chi avresti votato? \nhttps://t.co/poEQ7AoOgn… https://t.co/UmkNCJGAjz',
 '@col_malena @auto_goon @TheRightMelissa Yup. I do remember how you guys cried a four years ago. \n

# **Preprocessing Tweets**


---



*   Removing links
*   Remove special characters
*   Remove emojis
*   Converting to lower case
*   Removing stop words



In [14]:
def preprocess(tweets):

    for tweet in range(len(tweets)): 
    
        # Remove urls
        letters_only_text = re.sub(r"https:(\/\/t\.co\/([A-Za-z0-9]|[A-Za-z]){10})", '', tweets[tweet], flags=re.MULTILINE)

        # keep only words
        letters_only_text = re.sub(r'[^a-zA-Z\s]', " ", letters_only_text)

        # convert to lower case and split 
        words = letters_only_text.lower().split()
        
        # remove stopwords
        additional  = ['rt','rts','retweet']
        stopword_set = set().union(stopwords.words('english'),additional)
        meaningful_words = [w for w in words if w not in stopword_set]
        
        # join the cleaned words in a list
        tweets[tweet] = " ".join(meaningful_words)

    return tweets

In [15]:
#Viewing the processed tweets
preprocess(tweets)


['caucasianjames much cooler donald trump',
 'best way',
 'pinknews biden supporters celebrate street singing ymca reclaiming gay anthem soon former president trump https',
 'wow cries desperation cnn know going mafiosa joe',
 'tattoosandbones mschlapp always voting age president donald j trump voted person investigate',
 'sounds bit fake newsy dunno enquirer must true melania divorce donald tru',
 'mediavenir alerte donald trump est de nouveau mis en col sur twitter ce soir depuis quand les dias de masse signen',
 'se tu avessi votato alle elezioni americane del novembre per chi avresti votato',
 'col malena auto goon therightmelissa yup remember guys cried four years ago ex',
 'bronhilljr donald trump still president secret service january th',
 'bettybowers hunter biden hurt joe biden election father still loves donald trump jr helped donald trump',
 'independent hillary clinton speech perfectly predicted trump would react losing election',
 'hashimotokotoe pr',
 'lyrixxfn alt reald

## Storing - Label, word and number of times word appears into dictionary of dictionaries




In [16]:
#Initializing empty list to store labels
dictionary_label_word_count = {}

In [17]:
def add_label_word_count_to_dictionary(tweets, dictionary_label_word_count):
  #Iterating through all tweets
  for tweet in range(len(tweets)): 
      
      #Using nlp library to parse out entities
      tweet_entities = nlp(tweets[tweet])

      #Iterating through all entities from tweet
      for entity in tweet_entities.ents:

        # entity.text = the entity,  entity.label_ = type of entity
        #CASE 1 : Label DNE
        if entity.label_ not in dictionary_label_word_count.keys(): 
          dictionary_label_word_count[entity.label_] = {}
          dictionary_label_word_count[entity.label_][entity.text] = 1
        #CASE 2: Label exists 
        else:                                          
          #CASE A: Word DNE - So we just add the a count to 1
          if entity.text not in dictionary_label_word_count[entity.label_].keys():
            dictionary_label_word_count[entity.label_][entity.text] = 1
          #CASE B: Word Exists
          else:
            dictionary_label_word_count[entity.label_][entity.text] += 1
        


In [18]:
add_label_word_count_to_dictionary(tweets, dictionary_label_word_count)

## Declaring function to print entities and counts stored in dictionary

This function will print entities by label, followed by the word and  the number of times the appears in different tweets.

In [23]:
def print_Entity_Count(dictionary_label_word_count):
  #Count the different labels
  labels = 0
  #Count the total number of entities
  entities = 0

  #Print entities
  for label_key in dictionary_label_word_count:
    print('===========================\n',label_key, '\n===========================')
    labels += 1
    for word_key in dictionary_label_word_count[label_key]:
      entities += 1
      print(word_key, dictionary_label_word_count[label_key][word_key])

    #Print new line after every label
    print('\n')
  
  print('Different Labels:', labels, ' Different Entities: ', entities)

# View Entity Counts for Listner set to 30 Seconds

In [24]:
print_Entity_Count(dictionary_label_word_count)

 ORG 
pinknews biden supporters celebrate 3
cnn 40
est de nouveau 39
les dias de masse 37
malena auto goon 3
lyrixxfn alt realdonaldtrump 3
equipo de 3
joyannreid point trump supporters 48
white house 64
colheita de provas 7
fox news 30
fox fix news news 3
glennkirschner 83
brandonstraka dickmorris newsmax 3
jchaufou mythosstory royalmrbadnews sirpork 3
julito one 3
ce qui 12
ce moment 10
darranmarshall biden team steps 6
elturco sus 3
bairrista pol tica internacional 3
volta para 3
su divorcio de donald 4
resistance boum dans 3
les dents des 3
elliottcmorgan 3
luscas freddie mercury se revirando 25
raciesicilian years 20
mediavenir alerte info melania 64
ace ventura le gusta 3
prem league 3
digital 3
jr refuses 3
originator university 3
lidiajozkowicz 3
media pick 3
mjliu conniecchristia ivankatrump 3
al pach 3
los tendr 3
contra 12
ret rica 3
jilldlawrence normornstein 4
sandraborda riesgo de terminar pero en 3
una 4
marcacaputo one 12
lebanon criminals exchange 3
muysufrida es casua

# View Entity Counts for Listner set to 2 Minutes

In [25]:
#Initialize
myStream = tweepy.Stream(auth=api.auth, listener=MyStreamListener(time_limit=120))
myStream.filter(track=['Donald Trump'])

#Processed tweets
preprocess(tweets)

#Add label, word and counts to dictionary from these tweets
add_label_word_count_to_dictionary(tweets, dictionary_label_word_count)

#print word and counts grouped by label
print_Entity_Count(dictionary_label_word_count)

 ORG 
pinknews biden supporters celebrate 4
cnn 85
est de nouveau 64
les dias de masse 61
malena auto goon 4
lyrixxfn alt realdonaldtrump 4
equipo de 4
joyannreid point trump supporters 90
white house 128
colheita de provas 12
fox news 60
fox fix news news 4
glennkirschner 163
brandonstraka dickmorris newsmax 4
jchaufou mythosstory royalmrbadnews sirpork 4
julito one 5
ce qui 18
ce moment 15
darranmarshall biden team steps 9
elturco sus 4
bairrista pol tica internacional 4
volta para 4
su divorcio de donald 7
resistance boum dans 4
les dents des 4
elliottcmorgan 4
luscas freddie mercury se revirando 45
raciesicilian years 33
mediavenir alerte info melania 128
ace ventura le gusta 4
prem league 4
digital 4
jr refuses 4
originator university 4
lidiajozkowicz 4
media pick 4
mjliu conniecchristia ivankatrump 4
al pach 4
los tendr 4
contra 18
ret rica 4
jilldlawrence normornstein 6
sandraborda riesgo de terminar pero en 4
una 7
marcacaputo one 19
lebanon criminals exchange 4
muysufrida es c

# View Entity Counts for Listner set to 5 Minutes

In [26]:
#Initialize
myStream = tweepy.Stream(auth=api.auth, listener=MyStreamListener(time_limit=300))
myStream.filter(track=['Donald Trump'])

#Viewing the processed tweets
preprocess(tweets)

#Add label, word and counts to dictionary
add_label_word_count_to_dictionary(tweets, dictionary_label_word_count)

#print word and counts grouped by label
print_Entity_Count(dictionary_label_word_count)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
pinknews biden supporters celebrate 5
cnn 163
est de nouveau 93
les dias de masse 89
malena auto goon 5
lyrixxfn alt realdonaldtrump 5
equipo de 6
joyannreid point trump supporters 150
white house 218
colheita de provas 19
fox news 107
fox fix news news 5
glennkirschner 288
brandonstraka dickmorris newsmax 5
jchaufou mythosstory royalmrbadnews sirpork 5
julito one 7
ce qui 25
ce moment 21
darranmarshall biden team steps 13
elturco sus 5
bairrista pol tica internacional 5
volta para 5
su divorcio de donald 10
resistance boum dans 5
les dents des 5
elliottcmorgan 5
luscas freddie mercury se revirando 74
raciesicilian years 65
mediavenir alerte info melania 211
ace ventura le gusta 5
prem league 5
digital 5
jr refuses 5
originator university 5
lidiajozkowicz 5
media pick 5
mjliu conniecchristia ivankatrump 5
al pach 5
los tendr 5
contra 26
ret rica 5
jilldlawrence normornstein 8
sandraborda riesgo de terminar pero en 5
una 1

# View Entity Counts for Listner set to 7.5 Minutes

In [27]:
#Initialize
myStream = tweepy.Stream(auth=api.auth, listener=MyStreamListener(time_limit=450))
myStream.filter(track=['Donald Trump'])

#Viewing the processed tweets
preprocess(tweets)

#Add label, word and counts to dictionary
add_label_word_count_to_dictionary(tweets, dictionary_label_word_count)

#print word and counts grouped by label
print_Entity_Count(dictionary_label_word_count)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
pedig hozz 6
k zdelm 6
rv nyt 6
jim carrey joe biden 11
trump loooo ser 7
violette johan 14
louisrielfrance je 6
obliger ce que mes imp ts 6
kamalaharris kamala 14
joe 98
mbrdelcielo jim carrey lo 6
adrienne fox amerindivis jaketapper 6
quando grande 50
presidente indagou 50
kenneth 22
joe creating generation 22
projectlincoln donald 9
pero aqu espa 17
illness lmfaoo 6
g nnt hate 6
ben franklin 6
quipe ont instrumentalis la communaut des chr 6
favdockofshame donald 29
vossysen andrewvossy donald 6
lo dice la 6
gringa est n convencidos de que hubo trampa el 6
actuel nateur 53
tes sur le 53
yea 17
nerizilber netanyahu 15
primero donald trump 6
segundo aitana 6
royblunt donald 6
podliczaj 6
wszystkie osi 6
ga g os w 6
aprendido mucho 18
menschen k mmern 5
gestimmt haben 5
steveschmidtses donald trump 64
drpaulgosar liljon 5
josh hammer 226
rouhani 8
alex trebek 7
peter 7
bruh lmao 5
gwgoldnadel ne le 55
kitchenette il 55
jo 

# View Entity Counts for Listner set to 10 Minutes

In [28]:
#Initialize
myStream = tweepy.Stream(auth=api.auth, listener=MyStreamListener(time_limit=600))
myStream.filter(track=['Donald Trump'])

#Viewing the processed tweets
preprocess(tweets)

#Add label, word and counts to dictionary
add_label_word_count_to_dictionary(tweets, dictionary_label_word_count)

#print word and counts grouped by label
print_Entity_Count(dictionary_label_word_count)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
jugando golf creen que 5
algunos miembros clave de la opep 5
que las 8
kekalahan donald 7
christopherhope yo daughter 5
referendum brexit 5
redo 5
katzenkotzenoft neben donald 5
erscheint alles 5
theclassicfon para 5
forma que donald 5
de ge 5
vas donald 5
du quartier 5
meu deus 9
royblunt thisweekabc 5
forsythjenn 5
ptbrasil lula 17
afirma que mundo 17
que derrotou 17
co q zef 17
snl joe biden 5
emron joe donald biden 5
fani kayode 11
puede ser mi dolo 5
ntos de qu tama 5
mmnjug donald 5
lose trump 5
pauleyteeks donald trump 5
feminisciencia abundan tw 5
de un hombre 5
hielo por marcos mel ndez 5
kurteichenwald donald 5
charleeoddie 5
joe biden story motivational 5
koeppelroger der rechtsstaat 5
refu 5
mentiras nossa meta 5
bidenharis 7
la ser es hora de ser un buen perdedor 24
jasondashbailey 5
donald nepotism 8
rpdandy therickwilson ruthfordkarla newt gingrich 5
vatican 5
melania trump 8
para ace 5
jacksfilms hello don