# Building UK Tweets Dataset (CAN)

With the tweets scraped in `4A_scraping_usernames_CAN.ipynb`, we now add the tweets to the DataFrame.

In [20]:
# Import dependancies
import pickle
import pandas as pd
import itertools
from tqdm import tqdm

In [21]:
# Import tweets from saved list .pickle
with open('pickles/users_CAN.pkl', 'rb') as f:
    mynewlist = pickle.load(f)

In [22]:
# Since the list is a list of lists we combine it into a single list (Twitter API could only handle 100 requests/call)
final_list = list(itertools.chain.from_iterable(mynewlist))

# Inspect length
print(len(final_list))

148327


In [23]:
# Import missing tweet IDs from saved list .pickle
with open('pickles/errors_users_CAN.pkl', 'rb') as f:
    errors = pickle.load(f)

In [24]:
# Make into single list
errors = list(itertools.chain.from_iterable(errors))

In [25]:
# Inspect length
print(len(errors))

91


In [26]:
## Check that first element of user ID list used to scrape users matches that of the DataFrame. ##

# Import list from .pickle
with open('pickles/list_user_ids_CAN.pkl', 'rb') as f:
    user_ids = pickle.load(f)

# Make into single list
user_ids = list(itertools.chain.from_iterable(user_ids))

# Inspect first element
print(user_ids[0])

250170548


In [27]:
# Import data
data_import = pd.read_csv("files/cleaned_twitter_data_CAN.csv",index_col=0, dtype = {'user_id': str})

  exec(code_obj, self.user_global_ns, self.user_ns)


In [28]:
# Inspect first five rows
data_import.head(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content
0,1.22196e+18,13752142,wuhan,2020-01-28 00:55:15,0.409,0.468,0.53,0.255,0.462,-1.0,A,@maurerbot @JustinTrudeau There's already What...,28.0
1,1.221971e+18,1031691314536935424,wuhan,2020-01-28 01:38:26,0.501,0.473,0.366,0.294,0.367,0.0,,"""Tracking coronavirus: Map, data and timeline""...",16.0
2,1.221976e+18,31473811,wuhan,2020-01-28 01:57:58,0.451,0.453,0.435,0.336,0.451,-1.0,F,@AliEhsassi we have a Canadian baby and her mo...,34.0
3,1.221985e+18,1072771,wuhan,2020-01-28 02:36:25,0.436,0.526,0.415,0.232,0.379,-1.0,F,3 are currently under observation in Quebec fo...,38.0
4,1.22199e+18,401098282,wuhan,2020-01-28 02:53:08,0.501,0.436,0.473,0.351,0.365,0.0,,@Franktmcveety Is this the official site for h...,40.0


In [29]:
# Remove rows that we did not find any tweets for
data_import = data_import[~data_import.user_id.isin(errors)]

# Inspect new shape
data_import.shape

(732734, 13)

In [30]:
# Inspect first five elements
data_import.head(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content
0,1.22196e+18,13752142,wuhan,2020-01-28 00:55:15,0.409,0.468,0.53,0.255,0.462,-1.0,A,@maurerbot @JustinTrudeau There's already What...,28.0
1,1.221971e+18,1031691314536935424,wuhan,2020-01-28 01:38:26,0.501,0.473,0.366,0.294,0.367,0.0,,"""Tracking coronavirus: Map, data and timeline""...",16.0
2,1.221976e+18,31473811,wuhan,2020-01-28 01:57:58,0.451,0.453,0.435,0.336,0.451,-1.0,F,@AliEhsassi we have a Canadian baby and her mo...,34.0
3,1.221985e+18,1072771,wuhan,2020-01-28 02:36:25,0.436,0.526,0.415,0.232,0.379,-1.0,F,3 are currently under observation in Quebec fo...,38.0
4,1.22199e+18,401098282,wuhan,2020-01-28 02:53:08,0.501,0.436,0.473,0.351,0.365,0.0,,@Franktmcveety Is this the official site for h...,40.0


In [31]:
# Inspect length
print(len(user_ids))

148418


In [32]:
# Remove users that we could not scrape username for
wo_error_user_ids = [x for x in user_ids if x not in errors]

In [33]:
# Inspect length
print(len(wo_error_user_ids))

148327


In [34]:
# Check that length matches
print(len(final_list))

148327


In [35]:
# Make dictionary with user IDs as keys and Twitter usernames as value
id_to_name = dict(zip(user_ids, final_list))

In [36]:
# Make new column with usernames
data_import['username'] = data_import['user_id'].map(id_to_name)

In [37]:
# Inspect first five rows
data_import.head(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content,username
0,1.22196e+18,13752142,wuhan,2020-01-28 00:55:15,0.409,0.468,0.53,0.255,0.462,-1.0,A,@maurerbot @JustinTrudeau There's already What...,28.0,StMinaHamilton
1,1.221971e+18,1031691314536935424,wuhan,2020-01-28 01:38:26,0.501,0.473,0.366,0.294,0.367,0.0,,"""Tracking coronavirus: Map, data and timeline""...",16.0,WalkinRobinL
2,1.221976e+18,31473811,wuhan,2020-01-28 01:57:58,0.451,0.453,0.435,0.336,0.451,-1.0,F,@AliEhsassi we have a Canadian baby and her mo...,34.0,WillWalkerley
3,1.221985e+18,1072771,wuhan,2020-01-28 02:36:25,0.436,0.526,0.415,0.232,0.379,-1.0,F,3 are currently under observation in Quebec fo...,38.0,Road_Buster84
4,1.22199e+18,401098282,wuhan,2020-01-28 02:53:08,0.501,0.436,0.473,0.351,0.365,0.0,,@Franktmcveety Is this the official site for h...,40.0,burlingtonlaw


In [38]:
# Inspect tail
data_import.tail(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content,username
616792,1.285836e+18,282716700,covid,2020-07-22 07:14:08,0.438,0.486,0.438,0.201,0.372,-1.0,F,"""Covid-19 coronavirus: Is Trump's about-turn o...",15.0,2dCale
616793,1.285836e+18,1497174692,covid,2020-07-22 07:15:29,0.33,0.604,0.529,0.157,0.545,-1.0,F,@660NEWS People are still getting sick. The vi...,54.0,downtownalice
616794,1.285836e+18,620886049,covid,2020-07-22 07:15:38,0.485,0.502,0.419,0.307,0.426,0.0,,Over 60 cases of COVID-19 related to Kelowna c...,10.0,maiaswenson
616795,1.285836e+18,1044971320998588417,covid,2020-07-22 07:16:52,0.478,0.441,0.447,0.28,0.424,-1.0,A,COVID-19 caused the greatest damage to the glo...,31.0,Syncwithmoi
616796,1.285836e+18,2388168248,covid,2020-07-22 07:17:29,0.501,0.444,0.407,0.384,0.374,0.0,,Real estate industry emerging from COVID-19 bu...,20.0,mp3mp4pdf


In [39]:
# Save to CSV
data_import.to_csv("files/data_w_usernames_CAN.csv")

In [40]:
# Import again for inspection
data_import_test = pd.read_csv("files/data_w_usernames_CAN.csv",index_col=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [41]:
# Inspect first five elements
data_import_test.head(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content,username
0,1.22196e+18,13752140.0,wuhan,2020-01-28 00:55:15,0.409,0.468,0.53,0.255,0.462,-1.0,A,@maurerbot @JustinTrudeau There's already What...,28.0,StMinaHamilton
1,1.221971e+18,1.031691e+18,wuhan,2020-01-28 01:38:26,0.501,0.473,0.366,0.294,0.367,0.0,,"""Tracking coronavirus: Map, data and timeline""...",16.0,WalkinRobinL
2,1.221976e+18,31473810.0,wuhan,2020-01-28 01:57:58,0.451,0.453,0.435,0.336,0.451,-1.0,F,@AliEhsassi we have a Canadian baby and her mo...,34.0,WillWalkerley
3,1.221985e+18,1072771.0,wuhan,2020-01-28 02:36:25,0.436,0.526,0.415,0.232,0.379,-1.0,F,3 are currently under observation in Quebec fo...,38.0,Road_Buster84
4,1.22199e+18,401098300.0,wuhan,2020-01-28 02:53:08,0.501,0.436,0.473,0.351,0.365,0.0,,@Franktmcveety Is this the official site for h...,40.0,burlingtonlaw


In [42]:
# Inspect last five elements
data_import_test.tail(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content,username
616792,1.285836e+18,282716700.0,covid,2020-07-22 07:14:08,0.438,0.486,0.438,0.201,0.372,-1.0,F,"""Covid-19 coronavirus: Is Trump's about-turn o...",15.0,2dCale
616793,1.285836e+18,1497175000.0,covid,2020-07-22 07:15:29,0.33,0.604,0.529,0.157,0.545,-1.0,F,@660NEWS People are still getting sick. The vi...,54.0,downtownalice
616794,1.285836e+18,620886000.0,covid,2020-07-22 07:15:38,0.485,0.502,0.419,0.307,0.426,0.0,,Over 60 cases of COVID-19 related to Kelowna c...,10.0,maiaswenson
616795,1.285836e+18,1.044971e+18,covid,2020-07-22 07:16:52,0.478,0.441,0.447,0.28,0.424,-1.0,A,COVID-19 caused the greatest damage to the glo...,31.0,Syncwithmoi
616796,1.285836e+18,2388168000.0,covid,2020-07-22 07:17:29,0.501,0.444,0.407,0.384,0.374,0.0,,Real estate industry emerging from COVID-19 bu...,20.0,mp3mp4pdf
