# Building UK Tweets Dataset (CAN)

With the tweets scraped in `2B_scraping_tweets_CAN.ipynb`, we now add the tweets to the DataFrame.

In [26]:
# Import dependancies
import pickle
import pandas as pd
import itertools
from tqdm import tqdm

In [27]:
# Import tweets from saved list .pickle
with open('pickles/tweets_CAN.pkl', 'rb') as f:
    mynewlist = pickle.load(f)

In [28]:
# Since the list is a list of lists we combine it into a single list (Twitter API could only handle 100 requests/call)
final_list = list(itertools.chain.from_iterable(mynewlist))

# Inspect length
print(len(final_list))

616797


In [29]:
# Import missing tweet IDs from saved list .pickle
with open('pickles/errors_CAN.pkl', 'rb') as f:
    errors = pickle.load(f)

In [30]:
# Make into single list
errors = list(itertools.chain.from_iterable(errors))

In [31]:
# Convert to int
errors = [int(x) for x in errors]

In [32]:
# Inspect length
print(len(errors))

134139


In [33]:
## Check that first element of Tweet ID list used to scrape tweets matches that of the DataFrame. ##

# Import list from .pickle
with open('pickles/list_tweet_ids_CAN.pkl', 'rb') as f:
    tweet_ids = pickle.load(f)

# Make into single list
tweet_ids = list(itertools.chain.from_iterable(tweet_ids))

# Inspect first element
print(tweet_ids[0])

1221959892619218944


In [34]:
# Import data
data_import = pd.read_csv("files/project_tweet_data_final_CAN.csv",index_col=0)

In [35]:
# Inspect first five rows
data_import.head(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion
0,1221959892619218944,13752142,wuhan,2020-01-28 00:55:15,0.409,0.468,0.53,0.255,0.462,-1,A
1,1221970757636354048,1031691314536935424,wuhan,2020-01-28 01:38:26,0.501,0.473,0.366,0.294,0.367,0,
2,1221975672891105281,31473811,wuhan,2020-01-28 01:57:58,0.451,0.453,0.435,0.336,0.451,-1,F
3,1221977054750617601,2654356639,wuhan,2020-01-28 02:03:27,0.456,0.576,0.464,0.308,0.44,-1,F
4,1221983093323493376,3410276675,wuhan,2020-01-28 02:27:27,0.429,0.53,0.446,0.258,0.414,-1,F


In [36]:
# Remove rows that we did not find any tweets for
data_import = data_import[~data_import.tweet_ID.isin(errors)]

# Inspect new shape
data_import.shape

(616797, 11)

In [37]:
# Inspect first five elements
data_import.head(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion
0,1221959892619218944,13752142,wuhan,2020-01-28 00:55:15,0.409,0.468,0.53,0.255,0.462,-1,A
1,1221970757636354048,1031691314536935424,wuhan,2020-01-28 01:38:26,0.501,0.473,0.366,0.294,0.367,0,
2,1221975672891105281,31473811,wuhan,2020-01-28 01:57:58,0.451,0.453,0.435,0.336,0.451,-1,F
5,1221985348076687360,1072771,wuhan,2020-01-28 02:36:25,0.436,0.526,0.415,0.232,0.379,-1,F
6,1221989558696456194,401098282,wuhan,2020-01-28 02:53:08,0.501,0.436,0.473,0.351,0.365,0,


In [38]:
# Make new column for tweets
data_import['tweet'] = final_list

In [39]:
# Inspect first five rows
data_import.head(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet
0,1221959892619218944,13752142,wuhan,2020-01-28 00:55:15,0.409,0.468,0.53,0.255,0.462,-1,A,@maurerbot @JustinTrudeau There's already What...
1,1221970757636354048,1031691314536935424,wuhan,2020-01-28 01:38:26,0.501,0.473,0.366,0.294,0.367,0,,"""Tracking coronavirus: Map, data and timeline""..."
2,1221975672891105281,31473811,wuhan,2020-01-28 01:57:58,0.451,0.453,0.435,0.336,0.451,-1,F,@AliEhsassi we have a Canadian baby and her mo...
5,1221985348076687360,1072771,wuhan,2020-01-28 02:36:25,0.436,0.526,0.415,0.232,0.379,-1,F,3 are currently under observation in Quebec fo...
6,1221989558696456194,401098282,wuhan,2020-01-28 02:53:08,0.501,0.436,0.473,0.351,0.365,0,,@Franktmcveety Is this the official site for h...


In [40]:
# Inspect last five rows
data_import.tail(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet
750931,1285835503934169088,282716700,covid,2020-07-22 07:14:08,0.438,0.486,0.438,0.201,0.372,-1,F,"""Covid-19 coronavirus: Is Trump's about-turn o..."
750932,1285835843911839745,1497174692,covid,2020-07-22 07:15:29,0.33,0.604,0.529,0.157,0.545,-1,F,@660NEWS People are still getting sick. The vi...
750933,1285835880784158720,620886049,covid,2020-07-22 07:15:38,0.485,0.502,0.419,0.307,0.426,0,,Over 60 cases of COVID-19 related to Kelowna c...
750934,1285836193880592384,1044971320998588417,covid,2020-07-22 07:16:52,0.478,0.441,0.447,0.28,0.424,-1,A,COVID-19 caused the greatest damage to the glo...
750935,1285836349170491393,2388168248,covid,2020-07-22 07:17:29,0.501,0.444,0.407,0.384,0.374,0,,Real estate industry emerging from COVID-19 bu...


In [41]:
# Add new attribute containing length of tweet
data_import['len_content'] = data_import['tweet'].str.split().str.len()

In [42]:
# Inspect first five rows
data_import.head(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content
0,1221959892619218944,13752142,wuhan,2020-01-28 00:55:15,0.409,0.468,0.53,0.255,0.462,-1,A,@maurerbot @JustinTrudeau There's already What...,28
1,1221970757636354048,1031691314536935424,wuhan,2020-01-28 01:38:26,0.501,0.473,0.366,0.294,0.367,0,,"""Tracking coronavirus: Map, data and timeline""...",16
2,1221975672891105281,31473811,wuhan,2020-01-28 01:57:58,0.451,0.453,0.435,0.336,0.451,-1,F,@AliEhsassi we have a Canadian baby and her mo...,34
5,1221985348076687360,1072771,wuhan,2020-01-28 02:36:25,0.436,0.526,0.415,0.232,0.379,-1,F,3 are currently under observation in Quebec fo...,38
6,1221989558696456194,401098282,wuhan,2020-01-28 02:53:08,0.501,0.436,0.473,0.351,0.365,0,,@Franktmcveety Is this the official site for h...,40


In [43]:
# Reset indices
data_import = data_import.reset_index(drop=True)

# Inspect last five rows
data_import.tail(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content
616792,1285835503934169088,282716700,covid,2020-07-22 07:14:08,0.438,0.486,0.438,0.201,0.372,-1,F,"""Covid-19 coronavirus: Is Trump's about-turn o...",15
616793,1285835843911839745,1497174692,covid,2020-07-22 07:15:29,0.33,0.604,0.529,0.157,0.545,-1,F,@660NEWS People are still getting sick. The vi...,54
616794,1285835880784158720,620886049,covid,2020-07-22 07:15:38,0.485,0.502,0.419,0.307,0.426,0,,Over 60 cases of COVID-19 related to Kelowna c...,10
616795,1285836193880592384,1044971320998588417,covid,2020-07-22 07:16:52,0.478,0.441,0.447,0.28,0.424,-1,A,COVID-19 caused the greatest damage to the glo...,31
616796,1285836349170491393,2388168248,covid,2020-07-22 07:17:29,0.501,0.444,0.407,0.384,0.374,0,,Real estate industry emerging from COVID-19 bu...,20


In [44]:
# Save to CSV
data_import.to_csv("files/cleaned_twitter_data_CAN.csv")

In [45]:
# Import again for inspection
data_import_test = pd.read_csv("files/cleaned_twitter_data_CAN.csv",index_col=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [46]:
# Inspect first five elements
data_import_test.head(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content
0,1.22196e+18,13752140.0,wuhan,2020-01-28 00:55:15,0.409,0.468,0.53,0.255,0.462,-1.0,A,@maurerbot @JustinTrudeau There's already What...,28.0
1,1.221971e+18,1.031691e+18,wuhan,2020-01-28 01:38:26,0.501,0.473,0.366,0.294,0.367,0.0,,"""Tracking coronavirus: Map, data and timeline""...",16.0
2,1.221976e+18,31473810.0,wuhan,2020-01-28 01:57:58,0.451,0.453,0.435,0.336,0.451,-1.0,F,@AliEhsassi we have a Canadian baby and her mo...,34.0
3,1.221985e+18,1072771.0,wuhan,2020-01-28 02:36:25,0.436,0.526,0.415,0.232,0.379,-1.0,F,3 are currently under observation in Quebec fo...,38.0
4,1.22199e+18,401098300.0,wuhan,2020-01-28 02:53:08,0.501,0.436,0.473,0.351,0.365,0.0,,@Franktmcveety Is this the official site for h...,40.0


In [47]:
# Inspect last five elements
data_import_test.tail(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content
616792,1.285836e+18,282716700.0,covid,2020-07-22 07:14:08,0.438,0.486,0.438,0.201,0.372,-1.0,F,"""Covid-19 coronavirus: Is Trump's about-turn o...",15.0
616793,1.285836e+18,1497175000.0,covid,2020-07-22 07:15:29,0.33,0.604,0.529,0.157,0.545,-1.0,F,@660NEWS People are still getting sick. The vi...,54.0
616794,1.285836e+18,620886000.0,covid,2020-07-22 07:15:38,0.485,0.502,0.419,0.307,0.426,0.0,,Over 60 cases of COVID-19 related to Kelowna c...,10.0
616795,1.285836e+18,1.044971e+18,covid,2020-07-22 07:16:52,0.478,0.441,0.447,0.28,0.424,-1.0,A,COVID-19 caused the greatest damage to the glo...,31.0
616796,1.285836e+18,2388168000.0,covid,2020-07-22 07:17:29,0.501,0.444,0.407,0.384,0.374,0.0,,Real estate industry emerging from COVID-19 bu...,20.0
