# Building UK Tweets Dataset (UK)

With the tweets scraped in `4A_scraping_usernames_UK.ipynb`, we now add the tweets to the DataFrame.

In [60]:
# Import dependancies
import pickle
import pandas as pd
import itertools
from tqdm import tqdm

In [61]:
# Import tweets from saved list .pickle
with open('pickles/users_UK.pkl', 'rb') as f:
    mynewlist = pickle.load(f)

In [62]:
# Since the list is a list of lists we combine it into a single list (Twitter API could only handle 100 requests/call)
final_list = list(itertools.chain.from_iterable(mynewlist))

# Inspect length
print(len(final_list))

256268


In [63]:
# Import missing tweet IDs from saved list .pickle
with open('pickles/errors_users_UK.pkl', 'rb') as f:
    errors = pickle.load(f)

In [64]:
# Make into single list
errors = list(itertools.chain.from_iterable(errors))

In [65]:
# Inspect length
print(len(errors))

630


In [66]:
## Check that first element of user ID list used to scrape users matches that of the DataFrame. ##

# Import list from .pickle
with open('pickles/list_user_ids_UK.pkl', 'rb') as f:
    user_ids = pickle.load(f)

# Make into single list
user_ids = list(itertools.chain.from_iterable(user_ids))

# Inspect first element
print(user_ids[0])

1191354362649600000


In [67]:
# Import data
data_import = pd.read_csv("files/cleaned_twitter_data_UK.csv",index_col=0, dtype = {'user_id': str})

  exec(code_obj, self.user_global_ns, self.user_ns)


In [68]:
# Inspect first five rows
data_import.head(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content
1,1234290271774543872,248819855,covid,2020-03-02 01:31:47,0.38,0.611,0.412,0.218,0.46,-1.0,F,HEALTH EMERGENCY / INTERNATIONAL COVID-19 CORO...,8.0
2,1234291397202776065,248819855,covid,2020-03-02 01:36:15,0.408,0.505,0.422,0.218,0.417,-1.0,F,HEALTH EMERGENCY / INTERNATIONAL COVID-19 CORO...,23.0
3,1234292673260879872,447663369,corona,2020-03-02 01:41:20,0.448,0.566,0.419,0.267,0.415,-1.0,F,Corruption scaring corona virus from Africa &g...,7.0
5,1234294077564375040,1903820431,corona,2020-03-02 01:46:54,0.49,0.529,0.447,0.426,0.475,0.0,,Starting up with a cold and lightly concerned ...,41.0
6,1234294853330182144,20973388,covid,2020-03-02 01:49:59,0.406,0.613,0.458,0.271,0.46,-1.0,F,"The UK government is considering all options, ...",27.0


In [69]:
# Remove rows that we did not find any tweets for
data_import = data_import[~data_import.user_id.isin(errors)]

# Inspect new shape
data_import.shape

(591779, 13)

In [70]:
# Inspect first five elements
data_import.head(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content
1,1234290271774543872,248819855,covid,2020-03-02 01:31:47,0.38,0.611,0.412,0.218,0.46,-1.0,F,HEALTH EMERGENCY / INTERNATIONAL COVID-19 CORO...,8.0
2,1234291397202776065,248819855,covid,2020-03-02 01:36:15,0.408,0.505,0.422,0.218,0.417,-1.0,F,HEALTH EMERGENCY / INTERNATIONAL COVID-19 CORO...,23.0
3,1234292673260879872,447663369,corona,2020-03-02 01:41:20,0.448,0.566,0.419,0.267,0.415,-1.0,F,Corruption scaring corona virus from Africa &g...,7.0
5,1234294077564375040,1903820431,corona,2020-03-02 01:46:54,0.49,0.529,0.447,0.426,0.475,0.0,,Starting up with a cold and lightly concerned ...,41.0
6,1234294853330182144,20973388,covid,2020-03-02 01:49:59,0.406,0.613,0.458,0.271,0.46,-1.0,F,"The UK government is considering all options, ...",27.0


In [71]:
# Inspect length
print(len(user_ids))

256898


In [72]:
# Remove users that we could not scrape username for
wo_error_user_ids = [x for x in user_ids if x not in errors]

In [73]:
# Inspect length
print(len(wo_error_user_ids))

256268


In [74]:
# Check that length matches
print(len(final_list))

256268


In [75]:
# Make dictionary with user IDs as keys and Twitter usernames as value
id_to_name = dict(zip(user_ids, final_list))

In [76]:
# Make new column with usernames
data_import['username'] = data_import['user_id'].map(id_to_name)

In [77]:
# Inspect first five rows
data_import.head(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content,username
1,1234290271774543872,248819855,covid,2020-03-02 01:31:47,0.38,0.611,0.412,0.218,0.46,-1.0,F,HEALTH EMERGENCY / INTERNATIONAL COVID-19 CORO...,8.0,Innovatorunit
2,1234291397202776065,248819855,covid,2020-03-02 01:36:15,0.408,0.505,0.422,0.218,0.417,-1.0,F,HEALTH EMERGENCY / INTERNATIONAL COVID-19 CORO...,23.0,Innovatorunit
3,1234292673260879872,447663369,corona,2020-03-02 01:41:20,0.448,0.566,0.419,0.267,0.415,-1.0,F,Corruption scaring corona virus from Africa &g...,7.0,Plugindave
5,1234294077564375040,1903820431,corona,2020-03-02 01:46:54,0.49,0.529,0.447,0.426,0.475,0.0,,Starting up with a cold and lightly concerned ...,41.0,AndyShawWT
6,1234294853330182144,20973388,covid,2020-03-02 01:49:59,0.406,0.613,0.458,0.271,0.46,-1.0,F,"The UK government is considering all options, ...",27.0,PatrickDoyle83


In [80]:
# Inspect tail
data_import.tail(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content,username
733329,1285835856650043393,334091760,covid,2020-07-22 07:15:32,0.587,0.377,0.331,0.363,0.324,1.0,H,VDMA highlights textile machinery role in Covi...,9.0,cptntommy
733330,1285835987424337921,75508547,corona,2020-07-22 07:16:03,0.25,0.507,0.608,0.141,0.508,-2.0,A,@sidudeja But why would you want anyone dead? ...,24.0,YLALawyers
733331,1285836106496385025,1096478949725614080,covid,2020-07-22 07:16:32,0.325,0.537,0.504,0.186,0.483,-1.0,F,@LondonPehwb @grantshapps Absolutely. However...,49.0,jennigbradshaw
733332,1285836204559290369,162753440,covid,2020-07-22 07:16:55,0.359,0.563,0.556,0.287,0.438,-1.0,F,So I realised this morning I feel like a #Chal...,35.0,StephenGulli
733333,1285836317042135040,421047121,covid,2020-07-22 07:17:22,0.388,0.519,0.55,0.238,0.46,-1.0,A,"Israelis protest against Netanyahu, gov't hand...",17.0,hfxtransit


In [81]:
# Save to CSV
data_import.to_csv("files/data_w_usernames_UK.csv")

In [84]:
# Import again for inspection
data_import_test = pd.read_csv("files/data_w_usernames_UK.csv",index_col=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [85]:
# Inspect first five elements
data_import_test.head(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content,username
1,1234290271774543872,248819900.0,covid,2020-03-02 01:31:47,0.38,0.611,0.412,0.218,0.46,-1.0,F,HEALTH EMERGENCY / INTERNATIONAL COVID-19 CORO...,8.0,Innovatorunit
2,1234291397202776065,248819900.0,covid,2020-03-02 01:36:15,0.408,0.505,0.422,0.218,0.417,-1.0,F,HEALTH EMERGENCY / INTERNATIONAL COVID-19 CORO...,23.0,Innovatorunit
3,1234292673260879872,447663400.0,corona,2020-03-02 01:41:20,0.448,0.566,0.419,0.267,0.415,-1.0,F,Corruption scaring corona virus from Africa &g...,7.0,Plugindave
5,1234294077564375040,1903820000.0,corona,2020-03-02 01:46:54,0.49,0.529,0.447,0.426,0.475,0.0,,Starting up with a cold and lightly concerned ...,41.0,AndyShawWT
6,1234294853330182144,20973390.0,covid,2020-03-02 01:49:59,0.406,0.613,0.458,0.271,0.46,-1.0,F,"The UK government is considering all options, ...",27.0,PatrickDoyle83


In [86]:
# Inspect last five elements
data_import_test.tail(5)

Unnamed: 0,tweet_ID,user_id,keywords,tweet_timestamp,valence_intensity,fear_intensity,anger_intensity,happiness_intensity,sadness_intensity,sentiment,emotion,tweet,len_content,username
733329,1285835856650043393,334091800.0,covid,2020-07-22 07:15:32,0.587,0.377,0.331,0.363,0.324,1.0,H,VDMA highlights textile machinery role in Covi...,9.0,cptntommy
733330,1285835987424337921,75508550.0,corona,2020-07-22 07:16:03,0.25,0.507,0.608,0.141,0.508,-2.0,A,@sidudeja But why would you want anyone dead? ...,24.0,YLALawyers
733331,1285836106496385025,1.096479e+18,covid,2020-07-22 07:16:32,0.325,0.537,0.504,0.186,0.483,-1.0,F,@LondonPehwb @grantshapps Absolutely. However...,49.0,jennigbradshaw
733332,1285836204559290369,162753400.0,covid,2020-07-22 07:16:55,0.359,0.563,0.556,0.287,0.438,-1.0,F,So I realised this morning I feel like a #Chal...,35.0,StephenGulli
733333,1285836317042135040,421047100.0,covid,2020-07-22 07:17:22,0.388,0.519,0.55,0.238,0.46,-1.0,A,"Israelis protest against Netanyahu, gov't hand...",17.0,hfxtransit
