# **Twitter Scrapping**

In [1]:
!pip install -q snscrape==0.3.4

In [2]:
import os
import pandas as pd
import numpy as np
from datetime import date
import itertools
import snscrape.modules.twitter as sntwitter

#### **Creating a dataframe containing information of British Columbia cities**

In [3]:
bc_cities = pd.read_html('https://en.wikipedia.org/wiki/List_of_cities_in_British_Columbia')

BritishColumbia = pd.DataFrame(bc_cities[0])
BritishColumbia.drop(53,inplace=True)
BritishColumbia.head(3)

BritishColumbia['Name'].replace({'Vancouver[a]':'Vancouver','Victoria[b]':'Victoria'},inplace=True)

keyword = '(pizza OR covid)'

general_dict = {}

for idx,city in enumerate(BritishColumbia['Name']):
  txt = 'wellbeing near:"'+city+'" within:{}km'.format(np.sqrt(BritishColumbia['Area (km2)[5]'].iloc[idx]))
  general_dict[city] = pd.DataFrame(itertools.islice(sntwitter.TwitterSearchScraper(txt).get_items(), 10))

In [None]:
keyword = '(pizza OR covid OR USA)'    ###### If we want to scrape multiple keywords#######
key_words_wellbeing = ['welfare','health','wellbeing','mental illness','mental health','physical health','physical illness','psychiatrist','health issues']
search_key = '('
for el in key_words_wellbeing:
    search_key += el+' OR '

search_key = search_key[:-4]+')'
search_key

In [4]:
keyword = '(pizza OR covid OR USA)'    ###### If we want to scrapp multiple keywords#######


import time
from datetime import timedelta
start_time = time.monotonic()

my_dict = {}
for idx,city in enumerate(BritishColumbia['Name']):
  tweets = []
  tdf = None
  for i,tweet in enumerate(sntwitter.TwitterSearchScraper('(wellbeing OR health) near:"'+city+'" within:{}km'.format(np.sqrt(BritishColumbia['Area (km2)[5]'].iloc[idx]))).get_items()) :
    if i > 10000 :
              break
    text = tweet.content
    pubdate = tweet.date
    tweets.append({
      "date":pubdate,
      "content":text,
    })
    my_dict[city]=tweets
  
for key in my_dict.keys():
  my_dict[key] = pd.DataFrame(my_dict[key])

end_time = time.monotonic()
print('Runtime is:',timedelta(seconds=end_time - start_time))

Runtime is: 0:25:09.169407


In [13]:
for key in my_dict.keys():
  my_dict[key] = pd.DataFrame(my_dict[key])
  
for k,v in my_dict.items():
  print(k)
  print(len(v))

Abbotsford
673
Burnaby
1001
Campbell River
242
Castlegar
337
Chilliwack
961
Colwood
1001
Coquitlam
1001
Courtenay
767
Cranbrook
169
Dawson Creek
52
Delta
1001
Duncan
89
Enderby
1001
Fernie
61
Fort St. John
266
Grand Forks
695
Greenwood
91
Kamloops
1001
Kelowna
1001
Kimberley
293
Langford
1001
Langley
161
Maple Ridge
1001
Merritt
46
Mission
1001
Nanaimo
610
Nelson
53
New Westminster
1001
North Vancouver
1001
Parksville
98
Penticton
752
Pitt Meadows
1001
Port Alberni
257
Port Coquitlam
1001
Port Moody
1001
Powell River
94
Prince George
1001
Prince Rupert
62
Quesnel
366
Revelstoke
99
Richmond
1001
Rossland
30
Salmon Arm
135
Surrey
1001
Terrace
64
Trail
72
Vancouver
1001
Vernon
888
Victoria
1001
West Kelowna
1001
White Rock
313
Williams Lake
567


In [14]:
# Creating a new dictionary based on tweets after 2019:

my_dict_covid = {}

for key,val in my_dict.items():  
  my_dict_covid[key] = val[val['date']>'2019-01-01']

In [73]:
### Creating the dataframe to perform NLP:

date = []
content = []
city = []

for k,v in my_dict_covid.items():
  for dat in my_dict_covid[k].date:
    date.append(dat)
  for cnt in my_dict_covid[k].content:
    content.append(cnt)
    city.append(k)

nlp_df = pd.DataFrame(list(zip(date,content,city)),columns=['Date','Content','City'])

## Preposessing NLP dataframe:

In [74]:
nlp_df['Content'] = nlp_df['Content'].str.lower()

nlp_df['Content'].duplicated().sum() #There is a good chance that we have duplicated rows:
nlp_df.drop_duplicates(subset ="Content" ,inplace=True)

In [75]:
key_words_seniors = ['elderl','senior','old','aged','aging']

# Number of scrapping results that contain key_words_seniors elements:
nlp_df['Content'].apply(lambda x: any(item in x for item in key_words_seniors)).sum()



871

In [76]:
nlp_df.loc[nlp_df['Content'].apply(lambda x: any(item in x for item in key_words_seniors))]['Content']

5        @jaymeekitch8 @ctvnews agree except it needs t...
6        hey guys! did you know water buffalo milk is e...
9        @arthister c’mon dr. art. lighten up a little....
23       @johnwrightlive well said!! the virus is no lo...
37       @nnulk there many. icbc policy changes, improv...
                               ...                        
22723    all i told him was that the vaccine 💉 passport...
22744    @ketaminh you borrowed it for safe keeping. 😉\...
22767    also just a random post.i feel like our suppor...
22772    i told my dad i was annoyed that i only heard ...
22777    @cbsnews scott is in the pocket of the health ...
Name: Content, Length: 871, dtype: object

In [None]:
key_words_seniors = ['elderl','senior','old','aged','aging']

# Processing 
def list_rm(lst,item):
  lst.remove(item)
  return lst

idx = []
for word in key_words_seniors:
  list_rm(key_words_seniors,word)
  for line,content in enumerate(nlp_df['Content']):
    if word in content:
      if content[content.index(word)-1] != ' ' and  not any(item in content for item in key_words_seniors):
        print(line)
        #print(word)
        #print(content)
        #nlp_df.drop(line , axis=0 , inplace=True)
        idx.append(line)
  key_words_seniors.append(word)

nlp_df.drop(nlp_df.index[list(set(idx))],inplace=True)

In [92]:
nlp_df.shape

(15142, 3)

In [43]:
key_words_seniors

key_words_wellbeing = ['welfare','mental illness','mental health','physical health','physical illness','psychiatrist','health issues']

['senior', 'aged', 'elderl', 'aging', 'old']

In [97]:
key_words_wellbeing = ['mental illness','mental health','physical health','physical illness','psychiatrist','health issues']
search_key = '('
for el in key_words_wellbeing:
    search_key += el+' OR '
search_key = search_key[:-4]+')'
search_key

'(mental illness OR mental health OR physical health OR physical illness OR psychiatrist OR health issues)'

In [None]:
try:
  from google.colab import drive
  drive.mount('drive')

  nlp_df.to_csv('NLP_Data.csv')
  !cp NLP_Data.csv "drive/My Drive/"
except:
  ("It's not google colab!")

# Twitters within Vancouver area

In [12]:
"""loc_centre = '54.15, -126.54, 10km' #Coordinates of the center of british columbia

loc = '49.246292, -123.116226, 60km'
df_coord = pd.DataFrame(itertools.islice(sntwitter.TwitterSearchScraper(
    'wellbeing geocode:"{}"'.format(loc)).get_items(), 10000))[['date', 'content']]

# Extracting tweets after year 2019:
df_coord = df_coord[df_coord['date']>'2019-01-01']"""