# Data Collection

## Install required packages for data collection

In [None]:
# install Twitter and Tweepy scrapping package
!pip install --upgrade pip
!pip install twitterscraper
!pip install tweepy

## Import required libraries

In [None]:
# import libraries for scraping tweets and working with dataframes

#!/usr/bin/env python
from __future__ import print_function

#Import the necessary methods from tweepy library
import tweepy
from tweepy import OAuthHandler

from twitterscraper import query_tweets
import datetime as dt
import pandas as pd

## Set the parameters and keywords for web scraping

In [None]:
# Date window for the scrape; both values are passed to query_tweets as
# begindate / enddate.
begin_date = dt.date(2020, 3, 10)
end_date = dt.date(2020, 7, 25)

# Language filter for the scrape. twitterscraper expects a language *code*
# (e.g. 'en' for English), not the language name.
lang = 'en'

# Search queries: each pairs a pandemic term (covid / corona) with a
# remote-learning term (e-learning / online learning / distance learning).
covid_elearning_keywords = "covid AND e-learning"
covid_online_keywords = "covid AND online learning"
corona_elearning_keywords = "corona AND e-learning"
corona_online_keywords = "corona AND online learning"
covid_distance_keywords = "covid AND distance learning"
corona_distance_keywords = "corona AND distance learning"

## Scraping Process

In [None]:
%time
covid_africa_tweets = query_tweets(covid_africa_keywords, begindate = begin_date, enddate = end_date, limit = None, lang = lang)


In [None]:
# scrape data from Twitter 
%time
covid_elearning_tweets = query_tweets(covid_elearning_keywords, begindate = begin_date, enddate = end_date, limit = None, lang = lang)


In [None]:
%time
covid_online_tweets = query_tweets(covid_online_keywords, begindate = begin_date, enddate = end_date, limit = None, lang = lang)

In [None]:
%time
corona_elearning_tweets = query_tweets(corona_elearning_keywords, begindate = begin_date, enddate = end_date, limit = None, lang = lang)

In [None]:
%time
corona_online_tweets = query_tweets(corona_online_keywords, begindate = begin_date, enddate = end_date, limit = None, lang = lang)

In [None]:
%time
covid_distance_tweets = query_tweets(covid_distance_keywords, begindate = begin_date, enddate = end_date, limit = None, lang = lang)

In [None]:
%time
corona_distance_tweets = query_tweets(corona_distance_keywords, begindate = begin_date, enddate = end_date, limit = None, lang = lang)

## Save scraped data to dataframes

In [None]:
# Turn each list of scraped tweet objects into a dataframe: every tweet
# contributes one row, built from the tweet object's attribute dictionary.
covid_elearning_df = pd.DataFrame([vars(tweet) for tweet in covid_elearning_tweets])
covid_online_df = pd.DataFrame([vars(tweet) for tweet in covid_online_tweets])
corona_elearning_df = pd.DataFrame([vars(tweet) for tweet in corona_elearning_tweets])
corona_online_df = pd.DataFrame([vars(tweet) for tweet in corona_online_tweets])
covid_distance_df = pd.DataFrame([vars(tweet) for tweet in covid_distance_tweets])
corona_distance_df = pd.DataFrame([vars(tweet) for tweet in corona_distance_tweets])



## Review Scraped Data

In [None]:
# Preview the first rows of the "covid AND e-learning" results.
covid_elearning_df.head()

In [None]:
# info() prints the column dtypes and non-null counts; describe() is the
# cell's last expression, so its summary table is what the notebook displays.
covid_elearning_df.info()
covid_elearning_df.describe()

In [None]:
# Preview the first rows of the "covid AND online learning" results.
covid_online_df.head()

In [None]:
# Column overview (printed) followed by summary statistics (displayed).
covid_online_df.info()
covid_online_df.describe()

In [None]:
# Preview the first rows of the "corona AND e-learning" results.
corona_elearning_df.head()

In [None]:
# Column overview (printed) followed by summary statistics (displayed).
corona_elearning_df.info()
corona_elearning_df.describe()

In [None]:
# Preview the first rows of the "corona AND online learning" results.
corona_online_df.head()

In [None]:
# Column overview (printed) followed by summary statistics (displayed).
corona_online_df.info()
corona_online_df.describe()

In [None]:
# Preview the first rows of the "covid AND distance learning" results.
covid_distance_df.head()

In [None]:
# Column overview (printed) followed by summary statistics (displayed).
covid_distance_df.info()
covid_distance_df.describe()

In [None]:
# Preview the first rows of the "corona AND distance learning" results.
corona_distance_df.head()

In [None]:
# Column overview (printed) followed by summary statistics (displayed).
corona_distance_df.info()
corona_distance_df.describe()

In [None]:
# Persist every scraped dataframe as a pickle file in the working directory,
# so the (slow) scrape does not have to be repeated.
for _frame, _pickle_name in (
    (covid_elearning_df, "covid_elearning_tweets.pkl"),
    (covid_online_df, "covid_online_tweets.pkl"),
    (corona_elearning_df, "corona_elearning_tweets.pkl"),
    (corona_online_df, "corona_online_tweets.pkl"),
    (covid_distance_df, "covid_distance_tweets.pkl"),
    (corona_distance_df, "corona_distance_tweets.pkl"),
):
    _frame.to_pickle(_pickle_name)

In [None]:
# Load the pickled dataframes back from the working directory.
# The file names must match the ones passed to to_pickle; a leading backslash
# in a name would make the path point somewhere else (the drive root on
# Windows) and the load would fail with FileNotFoundError.
df_corona_distance_tweets = pd.read_pickle(r"corona_distance_tweets.pkl")
df_corona_elearning_tweets = pd.read_pickle(r"corona_elearning_tweets.pkl")
df_corona_online_tweets = pd.read_pickle(r"corona_online_tweets.pkl")
df_covid_distance_tweets = pd.read_pickle(r"covid_distance_tweets.pkl")
df_covid_elearning_tweets = pd.read_pickle(r"covid_elearning_tweets.pkl")
df_covid_online_tweets = pd.read_pickle(r"covid_online_tweets.pkl")

In [None]:
# Stack the six per-query dataframes row-wise into a single dataframe.
df_list = [
    df_corona_distance_tweets,
    df_corona_elearning_tweets,
    df_corona_online_tweets,
    df_covid_distance_tweets,
    df_covid_elearning_tweets,
    df_covid_online_tweets,
]
df = pd.concat(objs=df_list)

In [None]:
# The same tweet can match several queries: keep only the first row for each
# distinct tweet text.
df = df.drop_duplicates(subset="text", keep="first")

In [None]:
# Export the de-duplicated tweets to an Excel workbook in the working directory.
df.to_excel(excel_writer=r"tweets.xlsx")

In [None]:
# Build a dataframe of screen names, used later to look up user locations.
# reset_index keeps the previous row labels in an 'index' column.
screenname = df['screen_name'].reset_index(drop=False)

In [None]:
# Keep one row per screen name so each user is looked up only once.
screenname = screenname.drop_duplicates(subset='screen_name', keep='first')

In [None]:
# Remove the leftover 'index' column. drop() returns a new dataframe, so the
# result has to be assigned back or the column is never actually removed.
# errors='ignore' keeps the cell safe to re-run once the column is gone.
screenname = screenname.drop(columns='index', errors='ignore')
screenname

In [None]:
# Pull the unique screen names out as a plain Python list for the API loop.
name = screenname['screen_name']
name_list = name.tolist()

## Scrape user locations for the tweet data

In [None]:
# Look up the profile location of every tweet author.

# Twitter API credentials (placeholders: replace with the values of your own
# Twitter app before running, and do not commit real secrets).
consumer_key = "xxxxxxxxxx"
consumer_secret = "xxxxxxxxxx"
access_token = "xxxxxxxxxx"
access_token_secret = "xxxxxxxxxx"

# Collected locations, filled in the same order as name_list.
location = []

def get_user_details(username):
    """Fetch the Twitter user object for a screen name.

    Relies on the module-level ``api`` client, which must have been created
    before this function is called.

    Args:
        username: Twitter screen name (handle) of the user to look up.

    Returns:
        The user object returned by ``api.get_user``.

    Raises:
        Whatever ``api.get_user`` raises is propagated unchanged; the caller
        in this notebook catches ``tweepy.TweepError``.
    """
    # Pass the handle explicitly as ``screen_name``: a positional argument is
    # sent as the ambiguous ``id`` parameter, so an all-digit handle would be
    # looked up as a numeric user ID instead of a screen name.
    return api.get_user(screen_name=username)


if __name__ == '__main__':
    # Authenticate the app (credentials are issued at https://apps.twitter.com/).
    auth = tweepy.auth.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    # Wait out rate-limit windows (printing a notice) instead of failing.
    api = tweepy.API(
        auth,
        wait_on_rate_limit=True,
        wait_on_rate_limit_notify=True,
    )

    # Append exactly one entry per screen name so that `location` stays
    # aligned with `name_list`; a failed lookup is recorded as ''.
    for name in name_list:
        try:
            userOBJ = get_user_details(name)
        except tweepy.TweepError:
            location.append('')
        else:
            location.append(userOBJ.location)

In [None]:
# Pair every screen name with the location found for it.
combine = [*zip(name_list, location)]

In [None]:
# One row per user: screen name plus the profile location (may be empty).
name_location = pd.DataFrame(data=combine, columns=['screen_name', 'location'])

In [None]:
# Export the screen-name/location table to an Excel workbook.
name_location.to_excel(excel_writer="location.xlsx")