# Twitter Data Extraction (Twitter API v2)
Monthly data extraction from Twitter API guided by the following [plan](https://docs.google.com/document/d/1d_4WeDetmZUkk9JJUEWiqqZaBaFsxy1ZDFNiL0JVaok/edit?usp=sharing) | [ISP Selection Guidelines](https://docs.google.com/document/d/12n9hZNdCLmrIVfK05MCa1CUhEYoCR9Ib0fPnxQZql_E/edit?usp=sharing) | [Twitter API Operators](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query#limits)

In [19]:
#Tweepy v4 not yet merged to master as of 28/09/2021 – Install directly from Github
!pip install git+https://github.com/tweepy/tweepy.git

Collecting git+https://github.com/tweepy/tweepy.git
  Cloning https://github.com/tweepy/tweepy.git to /private/var/folders/j5/540q0bw12gx3g56qg4llbmlh0000gn/T/pip-req-build-jjxw37fx
  Running command git clone -q https://github.com/tweepy/tweepy.git /private/var/folders/j5/540q0bw12gx3g56qg4llbmlh0000gn/T/pip-req-build-jjxw37fx
  Resolved https://github.com/tweepy/tweepy.git to commit 277a739863f099be084f084d6f064712401d9579


In [27]:
!pip install tweepy==4.0.0



In [21]:
#Import relevant libraries
import tweepy
import pandas as pd
import numpy as np
import time
from datetime import datetime, timedelta

In [33]:
import tweepy

In [34]:
tweepy.Client()

AttributeError: module 'tweepy' has no attribute 'Client'

In [32]:
!pip freeze

absl-py==0.11.0
alabaster==0.7.12
anaconda-client==1.7.2
anaconda-navigator==1.9.12
anaconda-project==0.8.3
applaunchservices==0.2.1
appnope==0.1.0
appscript==1.0.1
argh==0.26.2
arrow==1.1.1
asn1crypto==1.3.0
astroid==2.4.2
astropy==4.0
astunparse==1.6.3
atomicwrites==1.3.0
attrs==19.3.0
Automat==20.2.0
autopep8==1.4.4
Babel==2.8.0
backcall==0.1.0
backports.functools-lru-cache==1.6.1
backports.shutil-get-terminal-size==1.0.0
backports.tempfile==1.0
backports.weakref==1.0.post1
beautifulsoup4==4.8.2
binaryornot==0.4.4
bitarray==1.2.1
bkcharts==0.2
bleach==3.1.0
bokeh==1.4.0
boto==2.49.0
Bottleneck==1.3.2
Brotli==1.0.9
cachetools==4.2.0
certifi==2019.11.28
cffi==1.14.0
chardet==3.0.4
charset-normalizer==2.0.5
Click==7.0
cloudpickle==1.3.0
clyent==1.2.2
colorama==0.4.3
conda==4.9.2
conda-build==3.18.11
conda-package-handling==1.6.0
conda-verify==3.4.2
constantly==15.1.0
contextlib2==0.6.0.post1
cookiecutter==1.7.3
cryptography==2.8
csssele

In [30]:
!git clone https://github.com/tweepy/tweepy.git
!cd tweepy
!pip install .

Cloning into 'tweepy'...
remote: Enumerating objects: 11839, done.[K
remote: Counting objects: 100% (2282/2282), done.[K
remote: Compressing objects: 100% (722/722), done.[K
remote: Total 11839 (delta 1698), reused 2131 (delta 1551), pack-reused 9557[K
Receiving objects: 100% (11839/11839), 12.27 MiB | 1.88 MiB/s, done.
Resolving deltas: 100% (8299/8299), done.
[31mERROR: Directory '.' is not installable. Neither 'setup.py' nor 'pyproject.toml' found.[0m


In [6]:
#Import the twitter credentials stored in a separate file
#%run ../src/credentials/twitter_credentials
%run ../src/credentials/alt_twitter_credentials

#Create the authentication object
auth = tweepy.OAuthHandler(api_key,api_secret_key)

#Set the access token and access token secret
auth.set_access_token(access_token,access_token_secret)

#Create the API object
api = tweepy.API(auth)

In [11]:
tweepy.



In [10]:
tweepy.Client

AttributeError: module 'tweepy' has no attribute 'Client'

### 1. SETTING UP & CONNECTING TO THE API

In [3]:
def authenticate():
    
    """
    Function to handle API connection, setup and authentication
    """
    
    #Import the twitter credentials stored in a separate file
    %run ../src/credentials/alt_twitter_credentials
    
    #Create the authentication object
    auth = tweepy.OAuthHandler(api_key,api_secret_key)

    #Set the access token and access token secret
    auth.set_access_token(access_token,access_token_secret)

    #Create the API object
    api = tweepy.API(auth)  
    
    return api

### 2. SPECIFYING VARIABLES FOR THE DATA EXTRACTION

In [3]:
#Dev environment for the full archive endpoint (allows access to all tweets overtime)
dev_env = 'prod'

#THE BELOW SPECIFY HANDLES MANAGED BY THE ISPs or HANDLES THAT APPEAR TO TWEET BRAND PROMOTIONAL CONTENT
#TWEETS FROM THESE HANDLES WILL BE AVOIDED WHEN EXTRACTING
#Note: It is infeasible to address all cases. However, we would expect such tweets to be in the minority

#Spectranet ISP
spectranet_handles = ['-from:spectranet_NG','Spectr_net','SPECTRANETLTE','spectranet__NG']

#IPNX ISP
ipnx_handles = ['-from:ipNXTweet','IpnxSupport','iRecruite']

#Tizeti (Wifi.ng) ISP
tizeti_handles = ['-from:tizeti','wifisupport1']

#Cobranet ISP
cobranet_handles = ['-from:Cobranetisp']

### 2. Extracting the Tweets

#### Splitting the yearly quarters from which data will be extracted into subintervals

In [4]:
def date_range(start, end, intv):
    
    """
    Split Date Range into Roughly Equal Sub Intervals. Adapted from StackOverflow answer by Abhijit(2015)
    Retrieved from https://stackoverflow.com/questions/29721228
    
    Inputs
        - start (str): The start date of the time period
        - end (str): The end date of the time period
        - intv (int): Interval size (i.e. split the duration into roughly 'intv' equal subintervals)
        
    Outputs
        - Generator object containing the subinterval dates
    
    """
    
    #Convert start date to datetime object
    start = datetime.strptime(start,"%Y%m%d")
    
    #Convert end date to datetime object
    end = datetime.strptime(end,"%Y%m%d")
    
    #Find the roughly equal subinterval length
    diff = (end  - start ) / intv
    
    #Compute the subinterval dates and yield as string
    for i in range(intv):
        
        #After the first sub interval, start intervals from the day after the last interval's end day
        if i > 1:
            yield (start + diff * (i-1) + timedelta(1)).strftime("%Y%m%d")
        yield (start + diff * i).strftime("%Y%m%d")
        
    #Compute the last interval
    yield (start + diff * (intv-1) + timedelta(1)).strftime("%Y%m%d")
    yield end.strftime("%Y%m%d")

#### Specifying the query & pulling from the API

In [38]:
def ISP_Tweet_Extractor(api,isp_name, from_date, to_date):
    
    """
    Function to extract tweets for a specified ISP during a specified time frame
    
    Inputs:
     - isp_name (str): Name of the ISP to extract tweets for
     - from_date (str): Earliest date (and time) of posting for any extracted tweet
     - to_date (str): Latest date (and time) of posting for any extracted tweet 
     
    Output:
     - subintv_ISP_tweets (list): List containing API results for yearly quarter subintervals
     
    """
    
    #Connection to api
    api = api
    
    #### ------------------ VARIABLE REFORMATTING ------------------ ####
    #Assign ISP name to variable ensuring it is in lower case
    isp_name = isp_name.lower()
    
    #Reformat the fromDate to YYYYMMDD format
    from_date = from_date.replace('-','')
    
    #Reformat the fromDate to YYYYMMDD format
    to_date = to_date.replace('-','')
    
    #Split the quarter (from_date - to_date) to 5 equal subintervals
    #*100 tweets will be extracted from each subinterval
    intv_dates = [*date_range(from_date, to_date, 5)]
    
    #Get the subinterval date pairs
    date_pairs = [(intv_dates[idx],intv_dates[idx+1]) for idx in range(0,len(intv_dates),2)]
    
    
    #### ------------------ BUILDING THE API QUERY  ------------------ ####
    
    #Join the different handles to form the exclusion portion of the query
    excl_handles = ' -from:'.join(eval(isp_name +'_handles'))
    
    #Query for tweets in Lagos containing the ISP's name and exclude tweets 
    #from the official ISP Twitter handles
    
    #If the ISP is Tizeti, take into account that they are known by multiple names
    if isp_name == 'tizeti':
        
        api_query = f""" tizeti OR wifi.com.ng OR wifi.ng {excl_handles} place:"Lagos,Nigeria" """
    
    else:
        api_query = f"""{isp_name} {excl_handles} place:"Lagos,Nigeria" """
        
        
    #### ------------------ SEARCHING & EXTRACTING THE DATA ------------------ ####
    
    #List to store the subinterval API responses
    subintv_ISP_tweets = []
    
    #For each subinterval
    for start,end in date_pairs:
        
        #Add time to the dates to fit with Twitter API format, 
        start = start + '0000' #midnight
        end = end + '2359' #just before crossing into the next day
        
        #Trying running the query
        try:
            #Full archive search for ISP tweets
            ISP_tweets = api.search_full_archive(dev_env, api_query, fromDate = start, toDate= end)
        
        #If it fails, print the exception raised and the subinterval in question, but continue
        except Exception as e:
            
            print(e,'\n')
            print(f'Subinterval associated with error: [{start},{end}]')
            continue
            
        #Add the subinterval API response to the list
        subintv_ISP_tweets.append(ISP_tweets)
    
    
    return subintv_ISP_tweets


#### Storing the tweets in a pandas dataframe

In [31]:
def tweets_to_df(api_result_list):
    
    """
    Function to extract relevant properties from api results (tweets objects) and store 
    in a pandas dataframe
    
    Input(s):
        - api_result_list (list): List containing API results for a yearly quarter's subintervals
        
    Output(s):
        - main_df (DataFrame): Pandas DataFrame of tweets (and their properties) from the yearly quarter
    
    """
    
    #Empty dataframe to compile data from all yearly quarter subintervals
    main_df = pd.DataFrame()
    
    #Iterate through all the subinterval api results
    for api_result in api_result_list:
        
        #Getting the relevant properties from the tweets and storing in a dictionary 
        tweets = [{'Time':tweet.created_at, 'Text':tweet.text, 'Coordinates':tweet.coordinates, 
                   'Place': tweet.place, 'Source':tweet.source} for tweet in api_result]
    
        #Convert the dictionary to a pandas dataframe
        df = pd.DataFrame.from_dict(tweets)
        
        #Append the pandas df for the API result to the main df
        main_df = main_df.append(df)
        
    
    return main_df

#### Converting pandas df to csv file

In [32]:
def df_to_csv(df,isp_name,from_date,yearly_quarter):
    
    #Alphanumerics to lowercase
    isp_name = isp_name.lower()
    quarter = yearly_quarter.lower()
    
    #Extract year from date
    year = from_date[:4]
    
    #Convert to CSV to save current tweets obtained from the API
    df.to_csv(f"../data/raw/{isp_name}/{isp_name}_tweets_{quarter}_{year}.csv", index= False)

In [None]:
def main(isp_name=None, from_date=None, to_date=None, yearly_quarter=None, interactive=False):
    
    if interactive:
        #Pass in parameters neeeded for API query
        isp_name = input('ISP Full Name:')
        from_date = input('Start Date (YYYY-MM-DD):')
        to_date = input('End Date (YYYY-MM-DD):')
        yearly_quarter = input('What quarter of the year? (q_):')
        
    else:
        if any(x is None for x in [isp_name,from_date,to_date,yearly_quarter]):
            
            raise ValueError('Please ensure a valid value is passed for all the parameters')
    
    #Connect and authenticate Twitter API
    api = authenticate()
    
    #Pull the data from the API using the query and parameters
    api_results = ISP_Tweet_Extractor(api, isp_name, from_date, to_date)
    
    #Convert the API results into a pandas dataframe
    ISP_tweets = tweets_to_df(api_results)
    
    #Write to csv file
    df_to_csv(ISP_tweets,isp_name,from_date,yearly_quarter)

quarterly_dates_2019 = [('2019-01-01','2019-03-31','q1'),('2019-04-01','2019-06-30','q2'),
                        ('2019-07-01','2019-09-30','q3'),('2019-10-01','2019-12-31','q4')]

quarterly_dates_2020 = [('2020-01-01','2020-03-31','q1'),('2020-04-01','2020-06-30','q2'),
                        ('2020-07-01','2020-09-30','q3'),('2020-10-01','2020-12-31','q4')]

for start, end, quarter in quarterly_dates_2019:
    
    if __name__ == "__main__":
        main('ipnx',start,end,quarter)
    time.sleep(1)


In [37]:
if __name__ == "__main__":
    main(interactive=True)

ISP Full Name:tizeti
Start Date (YYYY-MM-DD):2019-04-01
End Date (YYYY-MM-DD):2019-06-30
What quarter of the year? (q_):q2
{'message': "There were errors processing your request: Reference to invalid operator 'is:retweet'. Operator is not available in current product or product packaging. Please refer to complete available operator list at http://t.co/operators. (at position 70)", 'sent': '2021-09-28T09:21:42+00:00', 'transactionId': '9bbfe9465a88f62f'} 

Subinterval associated with error: [201904010000,201904192359]
{'message': "There were errors processing your request: Reference to invalid operator 'is:retweet'. Operator is not available in current product or product packaging. Please refer to complete available operator list at http://t.co/operators. (at position 70)", 'sent': '2021-09-28T09:21:42+00:00', 'transactionId': '798fb2a340ac3f3f'} 

Subinterval associated with error: [201904200000,201905072359]
{'message': "There were errors processing your request: Reference to invalid 

In [39]:
k = pd.read_csv('../data/raw/spectranet/spectranet_tweets_q1_2019.csv')

In [40]:
k

Unnamed: 0,Time,Text,Coordinates,Place,Source
0,2019-01-18 22:22:16,@UMEHoma @Spectranet_NG @CPCNig @SANNigeria @F...,,Place(_api=<tweepy.api.API object at 0x7f96f2f...,Twitter for iPhone
1,2019-01-18 08:53:02,After making me use my last 10k to renew my ac...,,Place(_api=<tweepy.api.API object at 0x7f96f2f...,Twitter for iPhone
2,2019-01-17 23:09:32,@Spectranet_NG I can see the bonus data on the...,,Place(_api=<tweepy.api.API object at 0x7f96f2f...,Twitter for iPhone
3,2019-01-17 23:05:19,@Spectranet_NG Renewed and didn't get the bonu...,,Place(_api=<tweepy.api.API object at 0x7f96f2f...,Twitter for iPhone
4,2019-01-17 16:36:17,This Spectranet is so shit I’m gonna so regret...,,Place(_api=<tweepy.api.API object at 0x7f96f2f...,Twitter for iPhone
...,...,...,...,...,...
102,2019-03-23 14:31:00,"@commando_skiipz He might be using spectranet,...",,Place(_api=<tweepy.api.API object at 0x7f96f2f...,Twitter for iPhone
103,2019-03-21 05:06:54,😂😂😂I used to do that. But I can't kill myself....,,Place(_api=<tweepy.api.API object at 0x7f96f2f...,Twitter for Android
104,2019-03-20 11:37:01,@Spectranet_NG kilo sele gan gan? 24500 naira ...,,Place(_api=<tweepy.api.API object at 0x7f96f2f...,Twitter for iPhone
105,2019-03-15 21:29:06,@dejokecarew I don’t think spectranet is trash .,,Place(_api=<tweepy.api.API object at 0x7f96f2f...,Twitter for iPhone


In [41]:
for time,tweet in k[['Time','Text']].values:
    print(time,':',tweet,'\n')

2019-01-18 22:22:16 : @UMEHoma @Spectranet_NG @CPCNig @SANNigeria @FCC And those their representatives will be calling somebody using sweet voice 🤦‍♂️🤦‍♂️ 

2019-01-18 08:53:02 : After making me use my last 10k to renew my account on their network, there's still no @Spectranet_NG service in my… https://t.co/OsAvwqmfBO 

2019-01-17 23:09:32 : @Spectranet_NG I can see the bonus data on the web but not on the mobile app. Why? 

2019-01-17 23:05:19 : @Spectranet_NG Renewed and didn't get the bonus???? 

2019-01-17 16:36:17 : This Spectranet is so shit I’m gonna so regret buying. Very shitty service and I thought Smile was worse 🤦🏽‍♂️ 

2019-01-17 15:20:37 : @Spectranet_NG cydm 

2019-01-17 13:18:28 : Hey guys, Swift 4g vs Spectranet. Which would you rather please? 

2019-01-14 11:43:00 : @Spectranet_NG Hi, A spectranet Engineer came to my apartment on sat with his laptop and device. He confirmed bad s… https://t.co/6hbeKlScNO 

2019-01-13 15:57:15 : @Spectranet_NG #dontusespectranet https:

In [30]:
k.Time.sort_values().unique()

array(['2019-01-01 16:01:23', '2019-01-01 16:44:02',
       '2019-01-01 16:45:01', '2019-01-01 16:45:56',
       '2019-01-01 16:50:43', '2019-01-02 08:43:10',
       '2019-01-04 14:44:03', '2019-01-04 16:58:01',
       '2019-01-05 07:50:32', '2019-01-05 17:37:37',
       '2019-01-06 10:30:37', '2019-01-06 10:50:04',
       '2019-01-08 11:33:26', '2019-01-08 20:17:00',
       '2019-01-09 21:13:45', '2019-01-10 16:13:29',
       '2019-01-11 07:59:24', '2019-01-11 10:33:44',
       '2019-01-11 11:17:34', '2019-01-13 07:37:46',
       '2019-01-13 15:57:15', '2019-01-14 11:43:00',
       '2019-01-17 13:18:28', '2019-01-17 15:20:37',
       '2019-01-17 16:36:17', '2019-01-17 23:05:19',
       '2019-01-17 23:09:32', '2019-01-18 08:53:02',
       '2019-01-18 22:22:16', '2019-01-19 13:13:44',
       '2019-01-19 13:40:54', '2019-01-19 22:58:24',
       '2019-01-20 03:13:05', '2019-01-20 10:42:10',
       '2019-01-20 18:40:07', '2019-01-20 20:36:12',
       '2019-01-21 20:47:37', '2019-01-22 21:4

In [113]:
#Only two tweets with tagged coordinates 
k[k.Coordinates.isna() == False]

Unnamed: 0,Time,Subject,Text,Coordinates,Place,Source
24,2019-03-05 11:57:20,spectranet,Coming soon 🙌🙌\nSpectranet advertisement video...,"{'type': 'Point', 'coordinates': [3.39583, 6.4...",Place(_api=<tweepy.api.API object at 0x7fb97f4...,Instagram
48,2019-02-18 14:14:38,spectranet,#TuneIn if you be @Spectranet_NG user #tunein...,"{'type': 'Point', 'coordinates': [3.39583, 6.4...",Place(_api=<tweepy.api.API object at 0x7fb97f4...,Instagram


In [114]:
#The two tweets report being from the same location
k[k.Coordinates.isna() == False].iloc[0,3]  == k[k.Coordinates.isna() == False].iloc[1,3] 

True

Unfortunately I might have to expand and look at ISPs across the entire Lagos :(

In [51]:
from datetime import datetime

for a,b in date_pairs:
    print(datetime.strptime(b,"%Y%m%d") - datetime.strptime(a,"%Y%m%d"))

14 days, 0:00:00
14 days, 0:00:00
14 days, 0:00:00
14 days, 0:00:00
14 days, 0:00:00
14 days, 0:00:00


In [48]:
for idx in range(0,len(k),2):
    print(idx)

0
2
4
6
8
10


In [None]:
['20190101', '20190118', '20190205', '20190222', '20190312', '20190330']

In [None]:
['20190101','20190118']
['20190119','20190205']
['20190206','20190222']
['20190223','20190312']
['20190313','20190330']

In [10]:
from datetime import datetime
datetime.strptime('20190331',"%Y%m%d") - datetime.strptime('20190314',"%Y%m%d")

TypeError: unsupported operand type(s) for +: 'datetime.datetime' and 'int'

In [43]:
import pandas as pd
import datetime as dt

begin = '20100101'
end = '2010331'

start = dt.datetime.strptime(begin, '%Y%m%d')
finish = dt.datetime.strptime(end, '%Y%m%d')

dates = pd.DatetimeIndex(start=start, end=finish, freq='D').tolist()
quarters = [d.to_period('Q') for d in dates]
df = pd.DataFrame([quarters, dates], index=['Quarter', 'Date']).T

quarterly_dates = {str(q): [ts.strftime('%Y%m%d') 
                            for ts in df[df.Quarter == q].Date.values.tolist()]
                           for q in quarters}

TypeError: __new__() got an unexpected keyword argument 'start'

for tweet in tweets:
    if tweet.truncated:
        print(tweet.extended_tweet['full_text'])
    else:
        print(tweet.text)

https://github.com/tweepy/tweepy/issues/1461