# Twitter Data Extraction
Monthly data extraction from Twitter API guided by the following [plan](https://docs.google.com/document/d/1d_4WeDetmZUkk9JJUEWiqqZaBaFsxy1ZDFNiL0JVaok/edit?usp=sharing) | [ISP Selection Guidelines](https://docs.google.com/document/d/12n9hZNdCLmrIVfK05MCa1CUhEYoCR9Ib0fPnxQZql_E/edit?usp=sharing) | [Twitter API Operators](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query#limits) | 
Last updated Sept 14th, 2021

In [86]:
#Import relevant libraries
import tweepy
import pandas as pd
import numpy as np

### 1. SETTING UP & CONNECTING TO THE API

In [87]:
def authenticate():
    
    """
    Function to handle API connection, setup and authentication
    """
    
    #Import the twitter credentials stored in a separate file
    %run ./twitter_credentials
    
    #Create the authentication object
    auth = tweepy.OAuthHandler(api_key,api_secret_key)

    #Set the access token and access token secret
    auth.set_access_token(access_token,access_token_secret)

    #Create the API object
    api = tweepy.API(auth)  
    
    return api

### 2. SPECIFYING VARIABLES FOR THE DATA EXTRACTION

In [88]:
#Dev environment for the full archive endpoint (allows access to all tweets overtime)
dev_env = 'prod'

#THE BELOW SPECIFY HANDLES MANAGED BY THE ISPs or HANDLES THAT APPEAR TO TWEET BRAND PROMOTIONAL CONTENT
#TWEETS FROM THESE HANDLES WILL BE AVOIDED WHEN EXTRACTING
#Note: It is infeasible to address all cases. However, we would expect such tweets to be in the minority

#Spectranet ISP
spectranet_handles = ['-from:spectranet_NG','Spectr_net','SPECTRANETLTE','spectranet__NG']

#IPNX ISP
ipnx_handles = ['-from:ipNXTweet','IpnxSupport','iRecruite']

#Tizeti (Wifi.ng) ISP
tizeti_handles = ['-from:tizeti','wifisupport1']

#Cobranet ISP
cobranet_handles = ['-from:Cobranetisp']

### 2. Extracting the Tweets
#### Specifying the query & pulling from the API

In [89]:
def ISP_Tweet_Extractor(api,isp_name, from_date, to_date):
    
    """
    Function to extract tweets for a specified ISP during a specified time frame
    
    Inputs:
     - isp_name (str): Name of the ISP to extract tweets for
     - from_date (str): Earliest date (and time) of posting for any extracted tweet
     - to_date (str): Latest date (and time) of posting for any extracted tweet 
     
    Output:
     - 
     
    """
    
    #Connection to api
    api = api
    
    #### ------------------ VARIABLE REFORMATTING ------------------ ####
    #Assign ISP name to variable ensuring it is in lower case
    isp_name = isp_name.lower()
    
    #Reformat the fromDate to accepted API format (YYYYMMDDHHmm)
    from_date = from_date.replace('-','').replace(':','')
    
    #Reformat the fromDate to accepted API format (YYYYMMDDHHmm)
    to_date = to_date.replace('-','').replace(':','')
    
    #### ------------------ BUILDING THE API QUERY  ------------------ ####
    
    #Join the different handles to form the exclusion portion of the query
    excl_handles = ' -from:'.join(eval(isp_name +'_handles'))
    
    #Query for tweets in Lagos containing the ISP's name and exclude tweets 
    #from the official ISP Twitter handles
    
    #If the ISP is Tizeti, take into account that they are known by multiple names
    if isp_name == 'tizeti':
        
        api_query = f""" tizeti OR wifi.com.ng OR wifi.ng {excl_handles} 
                        -is:retweet point_radius:[5.53 3.54 140km]"""
    else:
        
        api_query = f"""{isp_name} {excl_handles} place:"Lagos,Nigeria" """
        
        
    #### ------------------ SEARCHING & EXTRACTING THE DATA ------------------ ####

    #Full archive search for ISP tweets
    ISP_tweets = api.search_full_archive(dev_env, api_query, fromDate = from_date, toDate= to_date)
    
    
    return ISP_tweets


f"{isp_name} {excl_handles} -is:retweet geocode:'5.53,3.54,140km'"

f""" tizeti OR wifi.com.ng OR wifi.ng {excl_handles} 
                    -is:retweet geocode:'5.53,3.54,140km'"""
                    
                    
point_radius:[5.53 3.54 140km]


#### Storing the tweets in a pandas dataframe

In [91]:
def tweets_to_df(api_result):
    
    """
    Function to extract relevant properties from an api result (tweets objects) and store 
    in a pandas dataframe
    """
    
    #Getting the relevant properties from the tweets and storing in a dictionary 
    tweets = [{'Time':tweet.created_at, 'Subject':isp_name, 'Text':tweet.text,
              'Coordinates':tweet.coordinates, 'Place': tweet.place, 'Source':tweet.source
              } for tweet in api_result]
    
    #Convert the dictionary to a pandas dataframe
    df = pd.DataFrame.from_dict(tweets)
    
    return df

#### Converting pandas df to csv file

In [92]:
def df_to_csv(df,isp_name,from_date,yearly_quarter):
    
    #Alphanumerics to lowercase
    isp_name = isp_name.lower()
    quarter = yearly_quarter.lower()
    
    #Extract year from date
    year = from_date[:4]
    
    #Convert to CSV to save current tweets obtained from the API
    df.to_csv(f"./data/{isp_name}_tweets_{quarter}_{year}.csv", index= False)

In [93]:
def main():
    
    #Connect and authenticate Twitter API
    api = authenticate()
    
    #Pass in parameters neeeded for API query
    isp_name = input('ISP Full Name:')
    from_date = input('Start Date (YYYY-MM-DD-HH:mm):')
    to_date = input('End Date (YYYY-MM-DD-HH:mm):')
    yearly_quarter = input('What quarter of the year? (q_):')
    
    #Pull the data from the API using the query and parameters
    api_results = ISP_Tweet_Extractor(api, isp_name, from_date, to_date)
    
    #Convert the API results into a pandas dataframe
    ISP_tweets = tweets_to_df(api_results)
    
    #Write to csv file
    df_to_csv(ISP_tweets,isp_name,from_date,yearly_quarter)

In [94]:
if __name__ == "__main__":
    main()

ISP Full Name:spectranet
Start Date (YYYY-MM-DD-HH:mm):2019-01-01-00:00
End Date (YYYY-MM-DD-HH:mm):2019-03-31-23:59
What quarter of the year? (q_):q1


In [111]:
k = pd.read_csv('./data/spectranet_tweets_q1_2019.csv')

In [112]:
k

Unnamed: 0,Time,Subject,Text,Coordinates,Place,Source
0,2019-03-31 19:13:28,spectranet,I didn't once check my spectranet data balance...,,Place(_api=<tweepy.api.API object at 0x7fb97f4...,Twitter for Android
1,2019-03-29 11:29:59,spectranet,"Eriq, i assume?\nFirst of all accept my humble...",,Place(_api=<tweepy.api.API object at 0x7fb97f4...,Twitter for iPhone
2,2019-03-29 07:46:55,spectranet,"Bought Spectranet mifi on tuesday, today is Fr...",,Place(_api=<tweepy.api.API object at 0x7fb97f4...,Twitter for Android
3,2019-03-28 22:52:09,spectranet,"Oh boy, @Spectranet_NG won't make me work. An...",,Place(_api=<tweepy.api.API object at 0x7fb97f4...,Twitter for Android
4,2019-03-28 19:13:22,spectranet,Please can you help me with the address of any...,,Place(_api=<tweepy.api.API object at 0x7fb97f4...,Twitter for iPhone
...,...,...,...,...,...,...
95,2019-01-06 10:50:04,spectranet,@OluwadamilolaOG @Spectranet_NG My second devi...,,Place(_api=<tweepy.api.API object at 0x7fb97f4...,Twitter for iPhone
96,2019-01-06 10:30:37,spectranet,Hello @Spectranet_NG what does the online mean...,,Place(_api=<tweepy.api.API object at 0x7fb97f4...,Twitter for iPhone
97,2019-01-05 17:37:37,spectranet,Left spectranet at home fml,,Place(_api=<tweepy.api.API object at 0x7fb97f4...,Twitter for iPhone
98,2019-01-05 07:50:32,spectranet,@ntelcare @NEXT_BILLIONAIR @ntelng Seems you’v...,,Place(_api=<tweepy.api.API object at 0x7fb97f4...,Twitter for iPhone


In [113]:
#Only two tweets with tagged coordinates 
k[k.Coordinates.isna() == False]

Unnamed: 0,Time,Subject,Text,Coordinates,Place,Source
24,2019-03-05 11:57:20,spectranet,Coming soon 🙌🙌\nSpectranet advertisement video...,"{'type': 'Point', 'coordinates': [3.39583, 6.4...",Place(_api=<tweepy.api.API object at 0x7fb97f4...,Instagram
48,2019-02-18 14:14:38,spectranet,#TuneIn if you be @Spectranet_NG user #tunein...,"{'type': 'Point', 'coordinates': [3.39583, 6.4...",Place(_api=<tweepy.api.API object at 0x7fb97f4...,Instagram


In [114]:
#The two tweets report being from the same location
k[k.Coordinates.isna() == False].iloc[0,3]  == k[k.Coordinates.isna() == False].iloc[1,3] 

True

Unfortunately I might have to expand and look at ISPs across the entire Lagos :(