# Twitter Data Extraction
**`30 day archive`**

Extracting analogous ISP data for an aspect-based sentiment analysis task. Here, analogous refers to ISPs (that do not double as telecom providers) from countries other than Nigeria, e.g. the US and Canada.

## 1. Import packages

In [1]:
import tweepy
import pandas as pd
from datetime import datetime

## 2. Setup and connect to Twitter API

In [15]:
#Load twitter developer credentials
#(IPython magic: runs the credentials script inside this notebook's namespace.
# The names api_key, api_secret_key, access_token and access_token_secret used
# below are presumably defined by that script – it is not shown here, so verify)
%run ../../src/credentials/twitter_credentials

#Create the authentication object
auth = tweepy.OAuthHandler(api_key,api_secret_key)

#Set the access token and access token secret
auth.set_access_token(access_token,access_token_secret)

#Create the API object
#(shared by every search cell further down in the notebook)
api = tweepy.API(auth) 

#Dev environment for the 30day archive endpoint
#(passed as the first argument of every api.search_30_day call below)
dev_env = 'extraction30days'

## 3. Define function to store tweets in pandas dataframe

In [43]:
def tweets_to_df(isp_name,api_result):
    
    """
    Function to extract relevant properties from api results (tweet objects) and store 
    them in a pandas dataframe
    
    Input(s):
        - isp_name (str): The name of the ISP; copied into the ISP_Name column of every row
        - api_result (iterable): Tweet objects returned by a single API search call. Each
          object must expose created_at, truncated, text, coordinates, place and source
          (and extended_tweet when truncated is true)
        
    Output(s):
        - df (DataFrame): One row per tweet with the columns ISP_Name, Time, Text,
          Coordinates, Place and Source. The columns are present even when api_result
          is empty, so callers can always de-duplicate on 'Text'
    
    """
    
    #Fixed column layout – guarantees the schema even for an empty result
    columns = ['ISP_Name','Time','Text','Coordinates','Place','Source']
    
    #List to store the tweets
    tweets = []
        
    #Iterate through all the tweets
    for tweet in api_result:

        #Get the tweet text – ensuring that the full text is gotten (if truncated).
        #Fall back to the short text if a truncated tweet carries no extended payload
        extended = getattr(tweet,'extended_tweet',None) if tweet.truncated else None
        if extended:
            text = extended['full_text']
        else:
            text = tweet.text

        #Store the tweet properties in the tweets list
        tweets.append({
            'ISP_Name': isp_name,
            'Time': tweet.created_at,
            'Text': text,
            #Coordinates and place are stored as-is (may be empty)
            'Coordinates': tweet.coordinates,
            'Place': tweet.place,
            #Source client (e.g. Android, iOS, Desktop)
            'Source': tweet.source,
        })
    
    #Convert the list of dictionaries to a pandas dataframe
    df = pd.DataFrame(tweets, columns=columns)
        
    return df

## 4. Setup and run a search query
**`Nov 27th checkpoint`**

## Nigerian ISPs

In [70]:
#Handles to exclude for each ISP. The first entry already carries the
#'-from:' operator; the join in the loop prefixes the remaining ones.

#Spectranet ISP
spectranet_handles = ['-from:spectranet_NG','Spectr_net','SPECTRANETLTE','spectranet__NG']

#IPNX ISP
ipnx_handles = ['-from:ipNXTweet','IpnxSupport','iRecruite']

#Tizeti (Wifi.ng) ISP
tizeti_handles = ['-from:tizeti','wifisupport1']

#Map each ISP name to its handles (explicit lookup instead of eval() on a variable name)
isp_handles = {
    'spectranet': spectranet_handles,
    'ipnx': ipnx_handles,
    'tizeti': tizeti_handles,
}

#Per-ISP dataframes; concatenated once after the loop
#(DataFrame.append was removed in pandas 2.0)
isp_frames = []

for isp_name, handles in isp_handles.items():
    
    #Get handles to exclude
    excl_handles = ' -from:'.join(handles)
    
    #Define query
    api_query = f""" {isp_name} {excl_handles} """
    
    #Extract tweets in two windows (timestamps written as YYYYMMDDhhmm).
    #Both windows include 13 Nov; the overlap is removed by the de-duplication below
    tweets = api.search_30_day(dev_env, api_query, fromDate = '202111130000')
    tweets2 = api.search_30_day(dev_env, api_query, fromDate = '202110300000', toDate = '202111132359')
    
    #Create pandas dataframes
    df1 = tweets_to_df(isp_name,tweets)
    df2 = tweets_to_df(isp_name,tweets2)
    
    #Merge and remove duplicates (rows sharing the same Text)
    merged_df = pd.concat([df1,df2]).drop_duplicates(subset=['Text'])
    
    #Keep this ISP's dataframe for the final concatenation
    isp_frames.append(merged_df)

#Dataframe containing all the Nigerian ISPs tweets
nigerian_isp_df = pd.concat(isp_frames)
    

In [72]:
#Preview the first rows of the combined Nigerian ISP tweets
nigerian_isp_df.head()

Unnamed: 0,ISP_Name,Time,Text,Coordinates,Place,Source
0,spectranet,2021-11-27 14:41:14+00:00,RT @iamrenike: The sexual tension between Spec...,,,Twitter for Android
1,spectranet,2021-11-27 13:59:19+00:00,Spectranet or Smile? Which is more reliable?,,,Twitter for iPhone
2,spectranet,2021-11-27 11:54:37+00:00,"Spectranet, and Glo Dey cook me seriously for ...",,,Twitter for iPhone
3,spectranet,2021-11-27 11:53:48+00:00,Spectranet offer State-of-the-art dedicated li...,,,Twitter Web App
4,spectranet,2021-11-27 11:17:04+00:00,@Rhanty - Lmao make I run the PlayStation plus...,,,Twitter for iPhone


## Foreign ISPs

### a. Mediacom

In [53]:
#Search query for the US ISP Mediacom: the keyword "mediacom" followed by
#-from: operators for the three accounts listed in the string
mediacom_query = " mediacom -from:MediaComGlobal -from:MediaComUS -from:MediacomSupport "

In [29]:
#Pull the tweets in two windows (timestamps written as YYYYMMDDhhmm):
#  - mediacom_tweets:  from 13 Nov 2021 00:00 onwards (no toDate given)
#  - mediacom_tweets2: from 30 Oct 2021 00:00 to 13 Nov 2021 23:59
#Both windows include 13 Nov, so the same tweet may appear in both results
mediacom_tweets = api.search_30_day(dev_env, mediacom_query, fromDate = '202111130000')
mediacom_tweets2 = api.search_30_day(dev_env, mediacom_query, fromDate = '202110300000', toDate = '202111132359')

In [45]:
#Create pandas dataframe for the first window (13 Nov onwards)
mediacom_df1 = tweets_to_df('mediacom',mediacom_tweets)

#Create pandas dataframe for the second window (30 Oct – 13 Nov)
mediacom_df2 = tweets_to_df('mediacom',mediacom_tweets2)

#### Visualize the dataframes

In [48]:
#Preview the first rows of the first Mediacom window
mediacom_df1.head()

Unnamed: 0,ISP_Name,Time,Text,Coordinates,Place,Source
0,mediacom,2021-11-27 16:35:17+00:00,RT @wciu: Don't miss out today on @fenwickfria...,,,Twitter for Android
1,mediacom,2021-11-27 16:23:13+00:00,@TroyBanning @MediacomCable We've had YouTube ...,,,Twitter for Android
2,mediacom,2021-11-27 15:50:00+00:00,Don't miss out today on @fenwickfriars vs. #Ka...,,,TweetDeck
3,mediacom,2021-11-27 15:45:33+00:00,So now instead of 8-9 apparently it’s closer t...,,,Twitter for iPhone
4,mediacom,2021-11-27 15:42:25+00:00,RT @wciu: Calling all #IHSA football fans! Don...,,,Twitter for iPhone


In [54]:
#Preview the first rows of the second Mediacom window
mediacom_df2.head()

Unnamed: 0,ISP_Name,Time,Text,Coordinates,Place,Source
0,mediacom,2021-11-13 23:58:30+00:00,RT @brianneDMR: In a hypothetical 2024 rematch...,,,Twitter for iPad
1,mediacom,2021-11-13 23:57:32+00:00,RT @brianneDMR: In a hypothetical 2024 rematch...,,,Twitter for iPhone
2,mediacom,2021-11-13 23:56:33+00:00,RT @brianneDMR: In a hypothetical 2024 rematch...,,,Twitter Web App
3,mediacom,2021-11-13 23:55:23+00:00,RT @brianneDMR: In a hypothetical 2024 rematch...,,,Twitter Web App
4,mediacom,2021-11-13 23:54:37+00:00,RT @brianneDMR: In a hypothetical 2024 rematch...,,,Twitter Web App


#### Fuse the dataframes together and drop duplicates

In [55]:
#Stack both windows and keep one row per distinct Text
#(this removes tweets returned by both windows as well as repeated retweet text)
mediacom_merged = pd.concat([mediacom_df1,mediacom_df2]).drop_duplicates(subset=['Text'])

In [56]:
#Check the (rows, columns) of the merged Mediacom dataframe after de-duplication
mediacom_merged.shape

(142, 6)

---

### b. HughesNet 

In [59]:
#Search query for the US ISP HughesNet: the keyword "HughesNet" followed by
#-from: operators for the three accounts listed in the string
hughesnet_query = " HughesNet -from:HughesNet -from:HughessNet -from:AskHughes "

In [60]:
#Pull the tweets in two windows (timestamps written as YYYYMMDDhhmm):
#  - hughesnet_tweets:  from 13 Nov 2021 00:00 onwards (no toDate given)
#  - hughesnet_tweets2: from 30 Oct 2021 00:00 to 13 Nov 2021 23:59
#Both windows include 13 Nov, so the same tweet may appear in both results
hughesnet_tweets = api.search_30_day(dev_env, hughesnet_query, fromDate = '202111130000')
hughesnet_tweets2 = api.search_30_day(dev_env, hughesnet_query, fromDate = '202110300000', toDate = '202111132359')

In [63]:
#Create pandas dataframe for the first window (13 Nov onwards)
hughesnet_df1 = tweets_to_df('hughesnet',hughesnet_tweets)

#Create pandas dataframe for the second window (30 Oct – 13 Nov).
#Labelled 'hughesnet' like the first window, so the ISP_Name column is correct
hughesnet_df2 = tweets_to_df('hughesnet',hughesnet_tweets2)

#Merge and drop duplicates (rows sharing the same Text)
hughesnet_merged = pd.concat([hughesnet_df1,hughesnet_df2]).drop_duplicates(subset=['Text'])

#Report how many rows the de-duplication removed
print(f"Row count dropped from {len(hughesnet_df1)+len(hughesnet_df2)} to {hughesnet_merged.shape[0]}")

Row count dropped from 200 to 198


---

### c. Other US ISPs

In [73]:
#Search configuration for the remaining US ISPs. Each entry is a two-item list:
#  [0] the search phrase
#  [1] the handles to exclude – the first already carries the '-from:' operator,
#      the others get it when the list is joined with ' -from:'
cox = [
    'cox internet',
    ['-from:coxbusiness', 'CarolCox'],
]
spectrum = [
    'spectrum internet',
    ['-from:Ask_Spectrum', 'GetSpectrum', 'SpectrumBiz'],
]
suddenlink = [
    'suddenlink internet',
    ['-from:SuddenlinkHelp'],
]
xfinity = [
    'xfinity internet',
    ['-from:Xfinity'],
]
rcn_internet = [
    'rcn internet',
    ['-from:RCNconnects', 'RCNBusiness'],
]
verizon = [
    'verizon internet',
    ['-from:VerizonSupport'],
]

In [75]:
#Per-ISP dataframes; concatenated once after the loop
#(DataFrame.append was removed in pandas 2.0)
us_isp_frames = []

for isp in [cox,spectrum,suddenlink, xfinity, rcn_internet, verizon]:
    
    #Get handles to exclude
    excl_handles = ' -from:'.join(isp[1])
    
    #Define the query
    api_query = f""" {isp[0]} {excl_handles} """
    
    #Extract tweets in four consecutive windows (timestamps written as YYYYMMDDhhmm);
    #the last window has no toDate
    tweets1 = api.search_30_day(dev_env, api_query, fromDate = '202110300000', toDate = '202111062359')
    tweets2 = api.search_30_day(dev_env, api_query, fromDate = '202111070000', toDate = '202111142359')
    tweets3 = api.search_30_day(dev_env, api_query, fromDate = '202111150000', toDate = '202111222359')
    tweets4 = api.search_30_day(dev_env, api_query, fromDate = '202111230000')
    
    #Create pandas dataframes (ISP_Name is set to the search phrase, e.g. 'cox internet')
    df1 = tweets_to_df(isp[0],tweets1)
    df2 = tweets_to_df(isp[0],tweets2)
    df3 = tweets_to_df(isp[0],tweets3)
    df4 = tweets_to_df(isp[0],tweets4)
    
    #Merge and remove duplicates (rows sharing the same Text)
    us_merged_df = pd.concat([df1,df2,df3,df4]).drop_duplicates(subset=['Text'])
    
    #Keep this ISP's dataframe for the final concatenation
    us_isp_frames.append(us_merged_df)

#Dataframe containing all the US ISPs tweets
us_internet_df = pd.concat(us_isp_frames)
    

In [77]:
#Preview the first rows of the combined US ISP tweets
us_internet_df.head()

Unnamed: 0,ISP_Name,Time,Text,Coordinates,Place,Source
0,cox internet,2021-11-06 23:19:40+00:00,Hey COX is internet down in the 89101 area?,,,Twitter for iPhone
1,cox internet,2021-11-06 22:50:08+00:00,#PartnersInAction: Cox Communications wants to...,,,Twitter Web App
2,cox internet,2021-11-06 21:46:03+00:00,@CTrevorNelson @Verizon @ATT I discovered the ...,,,Twitter Web App
3,cox internet,2021-11-06 21:41:24+00:00,Internet can't seem to keep it steady at all.....,,,Twitter for Android
4,cox internet,2021-11-06 20:22:22+00:00,@J21SportsFan @PigLouie Dumb question but it r...,,,Twitter for iPhone


In [79]:
#Stack the Nigerian, Mediacom, HughesNet and remaining US ISP dataframes into one.
#Each input keeps its own row index, so index values repeat in the result
final_nov_27th_df = pd.concat([nigerian_isp_df, mediacom_merged, hughesnet_merged, us_internet_df])

In [82]:
#Save the Nov 27th checkpoint to disk; index=False leaves the row index out of the CSV
final_nov_27th_df.to_csv("../../data/analogous-data/analogous_tweets_nov_27.csv",index=False)

---