# Twitter API Scraping

### Packages to load

In [1]:
import tweepy
import configparser
import pandas as pd
import numpy as np
import json
from twython import Twython

### Other stuff to load

In [5]:
# nothing

### Loading credentials and Twython authentication

In [2]:
with open("../../../config_files/credentials.json") as infile:
    credentials = json.load(infile)

twitter_client = Twython(credentials["api_key"],
                         credentials["api_secret"],
                         credentials["access_token"],
                         credentials["access_token_secret"])

### Data gathering test from Twython

In [3]:
# trying to gather from Twython

keywords = '#iPhone14Pro'

twitter_client.search(q=keywords)

{'statuses': [{'created_at': 'Thu Sep 15 08:37:39 +0000 2022',
   'id': 1570330997606219776,
   'id_str': '1570330997606219776',
   'text': '出荷されたー！\n明日だぁー！\n#iPhone14Pro #ゴールド https://t.co/W2kUkToG3i',
   'truncated': False,
   'entities': {'hashtags': [{'text': 'iPhone14Pro', 'indices': [15, 27]},
     {'text': 'ゴールド', 'indices': [28, 33]}],
    'symbols': [],
    'user_mentions': [],
    'urls': [],
    'media': [{'id': 1570330989448286208,
      'id_str': '1570330989448286208',
      'indices': [34, 57],
      'media_url': 'http://pbs.twimg.com/media/FcrvtwXaUAAshUn.jpg',
      'media_url_https': 'https://pbs.twimg.com/media/FcrvtwXaUAAshUn.jpg',
      'url': 'https://t.co/W2kUkToG3i',
      'display_url': 'pic.twitter.com/W2kUkToG3i',
      'expanded_url': 'https://twitter.com/RN_666777/status/1570330997606219776/photo/1',
      'type': 'photo',
      'sizes': {'medium': {'w': 554, 'h': 1200, 'resize': 'fit'},
       'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
       'small':

### Loading credentials and Tweepy authentication

In [4]:
# read configs

config = configparser.ConfigParser()
config.read('../../../config_files/config.ini')

api_key = config['twitter']['api_key']
api_key_secret = config['twitter']['api_key_secret']

access_token = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']

In [5]:
# authentication

auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

### Data gathering from Twitter API

#### Option 1

In [10]:
# search by user

# user = 'veritasium'
# limit=300
# tweets = tweepy.Cursor(api.user_timeline, screen_name=user, count=200, tweet_mode='extended').items(limit)

# search by keyword or hashtag (works too for @user)
keywords = "#iPhone14Pro -filter:retweets"
limit = 1000
tweets = tweepy.Cursor(
    api.search_tweets, q=keywords, count=100, lang="en", tweet_mode="extended").items(limit)

# create DataFrame
columns = [
    "User",
    "Tweet_time",
    "Tweet",
    "Hashtag",
    "Retweet",
    "Favorite",
    "Join_time",
    "Follower",
    "Friend",
]

data = []

for tweet in tweets:
    data.append(
        [
            tweet.user.screen_name,
            tweet.created_at,
            tweet.full_text,
            tweet.entities["hashtags"],
            tweet.retweet_count,
            tweet.favorite_count,
            tweet.user.created_at,
            tweet.user.followers_count,
            tweet.user.friends_count,
        ]
    )

df = pd.DataFrame(data, columns=columns)

df.head()

Unnamed: 0,User,Tweet_time,Tweet,Hashtag,Retweet,Favorite,Join_time,Follower,Friend
0,qiandutech11,2022-09-15 07:26:42+00:00,Are you looking for the new design phone cases...,"[{'text': 'iphone14', 'indices': [68, 77]}, {'...",0,0,2021-10-30 01:00:46+00:00,3,26
1,torro_uk,2022-09-15 07:26:40+00:00,Expecting an iPhone 14 delivery tomorrow? Don’...,"[{'text': 'iphone14', 'indices': [163, 172]}, ...",0,0,2013-03-18 19:56:37+00:00,4886,993
2,iShujaAhmedCh,2022-09-15 07:20:40+00:00,The #iPhone14Pro and #iPhone14ProMax are avail...,"[{'text': 'iPhone14Pro', 'indices': [4, 16]}, ...",0,0,2009-08-02 20:52:11+00:00,743,715
3,joanikin,2022-09-15 07:17:29+00:00,Review: “This year’s iPhones (14) defy inflati...,"[{'text': 'iPhone14', 'indices': [181, 190]}, ...",0,0,2007-04-22 04:17:07+00:00,6811,2099
4,leobarnes,2022-09-15 07:16:08+00:00,The #GoProHERO11 was announced yesterday and w...,"[{'text': 'GoProHERO11', 'indices': [4, 16]}, ...",0,0,2015-01-04 19:21:35+00:00,663,860


#### Option 2

In [15]:
# gathering data from API

keywords = '#iPhone14Pro'
limit = 1000

tweets = tweepy.Cursor(api.search_tweets, q=keywords, lang='en', count=100,
                       tweet_mode='extended').items(limit)

# creating DataFrame

columns = ['User', 'DateTime', 'Tweet']
data = []

for tweet in tweets:
    data.append([tweet.user.screen_name, tweet.created_at, tweet.full_text])
    
df = pd.DataFrame(data, columns=columns)

df.head()

Unnamed: 0,User,DateTime,Tweet
0,mMarkGreece,2022-09-15 08:02:21+00:00,@saucyrose1 Not even #itel may be. #iPhone14Pr...
1,Owenjesse8,2022-09-15 08:02:20+00:00,RT @toba_chengi: Who knows the Twitter handle ...
2,nctdreamology,2022-09-15 08:01:51+00:00,RT @Chaerryyyyyyyyy: Rekomendasi ootd ala Kore...
3,Olaniel_art,2022-09-15 08:01:16+00:00,RT @Mimi_Nation22: I love how far my mind can ...
4,phone12s_i,2022-09-15 08:00:51+00:00,RT @techboilers: iPhone 14 Pro Max has scored ...


In [13]:
#df.to_csv('../data/raw/iPhone14Prox10k_RAW_2.csv)

In [6]:
df14pro = pd.read_csv('../data/raw/iPhone14Prox10k_RAW.csv')
df14pro.head()

Unnamed: 0.1,Unnamed: 0,User,DateTime,Tweet
0,0,1202tung,2022-09-14 14:48:27+00:00,RT @insanetweet: iPhone 8 while upgrading to i...
1,1,Rushike74038427,2022-09-14 14:48:05+00:00,I want this case @Mrwhosetheboss \n#iphone14pr...
2,2,papsobu,2022-09-14 14:47:40+00:00,RT @insanetweet: iPhone 8 while upgrading to i...
3,3,_kayrozenlang,2022-09-14 14:47:27+00:00,RT @btslabs_global: 🏆Win iPhone14 Pro\n\nWith ...
4,4,SarenDebnath,2022-09-14 14:46:41+00:00,RT @btslabs_global: 🏆Win iPhone14 Pro\n\nWith ...


In [32]:
df14pro['User'].value_counts()

thebbmafrica       63
ChronicallyTG      41
FaizanA17120449    40
XXFemale1          39
sgmobmart          34
                   ..
p_929303            1
bonxcviii           1
twtfnov             1
axososad            1
Getpaidrealquic     1
Name: User, Length: 8703, dtype: int64

In [8]:
df14pro['Tweet'][0]

'RT @insanetweet: iPhone 8 while upgrading to iOS 16\n\n#iOS16 #iPhone14 #iPhone14Pro \n https://t.co/IOpAaBnHus'

In [9]:
# removing punctuations and lowercase normalization
import string
text = df14pro['Tweet'][0]
text_clean1 = ''.join([i.lower() for i in text if i not in string.punctuation])
text_clean1

'rt insanetweet iphone 8 while upgrading to ios 16\n\nios16 iphone14 iphone14pro \n httpstcoiopaabnhus'

In [11]:
# removing \n
import re
text_clean2 = re.sub('\n', ' ' , text_clean1)
text_clean2

'rt insanetweet iphone 8 while upgrading to ios 16  ios16 iphone14 iphone14pro   httpstcoiopaabnhus'

In [12]:
# removing url
text_clean3 = result = re.sub(r'http\S+', '', text_clean2)
text_clean3

'rt insanetweet iphone 8 while upgrading to ios 16  ios16 iphone14 iphone14pro   '

In [27]:
# importing stopwords
import nltk
from nltk.corpus import stopwords
cachedStopWords = stopwords.words('english')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [31]:
# removing stopwords
text = text_clean3
text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
print(text)

rt insanetweet iphone 8 upgrading ios 16 ios16 iphone14 iphone14pro
