# Using `tweepy` to Pull Tweets from Twitter

Author: Lu ZhiPing

In [1]:
from dotenv import load_dotenv
import os
import pandas as pd
import tweepy
load_dotenv()

TOKEN = os.getenv("TWITTER_BEARE_TOKEN")
PYMONGO_USERNAME = os.getenv("PYMONGO_USERNAME")
PYMONGO_PASSWORD = os.getenv("PYMONGO_PASSWORD")
MONGO_URL = os.getenv("MONGO_URL")

tw_client = tweepy.Client(bearer_token=TOKEN)


from pymongo import MongoClient
mg_client = MongoClient(
    MONGO_URL,
    username=PYMONGO_USERNAME,
    password=PYMONGO_PASSWORD
)

db = mg_client["PLP"]
collection = db["AStarCOVID"]

df = pd.read_csv("tweet_id_sg_sample.csv")
df.shape

(237045, 22)

In [2]:
import glob
path = os.getcwd()
csv_files = glob.glob(os.path.join(path, "batch_*.csv"))

dataframes = [pd.read_csv(file) for file in csv_files]
ddf = pd.concat(dataframes, ignore_index=True, axis=0)
ddf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45833 entries, 0 to 45832
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_ID  45833 non-null  int64 
 1   Text      45833 non-null  object
dtypes: int64(1), object(1)
memory usage: 716.3+ KB


In [3]:
ddf.head()

Unnamed: 0,tweet_ID,Text
0,1224742687137419264,Coronavirus outbreak: Singapore to provide S$1...
1,1224741485859557378,Coronavirus: Royal Caribbean warns of more cru...
2,1224739548690694144,The Wuhan Coronavirus Poses Three Tests for Gl...
3,1224738837198278656,@asadowaisi Pakistan is not rescuing their peo...
4,1224738184614907904,The world will pay a growth price for the Wuha...


In [4]:
df = df[~df.tweet_ID.isin(ddf.tweet_ID)]
df.shape

(191209, 22)

In [5]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

process_list = list(chunks(df.tweet_ID.to_list(), 100))
len(process_list)

1913

In [6]:
len(process_list[0])

100

In [7]:
len(process_list) / 298

6.419463087248322

In [8]:
from tqdm import tqdm
from time import sleep
tweets = []

for idx, item in tqdm(enumerate(process_list), total=len(process_list)):
    if idx == 0 or idx % 298 != 0:
        responds = tw_client.get_tweets(ids=item)
        if responds.data:
            for doc in responds.data:
                tweets.append([doc.id, doc.text])
    elif idx % 298 == 0:
        print("Now sleep for 15 minutes")
        sleep(900)

 16%|█▌        | 298/1913 [02:28<16:08,  1.67it/s]

Now sleep for 15 minutes


 31%|███       | 596/1913 [20:23<12:53,  1.70it/s]     

Now sleep for 15 minutes


 47%|████▋     | 894/1913 [38:16<09:43,  1.75it/s]    

Now sleep for 15 minutes


 62%|██████▏   | 1192/1913 [56:01<06:29,  1.85it/s]   

Now sleep for 15 minutes


 78%|███████▊  | 1490/1913 [1:13:42<03:58,  1.77it/s]    

Now sleep for 15 minutes


 93%|█████████▎| 1788/1913 [1:31:17<01:03,  1.96it/s]    

Now sleep for 15 minutes


100%|██████████| 1913/1913 [1:47:21<00:00,  3.37s/it]   


In [9]:
ddf_2 = pd.DataFrame(tweets, columns=["tweet_ID", "Text"])

In [10]:
ddf_2

Unnamed: 0,tweet_ID,Text
0,1245550415581716481,HDB closes Bukit Merah branch office after sec...
1,1245550321511718912,HDB closes Bukit Merah branch office after sec...
2,1245550270190419969,Quarantine stress baking? 😆\n\nhttps://t.co/zH...
3,1245550206457954305,Every vaccine and treatment in development for...
4,1245548702233583618,this was the second read.. \nhttps://t.co/wYID...
...,...,...
158942,1345035968789635072,Happy new year everyone! Hopefully 2021 will b...
158943,1345035677847724033,@andrew_lilico More of a distinction should be...
158944,1345035499618988035,@SashaAlexandre2 @darakass We mostly survive c...
158945,1345035072705990663,"Ngee Ann City, Bedok Mall and several restaura..."


In [11]:
ddf_2.shape

(158947, 2)

In [12]:
ddf.shape

(45833, 2)

In [13]:
# ddf_2.to_csv("batch_3.csv", index=None)

In [14]:
dataframe = pd.concat([ddf, ddf_2], ignore_index=True, axis=0)
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204780 entries, 0 to 204779
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   tweet_ID  204780 non-null  int64 
 1   Text      204780 non-null  object
dtypes: int64(1), object(1)
memory usage: 3.1+ MB


In [19]:
final = df.merge(dataframe, on="tweet_ID")
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161390 entries, 0 to 161389
Data columns (total 23 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   tweet_ID            161390 non-null  int64  
 1   user_ID             161390 non-null  int64  
 2   t1                  161390 non-null  int64  
 3   t2                  161390 non-null  int64  
 4   t3                  161390 non-null  int64  
 5   t4                  161390 non-null  int64  
 6   t5                  161390 non-null  int64  
 7   t6                  161390 non-null  int64  
 8   t7                  161390 non-null  int64  
 9   t8                  161390 non-null  int64  
 10  t9                  161390 non-null  int64  
 11  t10                 161390 non-null  int64  
 12  valence_intensity   161390 non-null  float64
 13  anger_intensity     161390 non-null  float64
 14  fear_intensity      161390 non-null  float64
 15  sadness_intensity   161390 non-nul

In [20]:
for _, row in tqdm(final.iterrows(), total=final.shape[0]):
    document = dict()
    for i in row.index:
        document[i] = row[i]
    # print(document)
    collection.insert_one(document)

100%|██████████| 161390/161390 [19:19<00:00, 139.18it/s]


In [1]:
from sentiment.dataset.load_dataset import LoadDataset

dataset = LoadDataset(
    database_name="PLP",
    collection_name="AStarCOVID",
    n_rows=100
)

len(dataset)

NOTICE: sentiment log file will be at /Users/johnnylu/tweet_sentiment/sentiment/logs/sentiment.log


161390

In [2]:
df = dataset.to_pandas()

100%|██████████| 100/100 [00:00<00:00, 514.92it/s]


In [3]:
df

Unnamed: 0,_id,tweet_ID,user_ID,t1,t2,t3,t4,t5,t6,t7,...,anger_intensity,fear_intensity,sadness_intensity,joy_intensity,sentiment_category,emotion_category,keyword_used,country_region,date_stamp,Text
0,634637137380598a236355ae,1245550415581716481,37874853,1,0,0,0,0,0,0,...,0.440,0.490,0.437,0.281,neutral,no specific emotion,covid,Singapore,2020-04-02 00:00:00,HDB closes Bukit Merah branch office after sec...
1,634637137380598a236355af,1245550321511718912,44290654,1,0,0,0,0,0,0,...,0.440,0.490,0.437,0.281,neutral,no specific emotion,covid,Singapore,2020-04-02 00:00:00,HDB closes Bukit Merah branch office after sec...
2,634637137380598a236355b0,1245550270190419969,115624161,1,1,1,1,0,0,0,...,0.476,0.512,0.446,0.162,negative,fear,covid,Singapore,2020-04-02 00:00:00,Quarantine stress baking? 😆\n\nhttps://t.co/zH...
3,634637137380598a236355b1,1245550206457954305,20155794,1,0,0,0,0,0,0,...,0.334,0.423,0.340,0.319,neutral,no specific emotion,covid,Singapore,2020-04-02 00:00:00,Every vaccine and treatment in development for...
4,634637137380598a236355b2,1245548702233583618,35202527,1,1,1,1,0,0,0,...,0.346,0.348,0.425,0.255,negative,sadness,covid,Singapore,2020-04-02 00:00:00,this was the second read.. \nhttps://t.co/wYID...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,634637147380598a2363560d,1245576204029054976,97911392,1,1,0,0,0,0,0,...,0.516,0.414,0.406,0.198,negative,anger,covid,Singapore,2020-04-02 00:00:00,If you do not take your employees health and s...
96,634637147380598a2363560e,1245575450606223360,3221009976,0,0,0,0,0,0,0,...,0.419,0.493,0.482,0.278,negative,fear,covid,Singapore,2020-04-02 00:00:00,Collective Action Required To Alleviate Stress...
97,634637147380598a2363560f,1245575148511481856,226636502,1,0,0,0,1,0,0,...,0.429,0.470,0.403,0.228,negative,fear,covid,Singapore,2020-04-02 00:00:00,COVID-19 - You should be spending a possible t...
98,634637147380598a23635610,1245574615809708038,442445678,1,1,1,1,1,1,1,...,0.369,0.331,0.392,0.189,negative,sadness,covid,Singapore,2020-04-02 00:00:00,please delete covid-19 https://t.co/Zar3segMC2


In [5]:
sentiments = []

for item in dataset:
    sentiments.append(item["sentiment_category"])
len(sentiments)

100

In [6]:
sentiments

['neutral',
 'neutral',
 'negative',
 'neutral',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'positive',
 'neutral',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive',
 'positive',
 'neutral',
 'positive',
 'negative',
 'neutral',
 'neutral',
 'very negative',
 'positive',
 'positive',
 'negative',
 'very negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'neutral',
 'very negative',
 'negative',
 'positive',
 'neutral',
 'negative',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'positive',
 'positive',
 'negative',
 'neutral',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'negative',
 'negative',

In [7]:
from collections import Counter


Counter(sentiments)

Counter({'neutral': 16, 'negative': 45, 'positive': 34, 'very negative': 5})

In [8]:
dataset.n_rows

100

In [9]:
dataset.n_rows = len(dataset)

In [10]:
dataset.n_rows

161390