# STEP 1: PYTHON PACKAGES INSTALLATION

In [1]:
!pip install tweepy
!pip install unidecode



# STEP 2: IMPORT PACKAGES


In [2]:
#import dependencies
import tweepy
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
from unidecode import unidecode
import time
import datetime
from tqdm import tqdm 
import pandas as pd  
import numpy as np

# STEP 3: AUTHENTICATING TO TWITTER'S API

In [3]:
consumer_key = 'YOUR-CONSUMER-KEY'
consumer_secret = 'YOUR-CONSUMER-SECRET'
access_token = 'YOUR-ACCESS-TOKEN'
access_secret = 'YOUR-ACCESS-SECRET'

# STEP 4: CONNECT TO TWITTER API USING THE SECRET KEY AND ACCESS TOKEN

In [4]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth)

# STEP 5: DEFINE A FUNCTION THAT WILL TAKE OUR SEARCH QUERY

In [5]:
def tweetSearch(query, limit):
    """
    This function will search a query provided in the twitter and,
    retun a list of all tweets that have a query. 
    """

    # Create a blank variable
    tweets = []

    # Iterate through Twitter using Tweepy to find our query with our defined limit
    for page in tweepy.Cursor(
        api.search, q=query, count=limit, tweet_mode="extended"
    ).pages(limit):
        for tweet in page:
            tweets.append(tweet)

    # return tweets
    return tweets

# STEP 6: CREATE A FUNCTION TO SAVE TWEETS INTO A DATAFRAME

In [6]:

def tweets_to_data_frame(tweets):
    """
    This function will receive tweets and collect specific data from it such as place, tweet's text,likes 
    retweets and save them into a pandas data frame.
    
    This function will return a pandas data frame that contains data from twitter.
    """
    df = pd.DataFrame(data=[tweet.full_text.encode('utf-8') for tweet in tweets], columns=["Tweets"])

    df["id"] = np.array([tweet.id for tweet in tweets])
    df["lens"] = np.array([len(tweet.full_text) for tweet in tweets])
    df["date"] = np.array([tweet.created_at for tweet in tweets])
    df["place"] = np.array([tweet.place for tweet in tweets])
    df["coordinateS"] = np.array([tweet.coordinates for tweet in tweets])
    df["lang"] = np.array([tweet.lang for tweet in tweets])
    df["source"] = np.array([tweet.source for tweet in tweets])
    df["likes"] = np.array([tweet.favorite_count for tweet in tweets])
    df["retweets"] = np.array([tweet.retweet_count for tweet in tweets])

    return df

# STEP 7: ADD TWITTER HASHTAGS RELATED TO GENDER-BASED VIOLENCE

In [7]:
hashtags = ['#GBV', '#sexism', '#rape']

# STEP 8: RUN BOTH FUNCTIONS TO COLLECT DATA FROM TWITTER RELATED TO THE HASHTAGS LISTED ABOVE

In [8]:
total_tweets = 0

"""
The following for loop will collect a tweets that have the hashtags
 mentioned in the list and save the tweets into csv file
"""

for n in tqdm(hashtags):
    # first we fetch all tweets that have specific hashtag
    hash_tweets = tweetSearch(query=n,limit=7000)
    total_tweets += int(len(hash_tweets))
    
    # second we convert our tweets into datarame
    df = tweets_to_data_frame(hash_tweets)
    
    #third we save the dataframe into csv file
    df.to_csv("scraped_data.csv".format(n))

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [04:29<00:00, 89.92s/it]


In [9]:
df

Unnamed: 0,Tweets,id,lens,date,place,coordinateS,lang,source,likes,retweets
0,b'RT @eliasamare: This is exactly the arrogant...,1394745675418640386,148,2021-05-18 20:04:16,,,en,Twitter for Android,0,67
1,"b""@kushibo @DaniPayson Encouraging ANYONE in t...",1394744832317349888,332,2021-05-18 20:00:55,Place(_api=<tweepy.api.API object at 0x000001E...,,en,Twitter for Android,0,0
2,b'RT @PaulDereume: @jensdad4biden @QuippieChic...,1394744504377241601,140,2021-05-18 19:59:37,,,en,Twitter for iPhone,0,1
3,b'RT @BombshellDAILY: REPUBLICAN RAPED HIS CLI...,1394743634994638852,140,2021-05-18 19:56:10,,,en,Twitter for iPad,0,71
4,b'@jensdad4biden @QuippieChick @PrezLives2022 ...,1394742794304978944,508,2021-05-18 19:52:49,,,en,Twitter Web App,0,1
...,...,...,...,...,...,...,...,...,...,...
6220,b'RT @ZeeNewsCrime: \xe0\xa4\x8f\xe0\xa4\x95 \...,1392032659493199874,140,2021-05-11 08:23:43,,,hi,Twitter for Android,0,10
6221,b'\xc2\xbfConoces las propiedades del rape? \x...,1392032550084923393,236,2021-05-11 08:23:17,,,es,Twitter Web App,0,0
6222,b'RT @DograSonakshi: Kitni lasho aur khoon ko ...,1392032252037455876,140,2021-05-11 08:22:06,,,en,Twitter for Android,0,298
6223,b'RT @i_Aakashsingh: This is very heinous crim...,1392032187692634112,139,2021-05-11 08:21:50,,,en,Twitter for Android,0,3


df

In [10]:
# show total number of tweets collected
print("total_tweets: {}".format(total_tweets))

total_tweets: 9440
