# Notebook for Scraping Twitter Using Tweepy

**Run `pip install tweepy` to install the Tweepy library before using this notebook.**

## Importing necessary libraries

In [2]:
import tweepy
import pandas as pd
import time
import regex as re

## Credentials for interacting with Twitter API

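You must apply for a Twitter developer account in order to receive API credentials. The cell below reads them from `credentials.csv`, a header-less two-column file (`name,key`) whose rows are, in order: consumer key, consumer secret, access token, access token secret.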

In [3]:
# Credentials
# credentials.csv has no header row; every row is "<name>,<key>".
credentials_df = pd.read_csv('credentials.csv', names=['name', 'key'], header=None)

# The four secrets are picked by row position:
# 0 = consumer key, 1 = consumer secret, 2 = access token, 3 = access token secret.
consumer_key = credentials_df.loc[0, 'key']
consumer_secret = credentials_df.loc[1, 'key']
access_token = credentials_df.loc[2, 'key']
access_token_secret = credentials_df.loc[3, 'key']

# Build the OAuth handler and an API client that waits when rate limited.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

In [4]:
# Extracting coordinates of tweets

def extract_coordinates(row):
    """Return the 'coordinates' entry of a row's 'Tweet Coordinates' value.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a 'Tweet Coordinates' entry.

    Returns
    -------
    The value stored under 'coordinates', or None when the
    'Tweet Coordinates' entry is falsy (e.g. None or empty).
    """
    geo = row['Tweet Coordinates']
    return geo['coordinates'] if geo else None

# Extracting place of tweet

def extract_place(row):
    """Return the `full_name` attribute of a row's 'Place Info' value.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a 'Place Info' entry.

    Returns
    -------
    The object's `full_name`, or None when the 'Place Info' entry is falsy.
    """
    place = row['Place Info']
    if not place:
        return None
    return place.full_name

In [5]:
# Removing any unnecessary characters
def remove_url(txt):
    """Strip URLs and non-alphanumeric characters from a piece of text.

    Every character that is not an ASCII letter, a digit, a space or a tab
    is deleted, as is anything shaped like ``scheme://...`` up to the next
    whitespace. Remaining runs of whitespace are collapsed to single spaces
    and leading/trailing whitespace is dropped.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw string literal: in a plain literal `\w`, `\/` and `\S` are invalid
    # escape sequences (SyntaxWarning on Python 3.12+). The pattern the regex
    # engine receives is unchanged.
    stripped = re.sub(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt)
    return " ".join(stripped.split())

# Show full cell contents: None removes the column-width limit on displayed text
pd.options.display.max_colwidth = None

## Extracting tweets from a person

In [104]:
username = 'narendramodi'
max_tweets = 150

# Page through the account's timeline, stopping after `max_tweets` items.
# The handle is passed as `screen_name` rather than the ambiguous `id`
# keyword: `screen_name` is accepted by both Tweepy 3.x and 4.x, whereas
# `id` is no longer a recognised parameter of `user_timeline` in Tweepy 4.
tweets = tweepy.Cursor(api.user_timeline, screen_name=username, tweet_mode='extended').items(max_tweets)

# Information you want from the tweet (one inner list per tweet; the order
# must match the column names given to the DataFrame below)
tweets_list = [[remove_url(tweet.full_text), tweet.created_at, tweet.id_str, tweet.user.screen_name, tweet.coordinates, tweet.place, tweet.retweet_count, tweet.favorite_count, tweet.lang, tweet.source, tweet.in_reply_to_status_id_str, tweet.in_reply_to_user_id_str, tweet.is_quote_status] for tweet in tweets]

# Creating a DataFrame from the collected tweet fields
tweets_df = pd.DataFrame(tweets_list,columns=['Tweet Text', 'Tweet Datetime', 'Tweet Id', 'Twitter @ Name', 'Tweet Coordinates', 'Place Info', 'Retweets', 'Favorites', 'Language', 'Source', 'Replied Tweet Id', 'Replied Tweet User Id Str', 'Quote Status Bool'])

# Coordinates extraction: replace the raw coordinates object with its 'coordinates' entry (or None)
tweets_df['Tweet Coordinates'] = tweets_df.apply(extract_coordinates,axis=1)

# Place extraction: replace the raw place object with its full_name (or None)
tweets_df['Place Info'] = tweets_df.apply(extract_place,axis=1)

In [105]:
# Preview the first five tweets (head() returns 5 rows by default)
tweets_df.head()

Unnamed: 0,Tweet Text,Tweet Datetime,Tweet Id,Twitter @ Name,Tweet Coordinates,Place Info,Retweets,Favorites,Language,Source,Replied Tweet Id,Replied Tweet User Id Str,Quote Status Bool
0,Interacted with PGelsinger the CEO of intel Had extensive discussions on the role of technology in furthering human progress the DigitalIndia efforts and investment opportunities in India,2021-04-13 16:08:02,1382002649432649728,narendramodi,,,2017,13497,en,Twitter for iPhone,,,False
1,Best wishes for the holy month of Ramzan,2021-04-13 14:42:11,1381981045369806848,narendramodi,,,2845,23551,en,Twitter for iPhone,,,False
2,Addressing the raisinadialogue,2021-04-13 14:25:00,1381976719893012481,narendramodi,,,1823,7405,en,Twitter Media Studio,,,False
3,Best wishes on the auspicious occasion of Baisakhi,2021-04-13 07:19:31,1381869644781338625,narendramodi,,,2590,17366,en,Twitter for iPhone,,,False
4,Tributes to those martyred in the Jallianwala Bagh massacre Their courage heroism and sacrifice gives strength to every Indian,2021-04-13 02:35:25,1381798148817907724,narendramodi,,,4996,31558,en,Twitter for iPhone,,,False


In [106]:
# Show just the text of the first five tweets
tweets_df["Tweet Text"].head(5)

0    Interacted with PGelsinger the CEO of intel Had extensive discussions on the role of technology in furthering human progress the DigitalIndia efforts and investment opportunities in India
1                                                                                                                                                       Best wishes for the holy month of Ramzan
2                                                                                                                                                                 Addressing the raisinadialogue
3                                                                                                                                             Best wishes on the auspicious occasion of Baisakhi
4                                                                 Tributes to those martyred in the Jallianwala Bagh massacre Their courage heroism and sacrifice gives strength to every Indian
Name: Tweet Text, dtype: object

## Extracting tweets from a text keyword

In [109]:
text_query = 'Coronavirus'
max_tweets = 150

# Tweepy 4 renamed `API.search` to `API.search_tweets`. Pick whichever method
# the installed version provides so the cell runs on both 3.x and 4.x.
search_endpoint = api.search_tweets if hasattr(api, 'search_tweets') else api.search

# Page through the search results, stopping after `max_tweets` items.
tweets = tweepy.Cursor(search_endpoint,q=text_query,tweet_mode='extended').items(max_tweets)

# One inner list per tweet: the cleaned text, tweet metadata, then the author's
# profile fields. The order must match the column names given below
# (followers_count -> 'Users Following Count',
#  friends_count -> 'Number users this account is following').
tweets_list = [[remove_url(tweet.full_text), tweet.created_at, tweet.id_str, tweet.user.name, 
                tweet.user.screen_name, tweet.user.id_str, tweet.user.location, 
                tweet.user.url, tweet.user.description, tweet.user.verified, tweet.user.followers_count, 
                tweet.user.friends_count, tweet.user.favourites_count, tweet.user.statuses_count, tweet.user.listed_count, 
                tweet.user.created_at, tweet.user.profile_image_url_https, tweet.user.default_profile, tweet.user.default_profile_image] for tweet in tweets]

# Creating a DataFrame from the collected tweet and user fields
tweets_df = pd.DataFrame(tweets_list,columns=['Tweet Text', 'Tweet Datetime', 'Tweet Id', 'Twitter Username', 'Twitter @ name',
                                             'Twitter User Id', 'Twitter User Location', 'URL in Bio', 'Twitter Bio',
                                             'User Verified Status', 'Users Following Count',
                                             'Number users this account is following', 'Users Number of Likes', 'Users Tweet Count',
                                             'Lists Containing User', 'Account Created Time', 'Profile Image URL',
                                             'User Default Profile', 'User Default Profile Image'])

In [111]:
# Preview the first five search results
tweets_df.head(5)

Unnamed: 0,Tweet Text,Tweet Datetime,Tweet Id,Twitter Username,Twitter @ name,Twitter User Id,Twitter User Location,URL in Bio,Twitter Bio,User Verified Status,Users Following Count,Number users this account is following,Users Number of Likes,Users Tweet Count,Lists Containing User,Account Created Time,Profile Image URL,User Default Profile,User Default Profile Image
0,RT MarinaMedvin Preclinical research by Israeli scientists published in Microbiome indicates that Kefir could be used to treat cytokin,2021-04-13 22:44:03,1382102309145022464,Sam De Moline,SamMoline1,1288547117968117761,"Minnesota, USA",,Proud Mom to two teens•Oath taker•Christian•writer•not looking for love•#Trump2020•MS Warrior• 🇺🇸✞🇺🇸✞🇺🇸ReTweeted by @JackPosobiec & @JackMurphyLive,False,1233,2010,19983,17968,1,2020-07-29 18:49:21,https://pbs.twimg.com/profile_images/1351527914941140993/vqvthGuP_normal.jpg,True,False
1,Bijna n miljoen extra coronavaccinaties geteld 31626665126,2021-04-13 22:44:03,1382102308838727681,Dutch Taxi Company,dutchtaxic,2827075221,"Schiphol-Rijk, Haarlemmermeer",https://t.co/djL6uqy90f,Dutch Taxi Company | #Amsterdam #Dutch #Schiphol #Airport# Taxi #service | official account | +31626665126,False,2850,136,6,115243,44,2014-10-13 08:44:38,https://pbs.twimg.com/profile_images/704698907045994496/MwXStfhr_normal.jpg,False,False
2,How the Coronavirus Variants Are Spreading in New York City The New York Times Search your zip code and if you are from New York City but dont have access to see this let me know and Ill search,2021-04-13 22:44:03,1382102308142596110,🙃,innaaatalks,236633598,New York City,https://t.co/HgGjXWkh1P,Mental Health Advocate • Sarcastic • Dog Mom • Vegetarian • Activist • Nature • Clinical Psychology — she/her ☀️,False,1424,1127,5227,8850,7,2011-01-11 01:23:19,https://pbs.twimg.com/profile_images/1372721308300115969/KVwA71-q_normal.jpg,False,False
3,Germany is supporting Ghana to do a feasibility study on vaccine production in Ghana GhanaWeb22,2021-04-13 22:44:02,1382102305282080775,GhanaWeb,TheGhanaWeb,1673236537,,https://t.co/AF27crjAXY,"GhanaWeb is Ghana's first vertical portal, content curation and syndication website that publishes news, business, entertainment, sports and opinion articles.",False,105753,126,412,182838,110,2013-08-15 13:39:02,https://pbs.twimg.com/profile_images/1361369604531101697/vRU-IU48_normal.jpg,False,False
4,RT FridaGhitis Researchers analyzed 38 million Englishlanguage articles about the pandemic found that President Trump was the single la,2021-04-13 22:44:02,1382102304573251585,Becca,BeckyRae12345,3082370998,,,#TheResistance 🏳️‍🌈,False,1986,5002,162088,508790,9,2015-03-09 19:56:48,https://pbs.twimg.com/profile_images/1293357947339378689/PfxOfak1_normal.jpg,True,False
