# Twitter Scraper

References:
- https://medium.com/dataseries/how-to-scrape-millions-of-tweets-using-snscrape-195ee3594721
- https://www.youtube.com/watch?v=PUMMCLrVn8A

In [13]:
#!pip3 -qqq install snscrape 
#!pip install tqdm

In [14]:
import snscrape.modules.twitter as snt
from tqdm import tqdm  # for our progress bar
import pandas as pd
import re

In [15]:
# Compiled once at definition time so that cleaning many tweets does not
# rebuild the (large) character class on every call.
#
# NOTE(review): several ranges below are far broader than "emoji".  In
# particular U+24C2-U+1F251 also covers CJK ideographs, kana and Hangul, and
# U+10000-U+10FFFF covers every supplementary-plane character, so text in
# those scripts is stripped as well.  The set is kept exactly as it was to
# preserve existing output; narrow it deliberately if that is not intended.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # very broad: enclosed chars through CJK etc.
    "\U0001F926-\U0001F937"  # gesture pictographs
    "\U00010000-\U0010FFFF"  # all supplementary planes
    "\u2640-\u2642"          # gender signs
    "\u2600-\u2B55"          # misc symbols through U+2B55
    "\u200d"                 # zero-width joiner
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "]+",
    re.UNICODE,
)


def remove_emojis(data):
    """Return ``data`` with every character matched by the emoji set removed.

    Args:
        data: The string to clean.

    Returns:
        A new string in which each run of matched characters has been
        replaced by the empty string.  Surrounding whitespace is left
        untouched, so removing an emoji between two spaces leaves both spaces.
    """
    return _EMOJI_PATTERN.sub('', data)

In [16]:
def clean_tweet(tweet):
    """Strip URLs, @mentions, #hashtags and emojis from ``tweet.content``.

    The tweet object is modified in place; nothing is returned.

    Args:
        tweet: An object exposing a writable string attribute ``content``.
    """
    text = tweet.content
    # Each pattern deletes the marker plus everything up to the next
    # whitespace; the passes run in this order on the previous pass's output.
    for pattern in (r'http\S+', r'@\S+', r'#\S+'):
        text = re.sub(pattern, '', text)
    tweet.content = remove_emojis(text)

In [17]:
def grab_tweets(total_number, topic):
    """Scrape and clean at most ``total_number`` tweets matching ``topic``.

    Every tweet is passed through ``clean_tweet`` (which rewrites its
    ``content`` in place) before its fields are recorded.

    Args:
        total_number: Maximum number of tweets to collect.  Zero or a
            negative value yields an empty list.
        topic: Search query handed to ``snt.TwitterSearchScraper``.

    Returns:
        A list of rows, one per tweet, each of the form
        ``[date, id, content, username, likeCount, retweetCount]``, in the
        order the scraper produced them.  Fewer than ``total_number`` rows
        are returned if the scraper runs out of results first.
    """
    from itertools import islice

    limit = max(total_number, 0)
    scraper = snt.TwitterSearchScraper(topic)

    # islice stops after exactly `limit` items.  The previous
    # `index == total_number` check ran after the append, so it collected
    # one tweet too many (and one tweet even when zero were requested).
    limited_items = islice(scraper.get_items(), limit)

    final_tweets = []
    for tweet in tqdm(limited_items, total=limit, leave=False):
        clean_tweet(tweet)
        final_tweets.append([
            tweet.date,
            tweet.id,
            tweet.content,
            tweet.user.username,
            tweet.likeCount,
            tweet.retweetCount,
        ])
    return final_tweets

Change the number of tweets and the topic to obtain different results.

In [18]:
import os

# Scrape parameters: how many tweets to collect and the search query.
num_tweets = 1000
topic = "female refugee"

# Output location; the file name embeds the topic verbatim (spaces included).
output_dir = "tweet_dataset"
csv_path = output_dir + "/" + topic + "_tweet.csv"

# Create the output directory before scraping: to_csv does not create missing
# directories, and failing on the last line would discard the whole scrape.
os.makedirs(output_dir, exist_ok=True)

final_tweets_data = grab_tweets(num_tweets, topic)

# Column order mirrors the row layout produced by grab_tweets.
tweet_df = pd.DataFrame(
    final_tweets_data,
    columns=["date", "id", "content", "username", "like_count", "retweet_count"],
)
tweet_df.to_csv(csv_path)

                                                  