# Twitter Scraping Using TWINT

Twitter Scraping Using TWINT
Twitter is erratic about giving out developer/API access codes. If you can't get one, we can use twint for scraping instead. As a bonus, we can easily get historical data.

In this script, we'll first set the geo coords we want (so that we only pull tweets tagged with geo coords), then run the data pull in chunks by year, convert each chunk to a dataframe, and save it to a csv.

In [None]:
# Even with the scraping tools, Twitter keeps changing the url endpoint, and other  
# access aspects, so we need to make sure to use the latest version of twint by 
# removing any previous versions and getting the latest direct from the github page.
#
# To do this, if you're having problems or errors, run this in conda:
#
# pip3 uninstall twint
# pip3 install --upgrade -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint

In [None]:
# necessary imports 
import twint
import pandas as pd
import csv
import datetime 
from datetime import timedelta
import time
import os 
# from collections import Counter

# import and run nest_asyncio to prevent twint's asynchronous loop problems in jupyter notebook
import nest_asyncio
nest_asyncio.apply()


In [None]:
# Function to manage target dates and times
def get_target_dates_min_and_max(t_year = 2020):
    """Return the date bounds that cover one calendar year.

    The lower bound is January 1st of ``t_year`` and the upper bound is
    January 1st of the following year. The range is also printed so the
    notebook shows which window is about to be scraped.

    Args:
        t_year: Calendar year to cover (defaults to 2020).

    Returns:
        A ``(target_date_min, target_date_max)`` tuple of ``datetime.date``.
    """
    # Build the upper bound first, then the lower bound.
    upper_bound = datetime.date(t_year + 1, 1, 1)
    lower_bound = datetime.date(t_year, 1, 1)
    bounds = (lower_bound, upper_bound)

    # Both bounds are plain dates, so the "00:00am" suffix is just a label.
    print("target date range: {} 00:00am to {} 00:00am".format(
        *(day.isoformat() for day in bounds)))

    return bounds

In [None]:
# Function to conduct the scrape by calling Twint, assuming you already have a config to pass in

# Twint's 'Since' setting gives inconsistent results, Twitter returns the most recent
# results first, and Twitter accepts a max date but not a max date/time.
# So we chunk backwards through history in an odd way: grab a big chunk of tweets,
# pause a few seconds, then restart the search from the oldest date not yet finished.
# The chunks overlap, so at data analysis time we'll need to clean up by removing duplicates.

def twitter_scrape_given_twint_config(c, target_date_min, target_date_max, output_filename = "output.csv", city_name = "unknown"):
    """Run repeated Twint searches backwards in time, appending each chunk to a CSV.

    Each pass calls ``twint.run.Search(c)``, reads the result from
    ``twint.storage.panda.Tweets_df``, adds an ``accessed_cityname`` column,
    writes the chunk to ``output_filename`` and then rewrites ``c.Until`` so
    the next pass continues from the oldest day collected so far.

    The loop stops when the oldest tweet collected is no later than
    ``target_date_min``, when a search returns no rows, or when ``c.Until``
    can no longer be moved (which would otherwise repeat the same query).

    Consecutive chunks can overlap, and the last chunk may reach below
    ``target_date_min``, so the CSV needs de-duplicating and trimming at
    analysis time.

    Args:
        c: Twint config object. It is mutated in place: ``c.Until`` is
            rewritten on every pass. This function does not set the initial
            ``c.Until`` or any other search option; the caller must have
            configured them (presumably including pandas storage -- TODO
            confirm against the caller).
        target_date_min: Lower bound of the wanted range; anything
            ``pd.Timestamp`` accepts (e.g. ``datetime.date``).
        target_date_max: Upper bound of the wanted range. Only used as the
            starting point of the loop condition.
        output_filename: CSV path. Created with a header row if it does not
            exist, otherwise appended to without a header.
        city_name: Value stored in the ``accessed_cityname`` column.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Normalise both bounds to Timestamps so the loop condition always
    # compares like with like. After the first pass the oldest date is a
    # pandas Timestamp, and ordering a Timestamp against a plain
    # datetime.date raises TypeError on pandas >= 2.0.
    lower_bound = pd.Timestamp(target_date_min)
    least_recent_search_date = pd.Timestamp(target_date_max)

    tweetcount = 0

    # the big loop
    while least_recent_search_date > lower_bound:

        print("running twint op max date {} 00:00am".format(c.Until))

        twint.run.Search(c)                     # go twint go

        df = twint.storage.panda.Tweets_df      # pandas dataframe for quick organization

        # NOTE(review): Tweets_df appears to be None until twint has stored
        # something, so guard against that as well as an empty frame.
        if df is None or len(df.index) < 1:
            print("WARNING: no data returned, breaking out of twint search loops")
            break

        # type cast the whole column to make date manipulation easier
        # (assumes the parsed dates are timezone-naive -- TODO confirm)
        df['date'] = pd.to_datetime(df['date'])

        least_recent_search_date = df['date'].min()
        most_recent_search_date = df['date'].max()

        tweetcount += len(df.index)             # running total

        df["accessed_cityname"] = city_name

        # write the dataframe to a csv file by chunk
        if os.path.exists(output_filename):
            print("appending to file {}".format(output_filename))
            df.to_csv(output_filename, mode='a', header=False)
        else:
            print("creating file {}".format(output_filename))
            df.to_csv(output_filename, header=True)

        print("date span collected: {} to {}".format( least_recent_search_date, most_recent_search_date ))
        print("total tweet count: {}".format(tweetcount))

        # work out where the next chunk should end
        if least_recent_search_date.date() >= most_recent_search_date.date():
            # The whole chunk fell on one day, so the limit was hit before
            # the day was finished; move on rather than loop on that day.
            print("WARNING: more than {} records on day {}".format(c.Limit, least_recent_search_date.date()))
            print("  skipping remaining records on this date")
            next_until = least_recent_search_date.date().isoformat()
        else:
            # Restart one day after the oldest day seen so that day is
            # collected in full (this is what produces the overlap).
            next_until = (least_recent_search_date + datetime.timedelta(days=1)).date().isoformat()

        # If the window did not move, the next pass would run the identical
        # query and append the same rows again, forever.
        if next_until == c.Until:
            print("WARNING: search window did not move past {}, stopping to avoid repeating the same query".format(next_until))
            break

        c.Until = next_until

        time.sleep(3) # let's pause three seconds to avoid overloading Twitter

    print("done")
