In [15]:
import datetime
import json
import os
import re

import numpy as np
import pandas as pd
import tweepy
from tweepy import OAuthHandler

# Set up Tweepy with my Twitter API info

In [2]:
# Read the raw text of the credentials file; it is parsed in a later cell.
with open('api_keys.json') as key_file:
    keys = key_file.read()

In [3]:
# Parse the raw JSON text into a dict of credentials.
# Guarded so that re-running this cell after `keys` has already been parsed is a
# no-op instead of a TypeError from calling json.loads on a dict.
if isinstance(keys, str):
    keys = json.loads(keys)

In [4]:
# Pull the four OAuth credentials out of the parsed key file in one step.
# A missing entry raises KeyError, exactly as individual lookups would.
consumer_key, consumer_secret, access_token, access_token_secret = (
    keys[field]
    for field in ('consumer_key', 'consumer_secret', 'access_token', 'access_token_secret')
)

In [5]:
# Build the OAuth handler from the consumer credentials, attach the account's
# access token, and create the API client that every later request goes through.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Input Twitter handles

Read, format, and sort Twitter handles that Dave generated

In [35]:
# The file has a .csv extension but is tab-separated.
users = pd.read_csv('TwitterHandleOutput.csv', sep='\t')

In [36]:
def _extract_handles(raw):
    """Return the distinct Twitter handles found in `raw`, in order of first appearance.

    Non-string cells (e.g. NaN for a missing value) yield an empty list instead
    of raising inside `re.findall`. Order is preserved (dict keys rather than a
    set) so that "the first handle" is the same on every run.
    """
    if not isinstance(raw, str):
        return []
    # The dot is escaped so that only a literal "twitter.com/" matches.
    return list(dict.fromkeys(re.findall(r"twitter\.com/[@]?([^\s'?]+)'", raw)))


# If this cell has already run, TWITTERHANDLE holds the extracted handle and the
# original text lives in URLS; read from there so a re-run reproduces the same
# result instead of filtering every row out.
_handle_source = users['URLS'] if 'TWITTERHANDLES' in users.columns else users['TWITTERHANDLE']

users = (
    users
    .assign(
        # Keep the untouched source text alongside the derived columns.
        URLS=_handle_source,
        # Every distinct handle found in the row.
        TWITTERHANDLES=_handle_source.map(_extract_handles),
        # The first handle found, or None when the row has none.
        TWITTERHANDLE=lambda df: df['TWITTERHANDLES'].map(lambda found: found[0] if found else None))
    # Drop rows for which no handle could be extracted.
    .pipe(lambda df: df[df['TWITTERHANDLE'].notna()])
)
users['TWITTERHANDLE']


0     GRERochesterBiz
4              myCMMC
8               mdibl
12         BangorYMCA
13            MDIYMCA
14            MDIYMCA
18       StMarysMaine
20          waynflete
Name: TWITTERHANDLE, dtype: object

In [41]:
# Collapse repeated handles (the same account can appear on several rows).
unique = {handle for handle in users["TWITTERHANDLE"].values}

In [42]:
# Turn the handles into an alphabetically sorted list so they can be sliced into batches.
unique = sorted(unique)

In [43]:
# Show how many distinct handles there are.
len(unique)

7

# Create a loop to repeat stats across several users

Start with empty DataFrames - individual users will be added to the 'new_set'

In [44]:
# `years` holds only the per-year indicator columns; `new_set` is the accumulator
# table: one row per tweet, with the same year columns after the identifying fields.
years = pd.DataFrame(columns=['2012', '2013', '2014', '2015', '2016'])
new_set = pd.DataFrame(columns=['user', 'text', 'year', *years.columns])

Set up different lists for when I want to collect tweets for a specific user, group of specific users, or subset

In [45]:
# A single account, for one-off runs.
user = ['AABGU']

# A hand-picked set of accounts.
group = [
    'Chicago100Club',
    '1bsa',
    '2harvest',
    '402Arts',
    '40DaysDallas',
    '100plusrescue',
    '100Reporters',
    '211OrangeCounty',
    '4CforChildren',
]

# A ten-handle batch taken from the sorted handle list.
# NOTE(review): the notebook shows len(unique) == 7, for which this slice is
# empty -- confirm the intended range.
sample = unique[31:41]

Set up an empty DataFrame for each user the loop will be run on

In [46]:
# One empty placeholder frame per handle in the chosen batch; the collection
# loop replaces each placeholder with that user's tweets.
d = {name: pd.DataFrame() for name in sample}

#### Loop through all users in the list you chose ('user', 'group', or 'sample') - each new set of tweets from a specific user will be added to what's already been collected

A few notes about it:  
1 - It's hard to capture organizations that tweet a lot because Twitter limits how far back a user's timeline can be collected (roughly the 3,200 most recent tweets), so results for high-volume organizations may be incomplete or misleading.  
2 - Twitter also limits how many you can collect at once. The loop is set up to keep running so that the rate limit doesn't interrupt the loop - it just pauses it.  
3 - The queries are case-sensitive, and it's currently set up to only query "#GivingTuesday" (not "#givingtuesday"). That needs to be fixed.  
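4 - Even though the query only asks for tweets from 1/1/14 onward, it still collects tweets from 2012 and 2013 - most likely because `since`/`until` are search parameters that the user-timeline endpoint ignores, so the date window would have to be applied after collection.  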

In [49]:
# For every handle in `d`: download the user's timeline, keep the tweets that
# mention #GivingTuesday, flag the year of each tweet, store the per-user frame
# back into `d`, and add its rows to the running `new_set` table.
for name in list(d):
    # NOTE(review): `since`/`until` are search-API parameters rather than
    # documented user_timeline ones, and the 2012/2013 rows in the collected
    # output suggest they are ignored. They are left as-is here; filter on
    # `year` below if the date window matters -- confirm.
    search = [status._json for status in tweepy.Cursor(api.user_timeline,
                                                       id=name,
                                                       since='2014-01-01',
                                                       until='2016-12-31',
                                                       wait_on_rate_limit=True).items()]
    timeline = pd.DataFrame(search)

    # An account with no retrievable tweets gives a frame with no columns at
    # all, so there is no `text` column to search.
    if 'text' not in timeline.columns:
        d[name] = timeline
        continue

    # case=False also catches "#givingtuesday" and other capitalisations;
    # na=False treats a missing text value as "no match".
    is_tagged = timeline['text'].str.contains('#GivingTuesday', case=False, na=False)
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of `timeline`.
    tagged = timeline.loc[is_tagged].copy()
    d[name] = tagged

    if tagged.empty:
        continue

    tagged['date'] = pd.to_datetime(tagged['created_at'])
    tagged['year'] = tagged['date'].dt.year
    tagged['user'] = name

    # Keep only the identifying columns, then add one 0/1 indicator per year.
    per_user = tagged[['user', 'text', 'year']].copy()
    for i in range(2012, 2017):
        per_user[str(i)] = np.where(per_user['year'] == i, 1, 0)
    d[name] = per_user

    # Accumulate after each user (rather than once at the end) so that rows
    # already collected survive if a later request fails. pd.concat replaces
    # DataFrame.append, which was removed in pandas 2.0.
    new_set = pd.concat([new_set, per_user], ignore_index=True)

In [19]:
# Display the accumulated table (the cell's last expression is rendered as its output).
new_set

Unnamed: 0,user,text,year,2012,2013,2014,2015,2016
0,AlbanyLaw,#GivingTuesday http://t.co/G77P0xXoZn http://t...,2014,0,0,1,0,0
1,AlbanyLaw,Embrace the spirit of giving this holiday seas...,2013,0,1,0,0,0
2,AlbanyMed,RT @GivingTues: We're inspired by all of the t...,2016,0,0,0,0,1
3,AlbanyMed,A BIG thanks those who chose to give to @Alban...,2016,0,0,0,0,1
4,AlbanyMed,Always wanted to give back to your community? ...,2016,0,0,0,0,1
5,AlbanyMed,It's #GivingTuesday. Please consider making a ...,2016,0,0,0,0,1
6,AlbanyMed,"RT @995TheRiver: For #GivingTuesday, why not b...",2016,0,0,0,0,1
7,AlbanyMed,RT @GivingTues: The big day is almost here! Jo...,2016,0,0,0,0,1
8,AlbanyMed,Tomorrow is #GivingTuesday! Instead of buying ...,2016,0,0,0,0,1
9,AlbanyMed,RT @GivingTues: November 29th is one week away...,2016,0,0,0,0,1


This isn't necessary, but I would periodically save the growing DataFrame to a new DataFrame ('big_data') just to make sure I have a "master" copy backed up

In [20]:
# Take a real snapshot of the collected rows. A plain `big_data = new_set` only
# creates a second name for the same object, so any in-place change to `new_set`
# would also change the "backup".
big_data = new_set.copy()

In [21]:
# Persist the snapshot to disk: column names are written as the header row
# (the default), and the row index is left out.
big_data.to_csv('DataDive.csv', index=False, encoding='utf-8')

In [22]:
# Display the snapshot that was just written to DataDive.csv.
big_data

Unnamed: 0,user,text,year,2012,2013,2014,2015,2016
0,AlbanyLaw,#GivingTuesday http://t.co/G77P0xXoZn http://t...,2014,0,0,1,0,0
1,AlbanyLaw,Embrace the spirit of giving this holiday seas...,2013,0,1,0,0,0
2,AlbanyMed,RT @GivingTues: We're inspired by all of the t...,2016,0,0,0,0,1
3,AlbanyMed,A BIG thanks those who chose to give to @Alban...,2016,0,0,0,0,1
4,AlbanyMed,Always wanted to give back to your community? ...,2016,0,0,0,0,1
5,AlbanyMed,It's #GivingTuesday. Please consider making a ...,2016,0,0,0,0,1
6,AlbanyMed,"RT @995TheRiver: For #GivingTuesday, why not b...",2016,0,0,0,0,1
7,AlbanyMed,RT @GivingTues: The big day is almost here! Jo...,2016,0,0,0,0,1
8,AlbanyMed,Tomorrow is #GivingTuesday! Instead of buying ...,2016,0,0,0,0,1
9,AlbanyMed,RT @GivingTues: November 29th is one week away...,2016,0,0,0,0,1
