### text_processing_all_PG-annotations.ipynb

In [1]:
from datetime import datetime
from dateutil import parser

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import itertools
import math

import re
import nltk
import os
from collections import Counter

In [2]:
## Get list of stopwords from NLTK package
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pedrogalarza/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Functions

### Cleaning and variable creation

In [3]:
def clean_tweets(df, startdate = 20200811, no_stops = True):

    ## Define dataframe and filter by date
    df = df[['id', 'date', 'tweet']]
    df = df[df.index > parser.parse(str(startdate))]
    
    ## Clean tweets
    clean_text = []

    for tweet in df.tweet:
        # Remove URLS
        tweet = re.sub("(\w+:\/\/\S+)", " ", tweet)
        # Replace non-characters with spaces
        tweet = re.sub("\W", " ", tweet)
        # Remove digits
        tweet = re.sub("[0-9]", "", tweet)
        # Remove extra spaces
        tweet = re.sub("\s+", " ", tweet)
        tweet = re.sub("^\s+", "", tweet)
        tweet = re.sub("\s+$", "", tweet)
            
        clean_text.append(tweet.lower())

    ## Add updated text to main data
    df['clean_text'] = clean_text
    df['term_list'] = df['clean_text'].apply(lambda x: x.split())
    
    ## Remove stop words from tweets
    if no_stops == True:
        stop_words = set(nltk.corpus.stopwords.words('english'))
        
        for term_list in df['term_list']:
            for term in term_list:
                if term in stop_words:
                    term_list.remove(term)
    
    return df

In [4]:
def computeTF(x):
    '''
    Purpose:
    -------
    Generates a counter object (dictionary like) associating a term present in a document with its normalized frequncy. Helper fucntion for the TF-IDF calculation.
    
    Inputs:
    ------
        x -           2-dimentional array like object containing the list of all terms in the 0-index and a collection object containing the term counts in the 1-index.

    '''
    wordFreq = x[1]
    wordList = x[0]
    bagOfWordsCount = len(wordList)
    
    tfDict = {}
    for word, count in wordFreq.items():
        tfDict[word] = count / float(bagOfWordsCount)
        
    return Counter(tfDict)

In [1]:
def term_counter(txt_list):
    '''
    Purpose:
    -------
    Generates and returns a counter object (dictionary-like) of the counts of each unique term from a list of terms.  The keys of the dictionary 
    are the unique terms and the values are the counts of that term.  Helper fucntion for the TF-IDF calculation.
    
    Inputs:
    ------
        x -           list of string values.

    '''
    
    
    
    term_counts = {}
    for i in set(txt_list):
        term_counts[i] = txt_list.count(i)
        
    return(Counter(term_counts))

In [3]:
def return_term_count_list(df, term_count_column = "term_count"):
    
    '''
    Purpose:
    -------
    Generates and returns a list of all frequncy count counter objects (dictionary like) from a data frame column containing frequency count objects. Helper fucntion for the TF-IDF calculation.
    
    Inputs:
    ------
        df -                    dataframe containing column with frequency counts as counter objects as entries.
        
        term_count_column -     name of columns containing frequency counts.
    '''
   
    term_count_list = []
    for i in df[term_count_column]:
        term_count_list.append(dict(i))
        
    return(term_count_list)

In [5]:
def return_corpus_set(df, term_list_column = "term_list"):

    '''
    Purpose:
    -------
    Generates and returns a set of all unique terms found across the union of all documnets (the corpus). Helper fucntion for the TF-IDF calculation.
    
    Inputs:
    ------
        df -                    dataframe containing column with frequency counts as counter objects as entries.
        
        term_count_column -     name of column containing frequency counts.
    '''
    
    tweet_corpus = []
    for i in df[term_list_column]:
        tweet_corpus += i

    return(set(tweet_corpus))

In [5]:
def computeIDF(documents, corpus_set):
    
    '''
    Purpose:
    -------
    Generates the inverse document frequency score (IDF) for all terms in the corpus. Helper fucntion for the TF-IDF calculation.
    
    Inputs:
    ------
        documents -      list of counter objects containing the frequency counts for each document.
        
        corpus_set -     set of all unique terms found across the union of all documnets (the corpus).
    '''
    
    N = len(documents)

    idfDict = dict.fromkeys(corpus_set, 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1

    for word, val in idfDict.items():
        idfDict[word] = math.log(N / (float(val)))
        
    return(Counter(idfDict))

In [6]:
def computeTFIDF(TF,idfs):
    
    '''
    Purpose:
    -------
    Generates the term frequency/inverse document frequency (TF-IDF score) for each unique term in a document.
    
    Inputs:
    ------
        TF -      Counter object containing the term frequncy values for each document
        
        idfs -    Counter object containing the inverse document frequency score (IDF) for all temrs unqiue in the corpus.
    '''
    
    tfidf = {}
    for word, val in TF.items():
        tfidf[word] = val * idfs[word]
        
    return(Counter(tfidf))

In [7]:
def TFIDF_processing(df):
    
    '''
    Purpose:
    -------
    Uses the associated helper functions to calculate the TF-IDF scores for a dataframe of documents.
    Returns a 3-dimentional array containing:
        - [0] a data frame with the TF-IDF scores, TF scores, term-list, processed document, and term cou,t.
        - [1] the idf scores for all terms in the corpus
        - [2] the corpus of the document
    
    Inputs:
    ------
        df -      dataframe of processed documents,
    '''
    
    df = clean_tweets(df)

    # Create dictionary of term counts for each tweet
    df['term_count'] = df['term_list'].apply(lambda x: term_counter(x))
    
    # Calculate term frequency (TF)
    df['TF'] = df[['term_list', 'term_count']].apply(computeTF, axis = 1)
    
    # Return corpus and doc list
    term_count_list_df = return_term_count_list(df)
    corpus_set_df = return_corpus_set(df)
    
    # Calculate IDF and  TF-IDF
    df_idf = computeIDF(term_count_list_df, corpus_set_df)
    df['TFIDF'] = df['TF'].apply(lambda x: computeTFIDF(x, df_idf))
    
    return(df, df_idf, corpus_set_df)

### TFIDF organization

In [8]:
def sum_normalize_collections(counter_list):
    '''
    Purpose:
    -------
    Generates an aggregate tfidf score counter object from a list of counter objects.  Takes the sum of all TF_IDF scores with respect to term objects
    and then divides by the number of dictionaries in the list.
    
    Inputs:
    ------
        counter_list -     list of counter objects that contain TF-IDF scores.
    '''
    
    
    counter_sum = Counter()
    for counter in counter_list:
        counter_sum += counter
        
    for k,v in counter_sum.items():
        counter_sum[k] = v/len(counter_list)
        
    return(counter_sum.most_common())

In [10]:
def TFIDF_vecorization(df,corpus):
    '''
    Purpose:
    -------
    Vecotrizes the tf-idf scores.  Not advised.
    
    Inputs:
    ------
        df -      dataframe with tf-idf scores
        corpus -  set of unique words across all documents.
    '''
    
    
    D = np.zeros((len(df), len(corpus)))

    for i,tfidf in enumerate(df["TFIDF"]):
        for term in tfidf:
            ind = (list(corpus)).index(term)
            D[i][ind] = tfidf[term]
            
    return(D)

---

# Data

## Twitter Data

In [152]:
## Import Biden data
biden_df = pd.read_csv('../../data/biden.csv', index_col = 'date', parse_dates = True)
biden_df.index.rename("datetime", inplace = True)

## Date variables
biden_df['date'] = biden_df.index.date
biden_df['month'] = biden_df.index.month

biden_df.head()

Unnamed: 0_level_0,Unnamed: 0,id,conversation_id,created_at,timezone,place,tweet,language,hashtags,cashtags,...,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,date,month
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-11-09 14:40:00,0,1325885871875190784,1325885871875190784,1604951000000.0,-500,,The bottom line: I will spare no effort to tur...,en,[],[],...,,,,[],,,,,2020-11-09,11
2020-11-09 14:17:00,1,1325880083618426881,1325880083618426881,1604949000000.0,-500,,The challenge before us right now is still imm...,en,[],[],...,,,,[],,,,,2020-11-09,11
2020-11-09 13:50:00,2,1325873288711712769,1325873288711712769,1604948000000.0,-500,,My COVID-19 Transition Advisory Board will adv...,en,[],[],...,,,,[],,,,,2020-11-09,11
2020-11-09 13:37:00,3,1325870017401905152,1325870017401905152,1604947000000.0,-500,,"Today, I have named a COVID-19 Transition Advi...",en,[],[],...,,,,[],,,,,2020-11-09,11
2020-11-09 11:46:50,4,1325842292444291072,1325842292444291072,1604940000000.0,-500,,I spent the morning with the co-chairs of my C...,en,[],[],...,,,,[],,,,,2020-11-09,11


In [153]:
## Import Biden data
trump_df = pd.read_csv('../../data/trump.csv', index_col = 'date', parse_dates = True)
trump_df.index.rename("datetime", inplace = True)

## Date variables
trump_df['date'] = trump_df.index.date
trump_df['month'] = trump_df.index.month

## Match variable names to Biden
trump_df.rename(columns = {'text' : 'tweet'}, inplace = True)

trump_df.head()

Unnamed: 0_level_0,id,tweet,isRetweet,isDeleted,device,favorites,retweets,date,month
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2011-08-02 18:07:48,98454970654916608,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02,8
2020-03-03 01:34:50,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03,3
2020-01-17 03:22:47,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17,1
2020-09-12 20:10:58,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12,9
2020-01-17 13:13:59,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17,1


### Process Twitter data

In [154]:
## Calculate TFIDFs for Biden
biden_df, biden_idf,biden_corpus = TFIDF_processing(biden_df)
biden_df.head()

Unnamed: 0_level_0,id,date,tweet,clean_text,term_list,term_count,TF,TFIDF
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-11-09 14:40:00,1325885871875190784,2020-11-09,The bottom line: I will spare no effort to tur...,the bottom line i will spare no effort to turn...,"[bottom, line, will, spare, effort, turn, pand...","{'effort': 1, 'line': 1, 'around': 1, 'will': ...","{'effort': 0.125, 'line': 0.125, 'around': 0.1...","{'effort': 0.7191707468556287, 'line': 0.56257..."
2020-11-09 14:17:00,1325880083618426881,2020-11-09,The challenge before us right now is still imm...,the challenge before us right now is still imm...,"[challenge, us, right, still, immense, growing...","{'challenge': 1, 'is': 1, 'the': 1, 'action': ...","{'challenge': 0.07142857142857142, 'is': 0.071...","{'challenge': 0.37098215620782904, 'is': 0.186..."
2020-11-09 13:50:00,1325873288711712769,2020-11-09,My COVID-19 Transition Advisory Board will adv...,my covid transition advisory board will advise...,"[covid, transition, advisory, board, advise, d...","{'a': 1, 'empathy': 1, 'compassion': 1, 'trans...","{'a': 0.05, 'empathy': 0.05, 'compassion': 0.0...","{'a': 0.08845111539186284, 'empathy': 0.302052..."
2020-11-09 13:37:00,1325870017401905152,2020-11-09,"Today, I have named a COVID-19 Transition Advi...",today i have named a covid transition advisory...,"[today, have, named, covid, transition, adviso...","{'public': 1, 'a': 1, 'comprised': 1, 'transit...","{'public': 0.03333333333333333, 'a': 0.0333333...","{'public': 0.1365045966080499, 'a': 0.05896741..."
2020-11-09 11:46:50,1325842292444291072,2020-11-09,I spent the morning with the co-chairs of my C...,i spent the morning with the co chairs of my c...,"[spent, morning, co, chairs, my, covid, counci...","{'status': 1, 'chairs': 1, 'beat': 1, 're': 1,...","{'status': 0.041666666666666664, 'chairs': 0.0...","{'status': 0.2686047148085406, 'chairs': 0.268..."


In [160]:
## Calculate TFIDF scores by date for Biden
biden_by_date = biden_df.groupby("date")["TFIDF"].apply(lambda x:x.to_list())

score_df_biden = pd.DataFrame(biden_df.groupby("date")["TFIDF"].apply(lambda x: sum_normalize_collections(x.to_list())))

score_df_biden.head()

date
2020-08-11    [{'track': 0.2859194361564614, 'on': 0.1992760...
2020-08-12    [{'ask': 0.2418537621485437, 'mate': 0.2596875...
2020-08-13    [{'days': 3.1323271507324484}, {'for': 0.21567...
2020-08-14    [{'america': 0.5778366496656546, 'kamalaharris...
2020-08-15    [{'challenge': 0.3246093866818504, 'president'...
Name: TFIDF, dtype: object

In [155]:
## Calculate TFIDFs for Trump
trump_df, trump_idf, trump_corpus = TFIDF_processing(trump_df)
trump_df.head()

Unnamed: 0_level_0,id,date,tweet,clean_text,term_list,term_count,TF,TFIDF
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-09-12 20:10:58,1304875170860015617,2020-09-12,The Unsolicited Mail In Ballot Scam is a major...,the unsolicited mail in ballot scam is a major...,"[unsolicited, mail, ballot, scam, major, threa...","{'a': 1, 'ballot': 1, 'fraud': 1, 'smaller': 1...","{'a': 0.02857142857142857, 'ballot': 0.0285714...","{'a': 0.06206252854785413, 'ballot': 0.1258794..."
2020-10-23 22:04:14,1319761576996573186,2020-10-23,THANK YOU to all of the Great American Patriot...,thank you to all of the great american patriot...,"[thank, to, of, great, american, patriots, the...","{'great': 1, 'patriots': 1, 'american': 1, 'th...","{'great': 0.1, 'patriots': 0.1, 'american': 0....","{'great': 0.24709204078132607, 'patriots': 0.5..."
2020-10-12 22:22:39,1315779944002199552,2020-10-12,"“I’m running as a proud Democrat, for the Sena...",i m running as a proud democrat for the senate...,"[m, running, a, proud, democrat, the, senate, ...","{'a': 1, 's': 1, 'm': 1, 'sleepy': 1, 'china':...","{'a': 0.045454545454545456, 's': 0.04545454545...","{'a': 0.09873584087158611, 's': 0.153130090955..."
2020-10-23 19:50:48,1319727996882702336,2020-10-23,https://t.co/LCQcdlRkhz,,[],{},{},{}
2020-10-23 19:49:55,1319727773234069505,2020-10-23,https://t.co/4V7nu5hh8V,,[],{},{},{}


In [161]:
## Calculate TFIDF scores by date for Trump
trump_by_date = trump_df.groupby("date")["TFIDF"].apply(lambda x:x.to_list())

score_df_trump = pd.DataFrame(trump_df.groupby("date")["TFIDF"].apply(lambda x: sum_normalize_collections(x.to_list())))

score_df_trump.head()

date
2020-08-11    [{'p': 1.1284985763305553, 'enjoy': 1.15883879...
2020-08-12    [{'neworleansrta': 0.36678828940052893, 'usdot...
2020-08-13    [{}, {'realdonaldtrump': 1.1366423080231316, '...
2020-08-14    [{'buds': 0.4097886214584272, 'dimensional': 0...
2020-08-15    [{}, {'for': 0.3350592538544098, 'moments': 0....
Name: TFIDF, dtype: object

score_df_joe.to_csv('../../data/biden_tweet_scores.csv')
score_df_donald.to_csv('../../data/trump_tweet_scores.csv')

## Polling Data

In [165]:
# Read in data, select relevant cols
ge = pd.read_csv('../data/president_polls.csv')
ge = ge[['poll_id', 'fte_grade', 'sample_size', 'start_date', 'answer', 'pct']]

FileNotFoundError: [Errno 2] File ../data/president_polls.csv does not exist: '../data/president_polls.csv'

In [None]:
# Only use good polls (according to 538)
ge = ge[ge['fte_grade'].isin(['A+', 'A', 'A-', 'A/B', 'B+', 'B'])]
ge.drop('fte_grade', axis=1, inplace=True)

# Set index to date
ge['start_date'] = pd.to_datetime(ge['start_date'])
ge.set_index('start_date', inplace=True)
ge.sort_index(inplace=True)

In [None]:
# Check to see the latest date with primary candidates still.
ge[ge['answer'] == 'Sanders']

In [None]:
# Drop all primary dates, only need Biden and Trump
ge = ge[ge.index > '2020-04-06']
ge = ge[ge.answer.isin(['Biden', 'Trump'])]

In [None]:
# Use a pivot table to get Biden and Trump pct in the same row
runoff = ge.pivot_table('pct', ['start_date', 'poll_id', 'sample_size'], 'answer').reset_index().set_index('start_date')
runoff

In [None]:

# Weight percentages by sample sizes
runoff['total_biden'] = runoff['Biden'] * runoff['sample_size']
runoff['total_trump'] = runoff['Trump'] * runoff['sample_size']
overall_runoff = runoff.groupby('start_date')[['sample_size', 'total_biden', 'total_trump']].sum().sort_index()
overall_runoff['w_biden'] = overall_runoff['total_biden'] / overall_runoff['sample_size']
overall_runoff['w_trump'] = overall_runoff['total_trump'] / overall_runoff['sample_size']