In [1]:
from datetime import datetime
from dateutil import parser

from nltk.corpus import stopwords
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import math
from collections import Counter
import numpy as np


import re

### Import Data

In [18]:
joe_df = pd.read_csv('../../data/joe.csv', parse_dates=True)#,index_col='date')
joe_df['date_time']= pd.to_datetime(joe_df['date'])
joe_df['date']= joe_df['date_time'].dt.date
joe_df.head()
donald_df = pd.read_csv('../../data/donald.csv', parse_dates=True)
donald_df['date_time']= pd.to_datetime(donald_df['date'])
donald_df['date']= donald_df['date_time'].dt.date
donald_df.rename(columns={'text': 'tweet'}, inplace=True)
donald_df.head()

Unnamed: 0,id,tweet,isRetweet,isDeleted,device,favorites,retweets,date,date_time
0,98454970654916608,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02,2011-08-02 18:07:48
1,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03,2020-03-03 01:34:50
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17,2020-01-17 03:22:47
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12,2020-09-12 20:10:58
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17,2020-01-17 13:13:59


### Define Preprocessing Function

In [4]:
#remove urls
def remove_url(txt):
    return " ".join(re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split())

#filter stop words
stop_words = set(stopwords.words('english'))

def remove_stop_words(txt_list):
    filtered_txt_list = []
    for word in txt_list:
        if word not in stop_words:
            filtered_txt_list.append(word)
    return(filtered_txt_list)

#compute TF
def computeTF(x):
    wordList= x[0]
    wordFreq =x[1]
    tfDict = {}
    bagOfWordsCount = len(wordList)
    for word, count in wordFreq.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return Counter(tfDict)


#computer term_counter
def term_counter(txt_list):
    term_counts = {}
    for i in set(txt_list):
        term_counts[i] = txt_list.count(i)
    return(Counter(term_counts))

#return list of term counts for each tweet
def return_term_count_list(df, term_count_column = "term_count"):
    term_count_list = []
    for i in df[term_count_column]:
        term_count_list.append(dict(i))
    return(term_count_list)


#return copurs set for all tweets
def return_corpus_set(df, term_list_column = "term_list"):    
    tweet_corpus = []
    for i in df[term_list_column]:
        tweet_corpus += i
    corpus_set = set(tweet_corpus)
    return(corpus_set)

#computer IDF
def computeIDF(documents,corpus_set):
    N = len(documents)

    idfDict = dict.fromkeys(corpus_set, 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1


    for word, val in idfDict.items():
        idfDict[word] = math.log(N / (float(val))) #smoothing?
    return(Counter(idfDict))

#computes TFIDF
def computeTFIDF(TF,idfs):
    tfidf = {}
    for word, val in TF.items():
        tfidf[word] = val * idfs[word]
    return(Counter(tfidf))

In [6]:
def TFIDF_processing(df, startdate = "August 11, 2020"):
    df = df[["id","date_time","date","tweet"]]

    
    #filter data frame:
    startdate = parser.parse(str(startdate))
    df = df[df["date_time"] > startdate]
    df['term_list'] = df['tweet'].apply(lambda x: remove_url(x))
    df['term_list'] = df['term_list'].apply(lambda x: x.lower().split())
    df['term_list'] = df['term_list'].apply(lambda x: remove_stop_words(x))
    
    #calc TFs
    df['term_count'] = df['term_list'].apply(lambda x: term_counter(x))
    df["TF"] = df[['term_list','term_count']].apply(computeTF,axis=1)
    
    #return corpus and doc list
    term_count_list_df = return_term_count_list(df)
    corpus_set_df = return_corpus_set(df)
    
    #calc IDF and  TF-IDF
    df_idf = computeIDF(term_count_list_df,corpus_set_df)
    df["TFIDF"] = df["TF"].apply(lambda x: computeTFIDF(x, df_idf))
    
    return(df, df_idf, corpus_set_df)

In [19]:
joe_df, joe_idf,joe_corpus = TFIDF_processing(joe_df)
joe_df
donald_df, donald_idf, donald_corpus = TFIDF_processing(donald_df)
donald_df

Unnamed: 0,id,date_time,date,tweet,term_list,term_count,TF,TFIDF
3,1304875170860015617,2020-09-12 20:10:58,2020-09-12,The Unsolicited Mail In Ballot Scam is a major...,"[unsolicited, mail, ballot, scam, major, threa...","{'elections': 1, 'threat': 1, 'recent': 1, 'ba...","{'elections': 0.03125, 'threat': 0.03125, 'rec...","{'elections': 0.1961744655494869, 'threat': 0...."
5,1319761576996573186,2020-10-23 22:04:14,2020-10-23,THANK YOU to all of the Great American Patriot...,"[thank, great, american, patriots, villages, f...","{'american': 1, 'maga': 1, 'great': 1, 'villag...","{'american': 0.14285714285714285, 'maga': 0.14...","{'american': 0.505248981951197, 'maga': 0.4934..."
7,1315779944002199552,2020-10-12 22:22:39,2020-10-12,"“I’m running as a proud Democrat, for the Sena...","[im, running, proud, democrat, senate, sleepy,...","{'county': 1, 'us': 1, 'sleepy': 1, 'proud': 1...","{'county': 0.0625, 'us': 0.0625, 'sleepy': 0.0...","{'county': 0.354465443375829, 'us': 0.22379412..."
9,1319727996882702336,2020-10-23 19:50:48,2020-10-23,https://t.co/LCQcdlRkhz,[],{},{},{}
10,1319727773234069505,2020-10-23 19:49:55,2020-10-23,https://t.co/4V7nu5hh8V,[],{},{},{}
...,...,...,...,...,...,...,...,...
54437,1319484210101379072,2020-10-23 03:42:05,2020-10-23,RT @EliseStefanik: President @realDonaldTrump ...,"[rt, elisestefanik, president, realdonaldtrump...","{'dc': 1, 'excels': 1, 'b': 1, 'american': 1, ...","{'dc': 0.07142857142857142, 'excels': 0.071428...","{'dc': 0.4119112338441135, 'excels': 0.5763815..."
54438,1319444420861829121,2020-10-23 01:03:58,2020-10-23,RT @TeamTrump: LIVE: Presidential Debate #Deba...,"[rt, teamtrump, live, presidential, debate, de...","{'presidential': 1, 'vote': 1, 'teamtrump': 1,...","{'presidential': 0.125, 'vote': 0.125, 'teamtr...","{'presidential': 0.6014057285987692, 'vote': 0..."
54439,1319384118849949702,2020-10-22 21:04:21,2020-10-22,Just signed an order to support the workers of...,"[signed, order, support, workers, delphi, corp...","{'workers': 5, 'american': 3, 'failed': 2, 'de...","{'workers': 0.20833333333333334, 'american': 0...","{'workers': 1.18155147791943, 'american': 0.44..."
54440,1319345719829008387,2020-10-22 18:31:46,2020-10-22,Suburban women want Safety &amp; Security. Joe...,"[suburban, women, want, safety, amp, security,...","{'cannot': 1, 'suburban': 1, 'security': 1, 'g...","{'cannot': 0.09090909090909091, 'suburban': 0....","{'cannot': 0.5003993644863727, 'suburban': 0.5..."


In [8]:
def sum_normalize_collections(counter_list):
    counter_sum = Counter()
    for counter in counter_list:
        counter_sum += counter
    for k,v in counter_sum.items():
        counter_sum[k] = v/len(counter_list)
    return(counter_sum.most_common())

In [21]:
group_by_series_joe = joe_df.groupby("date")["TFIDF"].apply(lambda x:x.to_list())
group_by_series_joe
group_by_series_donald = donald_df.groupby("date")["TFIDF"].apply(lambda x:x.to_list())
group_by_series_donald

date
2020-08-11    [{'enjoy': 0.9404093073650324, 'pm': 0.9336289...
2020-08-12    [{'louisiana': 0.2659255103344588, 'usdot': 0....
2020-08-13    [{}, {'rt': 0.39743140401888216, 'realdonaldtr...
2020-08-14    [{'trump': 0.13446489603613496, 'plays': 0.819...
2020-08-15    [{}, {'viral': 0.672445197234303, 'saturday': ...
                                    ...                        
2020-10-20    [{'800': 1.4974752062829497, 'enjoy': 1.175511...
2020-10-21    [{'watch': 0.8198100906519028, 'rt': 0.1589725...
2020-10-22    [{'thundering': 0.3362225986171515, 'deliverin...
2020-10-23    [{'american': 0.505248981951197, 'maga': 0.493...
2020-10-24    [{'people': 0.2410993861864838, 'showing': 0.4...
Name: TFIDF, Length: 75, dtype: object

In [22]:
score_df_joe = pd.DataFrame(joe_df.groupby("date")["TFIDF"].apply(lambda x: sum_normalize_collections(x.to_list())))
score_df_joe
score_df_donald = pd.DataFrame(donald_df.groupby("date")["TFIDF"].apply(lambda x: sum_normalize_collections(x.to_list())))
score_df_donald

Unnamed: 0_level_0,TFIDF
date,Unnamed: 1_level_1
2020-08-11,"[(pm, 0.142637763435123), (conference, 0.12554..."
2020-08-12,"[(bus, 0.20821696372701812), (usdot, 0.1770292..."
2020-08-13,"[(realdonaldtrump, 0.123663901762581), (champi..."
2020-08-14,"[(thank, 0.06817100049854213), (rt, 0.05917441..."
2020-08-15,"[(lizrnc, 0.16370813934801895), (terrible, 0.1..."
...,...
2020-10-20,"[(underwaylets, 0.2017335591702909), (800, 0.1..."
2020-10-21,"[(eric, 0.07069350109891419), (pennsylvania, 0..."
2020-10-22,"[(depression, 0.24144549606090696), (virginia,..."
2020-10-23,"[(erictrump, 0.16552355139287045), (joebiden, ..."


In [11]:
# Document Vectorization
def TFIDF_vecorization(df,corpus):
    D = np.zeros((len(df), len(corpus)))

    for i,tfidf in enumerate(df["TFIDF"]):
        for term in tfidf:
            ind = (list(corpus)).index(term)
            D[i][ind] = tfidf[term]
    return(D)

In [23]:
score_df_joe.to_csv('../../data/biden_tweet_scores.csv')
score_df_donald.to_csv('../../data/trump_tweet_scores.csv')

### Next Steps:
#### clean data:
-handle punctuation/concatenated words (first pass done)
-may pick specific work fileter? ( )
#### average TIDF scores by day
-vectorize TIDF dictionaries ? (wrote a function, not really ideal)

-sum TF-IDF vector with in a day, normalize to number of tweets with in that day (done)

-Return top 50 scores for that done ( )