In [1]:
from datetime import datetime
from dateutil import parser

from nltk.corpus import stopwords
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import math
from collections import Counter
import numpy as np


import re

### Import Data

In [2]:
# Load the tweet export, keep the full timestamp parsed from the CSV 'date'
# column in 'date_time', and reduce 'date' itself to the calendar date.
joe_df = (
    pd.read_csv('../../data/joe.csv', parse_dates=True)
    .assign(date_time=lambda frame: pd.to_datetime(frame['date']))
    .assign(date=lambda frame: frame['date_time'].dt.date)
)
joe_df.head()

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,timezone,place,tweet,language,hashtags,...,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,date_time
0,0,1325885871875190784,1325885871875190784,1604951000000.0,2020-11-09,-500,,The bottom line: I will spare no effort to tur...,en,[],...,,,,,[],,,,,2020-11-09 14:40:00
1,1,1325880083618426881,1325880083618426881,1604949000000.0,2020-11-09,-500,,The challenge before us right now is still imm...,en,[],...,,,,,[],,,,,2020-11-09 14:17:00
2,2,1325873288711712769,1325873288711712769,1604948000000.0,2020-11-09,-500,,My COVID-19 Transition Advisory Board will adv...,en,[],...,,,,,[],,,,,2020-11-09 13:50:00
3,3,1325870017401905152,1325870017401905152,1604947000000.0,2020-11-09,-500,,"Today, I have named a COVID-19 Transition Advi...",en,[],...,,,,,[],,,,,2020-11-09 13:37:00
4,4,1325842292444291072,1325842292444291072,1604940000000.0,2020-11-09,-500,,I spent the morning with the co-chairs of my C...,en,[],...,,,,,[],,,,,2020-11-09 11:46:50


### Define Preprocessing Function

In [4]:
#remove urls
def remove_url(txt):
    """Strip URLs and non-alphanumeric symbols from a piece of text.

    Two kinds of matches are deleted:
      * any single character that is not an ASCII letter, a digit or
        whitespace (punctuation, '@', '#', emoji, accented letters, ...);
      * any ``scheme://...`` token up to the next whitespace.

    Whitespace of every kind (including newlines) is kept by the regex and
    then collapsed to single spaces, so words on separate lines stay
    separate instead of being fused together.

    Parameters
    ----------
    txt : str
        Raw text, e.g. the body of a tweet.

    Returns
    -------
    str
        The cleaned text with words separated by exactly one space and no
        leading/trailing whitespace.
    """
    # Raw string: "\w", "\S" are regex escapes, not Python string escapes.
    cleaned = re.sub(r"([^0-9A-Za-z\s])|(\w+://\S+)", "", txt)
    return " ".join(cleaned.split())

# Stop-word filter: the English stop-word list from nltk's `stopwords` corpus,
# stored as a set so the membership test in remove_stop_words is a hash lookup.
# NOTE(review): presumably requires the nltk 'stopwords' corpus to have been
# downloaded beforehand — confirm for fresh environments.
stop_words = set(stopwords.words('english'))

def remove_stop_words(txt_list):
    """Return the words of `txt_list` that are not in the module-level
    `stop_words` set, keeping their original order (duplicates included)."""
    return [token for token in txt_list if token not in stop_words]

#compute TF
def computeTF(x):
    """Compute the term frequency of every term in one document.

    Parameters
    ----------
    x : sequence or pandas.Series
        A pair-like object whose first element is the document's list of
        terms and whose second element is a mapping ``term -> raw count``
        (as produced by ``term_counter``). Typically a DataFrame row passed
        by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    collections.Counter
        ``term -> count / len(term list)``.

    Raises
    ------
    ZeroDivisionError
        If the term list is empty while the count mapping is not.
    """
    # Unpack by iteration instead of x[0] / x[1]: on a row Series indexed by
    # column names, integer keys rely on the deprecated positional fallback
    # of Series.__getitem__. Iterating a Series yields its values in order,
    # and tuples/lists behave the same way.
    wordList, wordFreq, *_ = x
    bagOfWordsCount = len(wordList)
    return Counter({word: count / bagOfWordsCount for word, count in wordFreq.items()})


#compute term counts
def term_counter(txt_list):
    """Count how many times each term occurs in a document.

    Parameters
    ----------
    txt_list : iterable of hashable
        The document's terms (duplicates allowed).

    Returns
    -------
    collections.Counter
        ``term -> number of occurrences``; empty for an empty input.
    """
    # Counter tallies in a single pass; calling list.count() once per
    # distinct term rescans the whole list each time.
    return Counter(txt_list)

#return list of term counts for each tweet
def return_term_count_list(df, term_count_column = "term_count"):
    """Collect the per-document term counts of `df` as a list of plain dicts.

    Each entry of ``df[term_count_column]`` is converted with ``dict(...)``,
    in row order.
    """
    return [dict(term_counts) for term_counts in df[term_count_column]]


#return corpus set for all tweets
def return_corpus_set(df, term_list_column = "term_list"):
    """Return the set of distinct terms found across every term list stored
    in ``df[term_list_column]``."""
    return {term for term_list in df[term_list_column] for term in term_list}

#compute IDF
def computeIDF(documents,corpus_set, smooth=False):
    """Compute the inverse document frequency of every term in the corpus.

    Parameters
    ----------
    documents : list of dict
        One ``term -> count`` mapping per document. A term counts towards
        the document frequency only where its count is > 0.
    corpus_set : iterable of hashable
        The vocabulary; every term appearing in `documents` must be in it.
    smooth : bool, default False
        If False, ``idf = log(N / df)`` where N is the number of documents
        and df the number of documents containing the term; a term that
        occurs in no document gets 0.0 (its term frequency is zero
        everywhere, so it contributes nothing to TF-IDF).
        If True, use the smoothed form ``log((1 + N) / (1 + df)) + 1``,
        which is defined for df == 0 and never yields a zero weight.

    Returns
    -------
    collections.Counter
        ``term -> idf`` for every term in `corpus_set`.

    Raises
    ------
    KeyError
        If a document contains a term (with count > 0) missing from
        `corpus_set`.
    """
    N = len(documents)

    # Document frequency: in how many documents each term occurs.
    doc_freq = dict.fromkeys(corpus_set, 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                doc_freq[word] += 1

    idfDict = {}
    for word, n_docs_with_word in doc_freq.items():
        if smooth:
            idfDict[word] = math.log((1 + N) / (1 + n_docs_with_word)) + 1
        elif n_docs_with_word > 0:
            idfDict[word] = math.log(N / n_docs_with_word)
        else:
            # Unseen term (or no documents at all): avoid dividing by zero.
            idfDict[word] = 0.0
    return(Counter(idfDict))

#computes TFIDF
def computeTFIDF(TF,idfs):
    """Weight each term frequency in `TF` by the term's entry in `idfs` and
    return the products as a Counter keyed by term."""
    weighted = {term: freq * idfs[term] for term, freq in TF.items()}
    return Counter(weighted)

In [6]:
def TFIDF_processing(df, startdate = "August 11, 2020"):
    """Run the full TF-IDF pipeline over the tweets in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "id", "date_time", "date" and "tweet".
        The input frame is not modified.
    startdate : str or datetime-like, default "August 11, 2020"
        Only rows whose "date_time" is strictly later than this moment are
        kept. The value is passed through ``str()`` and parsed with
        ``dateutil.parser.parse``.

    Returns
    -------
    tuple
        ``(df, df_idf, corpus_set_df)`` where ``df`` is a new frame holding
        the four input columns plus "term_list", "term_count", "TF" and
        "TFIDF"; ``df_idf`` is the Counter returned by ``computeIDF``; and
        ``corpus_set_df`` is the set of all terms in the kept tweets.
    """
    #filter data frame:
    startdate = parser.parse(str(startdate))
    # Take an explicit copy of the filtered rows/columns so the new columns
    # below are added to an independent frame rather than to a slice of the
    # caller's data (which triggers SettingWithCopyWarning).
    df = df.loc[df["date_time"] > startdate, ["id","date_time","date","tweet"]].copy()

    # Clean, lower-case, tokenise on whitespace, then drop stop words.
    df['term_list'] = df['tweet'].apply(
        lambda tweet: remove_stop_words(remove_url(tweet).lower().split())
    )

    #calc TFs
    df['term_count'] = df['term_list'].apply(term_counter)
    df["TF"] = df[['term_list','term_count']].apply(computeTF,axis=1)

    #return corpus and doc list
    term_count_list_df = return_term_count_list(df)
    corpus_set_df = return_corpus_set(df)

    #calc IDF and  TF-IDF
    df_idf = computeIDF(term_count_list_df,corpus_set_df)
    df["TFIDF"] = df["TF"].apply(lambda tf: computeTFIDF(tf, df_idf))

    return(df, df_idf, corpus_set_df)

In [7]:
# Run the TF-IDF pipeline on the loaded tweets (default start date).
# Note: this rebinds joe_df to the frame returned by TFIDF_processing, so the
# originally loaded frame is no longer reachable under this name afterwards.
joe_df, joe_idf,joe_corpus = TFIDF_processing(joe_df)
joe_df

Unnamed: 0,id,date_time,date,tweet,term_list,term_count,TF,TFIDF
0,1325885871875190784,2020-11-09 14:40:00,2020-11-09,The bottom line: I will spare no effort to tur...,"[bottom, line, spare, effort, turn, pandemic, ...","{'bottom': 1, 'spare': 1, 'pandemic': 1, 'line...","{'bottom': 0.14285714285714285, 'spare': 0.142...","{'bottom': 0.8630068638995443, 'spare': 1.0199..."
1,1325880083618426881,2020-11-09 14:17:00,2020-11-09,The challenge before us right now is still imm...,"[challenge, us, right, still, immense, growing...","{'us': 1, 'bold': 1, 'right': 1, 'growing': 1,...","{'us': 0.09090909090909091, 'bold': 0.09090909...","{'us': 0.19981981030505594, 'bold': 0.58604665..."
2,1325873288711712769,2020-11-09 13:50:00,2020-11-09,My COVID-19 Transition Advisory Board will adv...,"[covid19, transition, advisory, board, advise,...","{'covid19': 1, 'american': 1, 'every': 1, 'bed...","{'covid19': 0.058823529411764705, 'american': ...","{'covid19': 0.18425453827837932, 'american': 0..."
3,1325870017401905152,2020-11-09 13:37:00,2020-11-09,"Today, I have named a COVID-19 Transition Advi...","[today, named, covid19, transition, advisory, ...","{'place': 1, 'plan': 1, 'health': 1, 'public':...","{'place': 0.04, 'plan': 0.04, 'health': 0.04, ...","{'place': 0.18002412025398645, 'plan': 0.12831..."
4,1325842292444291072,2020-11-09 11:46:50,2020-11-09,I spent the morning with the co-chairs of my C...,"[spent, morning, cochairs, covid19, council, d...","{'covid19': 1, 'spent': 1, 'discussing': 1, 'c...","{'covid19': 0.0625, 'spent': 0.0625, 'discussi...","{'covid19': 0.19577044692077802, 'spent': 0.31..."
...,...,...,...,...,...,...,...,...
1256,1293340421444124672,2020-08-11 20:16:00,2020-08-11,"Let’s go win this, @KamalaHarris. https://t.c...","[lets, go, win, kamalaharris]","{'lets': 1, 'win': 1, 'kamalaharris': 1, 'go': 1}","{'lets': 0.25, 'win': 0.25, 'kamalaharris': 0....","{'lets': 0.8277547348689561, 'win': 0.81711483..."
1257,1293280412144267264,2020-08-11 16:17:32,2020-08-11,"Back when Kamala was Attorney General, she wor...","[back, kamala, attorney, general, worked, clos...","{'general': 1, 'im': 1, 'people': 1, 'working'...","{'general': 0.043478260869565216, 'im': 0.0434...","{'general': 0.24044445319699215, 'im': 0.11019..."
1258,1293280411150217219,2020-08-11 16:17:32,2020-08-11,I have the great honor to announce that I’ve p...,"[great, honor, announce, ive, picked, kamalaha...","{'servants': 1, 'picked': 1, 'ive': 1, 'mate':...","{'servants': 0.058823529411764705, 'picked': 0...","{'servants': 0.4199800197626423, 'picked': 0.3..."
1259,1293215600680951810,2020-08-11 12:00:00,2020-08-11,Don't forget that in the middle of this pandem...,"[dont, forget, middle, pandemic, trump, admini...","{'everyone': 1, 'build': 1, 'health': 1, 'midd...","{'everyone': 0.04, 'build': 0.04, 'health': 0....","{'everyone': 0.14695697732660773, 'build': 0.1..."


In [8]:
def sum_normalize_collections(counter_list):
    """Average a list of score counters term by term.

    The counters are accumulated with in-place Counter addition (which keeps
    only terms whose running total is positive), each total is divided by
    the number of counters in `counter_list`, and the result is returned as
    ``(term, mean score)`` pairs ordered from highest to lowest score.
    """
    n_counters = len(counter_list)
    totals = Counter()
    for scores in counter_list:
        totals += scores
    averaged = Counter({term: total / n_counters for term, total in totals.items()})
    return averaged.most_common()

In [9]:
# For each calendar date, gather that day's per-tweet TFIDF counters into a list.
group_by_series = joe_df.groupby("date")["TFIDF"].apply(lambda x:x.to_list())
group_by_series

date
2020-08-11    [{'us': 0.18316815944630127, 'thanks': 0.43281...
2020-08-12    [{'yesterday': 0.2976599290755932, 'ask': 0.28...
2020-08-13    [{'82': 3.56983016798246, 'days': 1.5661635753...
2020-08-14    [{'americas': 1.265054698571271, 'story': 2.67...
2020-08-15    [{'covid19': 0.2409482423640345, 'willing': 0....
                                    ...                        
2020-11-04    [{'thats': 0.14644060868421985, 'launched': 0....
2020-11-05    [{'going': 0.13779862706142598, 'donald': 0.08...
2020-11-06    [{'opponents': 1.6116282888512437, 'americans'...
2020-11-07    [{'bottom': 2.01368268243227, 'heart': 1.61235...
2020-11-09    [{'bottom': 0.8630068638995443, 'spare': 1.019...
Name: TFIDF, Length: 90, dtype: object

In [10]:
# For each date, average the per-tweet TFIDF scores with
# sum_normalize_collections, giving a list of (term, score) pairs per day
# ordered from highest to lowest score.
score_df = pd.DataFrame(joe_df.groupby("date")["TFIDF"].apply(lambda x: sum_normalize_collections(x.to_list())))
score_df

Unnamed: 0_level_0,TFIDF
date,Unnamed: 1_level_1
2020-08-11,"[(win, 0.18158107361427933), (go, 0.1437825367..."
2020-08-12,"[(kamalaharris, 0.13117880488284), (worried, 0..."
2020-08-13,"[(82, 0.39664779644249554), (days, 0.174018175..."
2020-08-14,"[(story, 0.2673950433368432), (security, 0.239..."
2020-08-15,"[(know, 0.11188818823996126), (vice, 0.0985908..."
...,...
2020-11-04,"[(count, 0.145789478930796), (victory, 0.13931..."
2020-11-05,"[(counted, 0.29949649183368193), (faith, 0.121..."
2020-11-06,"[(opponents, 0.2302326126930348), (enemies, 0...."
2020-11-07,"[(bottom, 0.12585516765201687), (heart, 0.1175..."


In [11]:
# Document Vectorization
def TFIDF_vecorization(df,corpus):
    """Turn the per-document TF-IDF mappings into a dense matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a "TFIDF" column whose entries are ``term -> score``
        mappings; ``len(df)`` gives the number of matrix rows.
    corpus : iterable of hashable
        The vocabulary. Column j of the result corresponds to the j-th item
        of ``list(corpus)``; when `corpus` is a set, that order is the set's
        iteration order, so pass a list if a specific column order is needed.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), len(corpus))`` with each document's
        scores in its row and zeros elsewhere.

    Raises
    ------
    ValueError
        If a document contains a term that is not in `corpus`.
    """
    vocabulary = list(corpus)
    # Build the term -> column lookup once instead of materialising and
    # scanning list(corpus) for every term of every document. setdefault
    # keeps the first position if the corpus contains duplicates.
    term_index = {}
    for position, term in enumerate(vocabulary):
        term_index.setdefault(term, position)

    D = np.zeros((len(df), len(vocabulary)))

    for i,tfidf in enumerate(df["TFIDF"]):
        for term in tfidf:
            try:
                ind = term_index[term]
            except KeyError:
                raise ValueError(f"{term!r} is not in corpus") from None
            D[i][ind] = tfidf[term]
    return(D)

### Next Steps:
#### clean data:
- handle punctuation/concatenated words (first pass done)
- may pick a specific word filter? ( )
#### average TF-IDF scores by day
- vectorize TF-IDF dictionaries? (wrote a function, not really ideal)

- sum TF-IDF vectors within a day, normalize by the number of tweets within that day (done)

- return the top 50 scores for that day ( )