In [1]:
import re
import time

import numpy as np
import pandas as pd
import requests

In [2]:
!pip install progressbar2
from progressbar import progressbar



## Scraping

### Scraping Functions

In [3]:
# function to scrape reddit page (takes a reddit .json url)
# returns posts 

def scraper_bike(url):
    headers = {'User-Agent' : 'override this bad boy!'}
    posts = []
    after = {}

    for page in progressbar(range(40)):
        params = {'after': after}
        pagepull = requests.get(url=url, params=params, headers=headers)
        page_dict = pagepull.json()
        posts.extend(page_dict['data']['children'])
        after = page_dict['data']['after']
        time.sleep(.2)
        
    return posts

In [4]:
# function to convert posts to DataFrame - won't allow duplicate posts since unique id 'name' is set as index
# Extract: name (as index) and subreddit, selftext, title (as columns)


def posts_to_df(post_list):
    i = 0
    post_dict = {}

    for post in post_list:
        ind = post_list[i]['data']
        post_dict[ind['name']] = [ind['subreddit'], ind['title'], ind['selftext']]
        i += 1

    df_name = pd.DataFrame(post_dict)
    df_name = df_name.T
    df_name.columns = ['subreddit', 'title', 'selftext']

    return df_name

In [5]:
# takes scraper function and url - outputs dataframe

def scrape_to_df(scrape_func, url):
    
    return posts_to_df(scrape_func(url))

## Run Scrape

In [6]:
# You can also put in any 2 subreddits in as the URL and get results for those

nfltest = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/nfl.json')
nbatest = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/nba.json')

100% (40 of 40) |########################| Elapsed Time: 0:00:22 Time:  0:00:22
100% (40 of 40) |########################| Elapsed Time: 0:00:24 Time:  0:00:24


In [7]:
politics_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/politics.json')
conservative_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/conservative.json')

100% (40 of 40) |########################| Elapsed Time: 0:00:24 Time:  0:00:24
100% (40 of 40) |########################| Elapsed Time: 0:00:22 Time:  0:00:22


In [8]:
nbatest.shape

(824, 3)

In [9]:
nfltest.head()

Unnamed: 0,subreddit,title,selftext
t3_jcche7,nfl,Free Talk Friday,"Welcome to today's open thread, where /r/nfl u..."
t3_jb73pb,nfl,Official r/NFL Week 5 Power Rankings,Welcome to the week 5 **Official R/NFL Power R...
t3_jc7m8n,nfl,"[Caplan] #Colts statement: This morning, we we...",
t3_jcarxf,nfl,More positive covid tests to be announced late...,Have an inside source - mods can delete if tha...
t3_jc9955,nfl,Ex-Washington cheerleaders shaken by lewd vide...,


In [10]:
nfltest.shape

(870, 3)

### Data Cleaning / Preprocessing

In [11]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

pd.set_option('max_colwidth', 300)

In [12]:
# drop column

nfltest = nfltest.drop(columns='selftext')
nbatest = nbatest.drop(columns='selftext')

In [13]:
# merge subreddit data

train = pd.concat([nfltest, nbatest])

In [14]:
train

Unnamed: 0,subreddit,title
t3_jcche7,nfl,Free Talk Friday
t3_jb73pb,nfl,Official r/NFL Week 5 Power Rankings
t3_jc7m8n,nfl,"[Caplan] #Colts statement: This morning, we were informed that several individuals within our organization have tested posted for COVID-19. The team is currently in the process of confirming those tests...."
t3_jcarxf,nfl,More positive covid tests to be announced later today for Patriots
t3_jc9955,nfl,Ex-Washington cheerleaders shaken by lewd videos: ‘I don’t think they viewed us as people’
...,...,...
t3_ja28tc,nba,"Aside from Udonis Haslem, all of Lebron’s teammates from his first ring are either retired or out of the league"
t3_ja87f7,nba,"KD on if he took the easy way out by joining the Warriors: “No . . I go through every rep at 100 percent speed . . . I played at an elite level in the Finals at all the biggest moments . . . I felt like I got up every day and held myself to a championship, elite player standard . . . I earned th..."
t3_jb6spu,nba,Genuine question: Do Lebron’s 6 finals losses not show that he would not have as much success in his career had he played in the west?
t3_jab7zg,nba,"Who had the better defense, the 2020 Lakers or the 2019 raptors?"


##### Tokenize (grab only word characters)

In [15]:
word_tokenizer = RegexpTokenizer(r'\w+')

In [16]:
print(r'Hello\nWorld')
print('Hello\nWorld')

Hello\nWorld
Hello
World


Word tokenize

In [17]:
train = pd.concat([nfltest, nbatest])


In [18]:
train['title'] = train['title'].map(lambda x: ' '.join(word_tokenizer.tokenize(x.lower())))

In [19]:
train['title'][0:5]

t3_jcche7                                                                                                                                                                                        free talk friday
t3_jb73pb                                                                                                                                                                    official r nfl week 5 power rankings
t3_jc7m8n    caplan colts statement this morning we were informed that several individuals within our organization have tested posted for covid 19 the team is currently in the process of confirming those tests
t3_jcarxf                                                                                                                                      more positive covid tests to be announced later today for patriots
t3_jc9955                                                                                                                 ex washington cheerleaders shaken by l

With TweetTokenizer

In [20]:
tknzr = TweetTokenizer()

In [21]:
train['title'] = train['title'].map(lambda x: tknzr.tokenize(x))

In [22]:
train['title']

t3_jcche7                                                                                                                                                                                                                                                                                           [free, talk, friday]
t3_jb73pb                                                                                                                                                                                                                                                                   [official, r, nfl, week, 5, power, rankings]
t3_jc7m8n                                                                           [caplan, colts, statement, this, morning, we, were, informed, that, several, individuals, within, our, organization, have, tested, posted, for, covid, 19, the, team, is, currently, in, the, process, of, confirming, those, tests]
t3_jcarxf                                                    

In [23]:
# rejoin list of tokenized words into single string for each row

train['title'] = train['title'].map(lambda x: ' '.join(x))

In [24]:
train['title'][0:5]

t3_jcche7                                                                                                                                                                                        free talk friday
t3_jb73pb                                                                                                                                                                    official r nfl week 5 power rankings
t3_jc7m8n    caplan colts statement this morning we were informed that several individuals within our organization have tested posted for covid 19 the team is currently in the process of confirming those tests
t3_jcarxf                                                                                                                                      more positive covid tests to be announced later today for patriots
t3_jc9955                                                                                                                 ex washington cheerleaders shaken by l

### Train test split and converting series to list of strings then to array

In [25]:
X = train[['title']]
y = train['subreddit']

In [26]:
X

Unnamed: 0,title
t3_jcche7,free talk friday
t3_jb73pb,official r nfl week 5 power rankings
t3_jc7m8n,caplan colts statement this morning we were informed that several individuals within our organization have tested posted for covid 19 the team is currently in the process of confirming those tests
t3_jcarxf,more positive covid tests to be announced later today for patriots
t3_jc9955,ex washington cheerleaders shaken by lewd videos i don t think they viewed us as people
...,...
t3_ja28tc,aside from udonis haslem all of lebron s teammates from his first ring are either retired or out of the league
t3_ja87f7,kd on if he took the easy way out by joining the warriors no i go through every rep at 100 percent speed i played at an elite level in the finals at all the biggest moments i felt like i got up every day and held myself to a championship elite player standard i earned that
t3_jb6spu,genuine question do lebron s 6 finals losses not show that he would not have as much success in his career had he played in the west
t3_jab7zg,who had the better defense the 2020 lakers or the 2019 raptors


In [27]:
y.value_counts(normalize=True)

nfl    0.513577
nba    0.486423
Name: subreddit, dtype: float64

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.25,
                                                    random_state=42,
                                                    stratify=y)

In [29]:
# baseline is

y.value_counts(normalize=True)

nfl    0.513577
nba    0.486423
Name: subreddit, dtype: float64

In [30]:
y_train.value_counts(normalize=True)

nfl    0.513386
nba    0.486614
Name: subreddit, dtype: float64

In [31]:
# create our training data list - this is a list of strings, with each string being a post title

clean_train_data = []

for traindata in X_train['title']:
    clean_train_data.append(traindata)

In [32]:
len(clean_train_data)

1270

In [33]:
# create test data list

clean_test_data = []

for testdata in X_test['title']:
    clean_test_data.append(testdata)

In [34]:
len(clean_test_data)

424

In [35]:
clean_train_data

['mikeklis melvin gordon not practicing i m told he reported to broncos headquarters this morning and met with vic fangio the coach sent his starting rb home for the day probably for best given circumstances',
 'britton jim nantz on bullandfox i see a fearlessness in watching the browns on tape the thing i like about them the best there s a fearlessness about them there s a swag there s an attitude there s something different about this team there s a mojo an attitude a personality',
 'how the manning family is helping bengals joe burrow transition to nfl archie texts me every week and that means a lot to not only me but my family as well',
 'highlight marshon lattimore stops mike williams just short of the first down marker on a critical 4th down',
 'bryant mcfadden man i m about ready to suit up again after hearing coachtomlin break down his famous sayings',
 'bradley beal on spending his whole career with the wizards that would mean the world man i m a loyal guy i want to be here i 

### Count Vectorizer

In [36]:
# instantiate our CountVectorizer. This counts the number of appearances of all the words in our training data and
# eliminates common english stop words. 5000 max features works well for our purposes (tested various numbers). Our
# data is already preprocessed and tokenized manually earlier. ngram_range is 1,3, although all or nearly all our
# features are single words

vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=5000,
#                              max_df= 0.9,
#                              min_df= 0.001,
                             ngram_range=(1, 3))

In [37]:
# fit our training data and test data lists to our count_vectorizer

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

In [38]:
train_data_features

<1270x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 16117 stored elements in Compressed Sparse Row format>

In [39]:
16670 / (1223*5000)

0.0027260834014717905

In [40]:
# convert to array

train_data_features = train_data_features.toarray()

In [41]:
train_data_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [42]:
pd.DataFrame(train_data_features, columns=vectorizer.get_feature_names())

Unnamed: 0,00,000,000 000,02,03,04,05,06,07,077,...,york,young,young core,ypc,zac,zac taylor,zach,zach ertz,zimmer,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1265,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1266,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1268,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
train_data_features.shape

(1270, 5000)

In [44]:
# check shapes

train_data_features.shape, test_data_features.shape

((1270, 5000), (424, 5000))

In [45]:
# I wanted check that the features corpus was as expected - removed print statement for readability

vocab = vectorizer.get_feature_names()

In [46]:
vocab

['00',
 '000',
 '000 000',
 '02',
 '03',
 '04',
 '05',
 '06',
 '07',
 '077',
 '077 yards',
 '10',
 '10 years',
 '100',
 '100 percent',
 '1053thefan',
 '11',
 '12',
 '12 tds',
 '12th',
 '13',
 '13 score',
 '13 score games',
 '14',
 '14 starts',
 '15',
 '15 years',
 '15 years ago',
 '158',
 '16',
 '16 games',
 '17',
 '17 games',
 '18',
 '18th',
 '19',
 '19 list',
 '19 protocols',
 '19 reserve',
 '19 reserve list',
 '19 test',
 '19 transmission',
 '1970',
 '1975',
 '1981',
 '1984',
 '1998',
 '1999',
 '1m',
 '1m roster',
 '1m roster bonus',
 '1st',
 '1st round',
 '20',
 '20 000',
 '20 season',
 '200',
 '2000',
 '2000s',
 '2002',
 '2003',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2019 20',
 '2019 20 season',
 '2019 kyler',
 '2019 kyler murray',
 '2019 season',
 '2020',
 '2020 2021',
 '2020 2021 season',
 '2020 21',
 '2020 lakers',
 '2020 nba',
 '2020 nba draft',
 '2020 nba season',
 '2020 season',
 '2020 state',
 '2021',
 '

## MODELING

### Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression

In [48]:
# fit logistic regression model

lr = LogisticRegression()

In [49]:
lr.fit(train_data_features, y_train)

LogisticRegression()

In [50]:
lr.score(train_data_features, y_train)

0.9952755905511811

In [51]:
lr.score(test_data_features, y_test)

0.9198113207547169

### Feature comparison

Creates a dataframe that matches features to coefficients

In [52]:
coef_list = lr.coef_.tolist()

In [53]:
coef_list = coef_list[0]

In [54]:
coef_df = pd.DataFrame({'features': vectorizer.get_feature_names(),
                        'coefs': coef_list})

In [55]:
coef_df.sort_values(by = ['coefs'])

Unnamed: 0,features,coefs
1787,nba,-2.174970
1494,lebron,-2.002509
1456,lakers,-1.779587
958,finals,-1.057334
4864,warriors,-1.032103
...,...,...
4964,yards,1.374443
441,browns,1.533043
1215,highlight,1.733735
4886,week,1.920702


### Let's throw out these unfair words and rerun

In [56]:
stop_words = set(stopwords.words('english'))

extra_stopwords = ['nba', 'basketball', 'football', 'nfl']

stop_words.update(extra_stopwords)

In [57]:
clean_train_data[:10]

['mikeklis melvin gordon not practicing i m told he reported to broncos headquarters this morning and met with vic fangio the coach sent his starting rb home for the day probably for best given circumstances',
 'britton jim nantz on bullandfox i see a fearlessness in watching the browns on tape the thing i like about them the best there s a fearlessness about them there s a swag there s an attitude there s something different about this team there s a mojo an attitude a personality',
 'how the manning family is helping bengals joe burrow transition to nfl archie texts me every week and that means a lot to not only me but my family as well',
 'highlight marshon lattimore stops mike williams just short of the first down marker on a critical 4th down',
 'bryant mcfadden man i m about ready to suit up again after hearing coachtomlin break down his famous sayings',
 'bradley beal on spending his whole career with the wizards that would mean the world man i m a loyal guy i want to be here i 

In [58]:
clean_test_data[0]

'the 19 20 nba season set new digital media records for the league'

In [59]:
type(_)

str

In [60]:
type(stopwords)

nltk.corpus.reader.wordlist.WordListCorpusReader

In [61]:
vectorizer = CountVectorizer(stop_words=stop_words,
                             max_features=5000,
                             ngram_range=(1, 3))

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

((1270, 5000), (424, 5000))

In [62]:
lr.fit(train_data_features, y_train)

LogisticRegression()

In [63]:
lr.score(train_data_features, y_train)

0.9952755905511811

In [64]:
lr.score(test_data_features, y_test)

0.8891509433962265

In [65]:
coef_list = lr.coef_.tolist()
coef_list = coef_list[0]

coef_df = pd.DataFrame({'features' : vectorizer.get_feature_names(),
                       'coefs' : coef_list})

coef_df.sort_values(by = ['coefs'])

Unnamed: 0,features,coefs
1585,lebron,-2.078734
1539,lakers,-1.792915
1010,finals,-1.392623
4409,shaq,-1.024667
4854,warriors,-1.017613
...,...,...
570,chargers,1.396801
473,browns,1.485262
962,falcons,1.493616
1304,highlight,1.800851


### Decision Tree

In [66]:
from sklearn.tree import DecisionTreeClassifier

In [67]:
tree = DecisionTreeClassifier()

In [68]:
tree.fit(train_data_features, y_train)

DecisionTreeClassifier()

In [69]:
tree.score(train_data_features, y_train)

1.0

In [70]:
tree.score(test_data_features, y_test)

0.8679245283018868

### Random Forest

In [71]:
from sklearn.ensemble import RandomForestClassifier

In [72]:
forest = RandomForestClassifier(n_estimators=100)

In [73]:
forest.fit(train_data_features, y_train)

RandomForestClassifier()

In [74]:
forest.score(train_data_features, y_train)

1.0

In [75]:
forest.score(test_data_features, y_test)

0.8608490566037735

###  Matrix on Logistic Regression

In [76]:
from sklearn.metrics import confusion_matrix

In [77]:
y_pred = lr.predict(test_data_features)

In [78]:
cm = confusion_matrix(y_test, y_pred)

In [79]:
cm_df = pd.DataFrame(cm,
                    columns=['predict_neg', 'predict_pos'],
                    index = ['actual_neg', 'actual_pos'])

In [80]:
cm_df

Unnamed: 0,predict_neg,predict_pos
actual_neg,201,5
actual_pos,42,176


## Checking where our model failed

In [81]:
comparison_df = pd.DataFrame({'y_actual' : y_test,
             'y_predicted' : y_pred})

In [82]:
mismatch_df = comparison_df[comparison_df['y_actual'] != comparison_df['y_predicted']]

In [83]:
mismatch2_df = pd.concat([mismatch_df, X_test], axis = 1)

In [84]:
# All incorrect predictions with titles

mismatches = mismatch2_df.dropna()

In [85]:
mismatches

Unnamed: 0,y_actual,y_predicted,title
t3_jcb7zy,nfl,nba,who is gonna sign trubisky this offseason
t3_jb70us,nba,nfl,after jared dudley raised concerns about injury due to a quick restart patrick beverly responded with again basketball is a year round sport we don t wanna hear that sh t check jared dudley played more games after the restart than patrick beverly
t3_jakk85,nfl,nba,when do you think each current winless team will get their first victory
t3_j9u82v,nfl,nba,ctb nfl teams with the most players on the injured reserve
t3_jc83so,nfl,nba,mike davis free agent 2021
t3_jai3ry,nfl,nba,who are some players you think could will be traded before the november 3rd trade deadline
t3_jbuf38,nfl,nba,who are some fa s you d like to see your team target
t3_jbqosv,nba,nfl,highlight trevor booker makes one of the most insane circus shots in nba history
t3_jbtels,nfl,nba,evolution of every nfl team s stadium video
t3_jcb19g,nba,nfl,i was re listening to a 2013 episode my favorite comedy podcast this week and the host tom scharpling said something about both lebron being great and about how awful people like skip bayless are full of it that still seem relevant in 2020 so i made a little video for his little rant


### Let's try TF-IDF

Term Frequency / Inverse Document Frequency

TF(w) = (Number of times term w appears in a document) / (Total number of terms in the document)

IDF(w) = log_e(Total number of documents / Number of documents with term w in it)

In [86]:
tfidf_vec = TfidfVectorizer(stop_words=stop_words,
                            max_features=5000,
                            ngram_range=(1, 3))

In [87]:
train_data_features = tfidf_vec.fit_transform(clean_train_data)

test_data_features = tfidf_vec.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

((1270, 5000), (424, 5000))

In [88]:
lr.fit(train_data_features, y_train)

LogisticRegression()

In [89]:
lr.score(train_data_features, y_train)

0.984251968503937

In [90]:
lr.score(test_data_features, y_test)

0.9033018867924528

### Let's try on some other subreddits

In [91]:
train = pd.concat([politics_test, conservative_test])

In [92]:
X = train[['title']]
y = train['subreddit']

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [94]:
politics_test = politics_test.drop(columns='selftext')
conservative_test = conservative_test.drop(columns='selftext')

train = pd.concat([politics_test, conservative_test])
# tokenizer = RegexpTokenizer(r'\w+')
tokenizer = TweetTokenizer()

train['title'] = train['title'].map(lambda x: tokenizer.tokenize(x.lower()))
train['title'] = train['title'].map(lambda x: ' '.join(x))

In [95]:
# create our training data list - this is a list of strings, with each string being a post title

clean_train_data = []

for traindata in X_train['title']:
    clean_train_data.append(traindata)
    
    
# create test data list

clean_test_data = []

for testdata in X_test['title']:
    clean_test_data.append(testdata)

In [96]:
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=5000,
                             ngram_range=(1, 3))

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

vocab = vectorizer.get_feature_names()

### Modeling

In [97]:
lr = LogisticRegression(penalty = 'l2')

In [98]:
train_data_features.shape, y_train.shape

((1224, 5000), (1224,))

In [99]:
lr.fit(train_data_features, y_train)

lr.score(train_data_features, y_train)

0.9771241830065359

In [100]:
lr.score(test_data_features, y_test)

0.7720588235294118

In [101]:
coef_list = lr.coef_.tolist()

coef_list = coef_list[0]

In [102]:
coef_df = pd.DataFrame({'features' : vectorizer.get_feature_names(),
                       'coefs' : coef_list})

coef_df = coef_df.sort_values(by = ['coefs'])
coef_df

Unnamed: 0,features,coefs
1328,hunter,-1.223450
4766,twitter,-1.152886
749,cuomo,-1.011294
285,battleground,-1.007246
4740,trump win,-0.964863
...,...,...
3299,qanon,0.922829
923,early,0.950903
4195,rudy,0.969229
4962,women,1.155065


In [103]:
coef_df.iloc[2400:2500]

Unnamed: 0,features,coefs
3563,real problem,-0.001693
3053,problem facebook twitter,-0.001693
3564,real problem facebook,-0.001693
4117,restricting,-0.001693
3052,problem facebook,-0.001693
...,...,...
4491,suffering,0.012117
3325,qanon shock,0.012143
3326,qanon shock surprise,0.012143
3709,refusal condemn qanon,0.012143


In [104]:
coef_df.tail(20)

Unnamed: 0,features,coefs
3984,republicans,0.65178
667,conspiracy,0.675285
269,barrett,0.680295
1162,gives,0.683888
4196,rudy giuliani,0.684115
2333,new,0.690608
3007,presidency,0.694649
1284,history,0.699572
688,coronavirus,0.727899
1079,fight,0.740249


In [105]:
coef_df.head(20)

Unnamed: 0,features,coefs
1328,hunter,-1.22345
4766,twitter,-1.152886
749,cuomo,-1.011294
285,battleground,-1.007246
4740,trump win,-0.964863
1872,media,-0.947331
35,20,-0.875633
4563,tech,-0.868533
373,big tech,-0.868533
4924,week,-0.868072
