In [1]:
import re
import time

import numpy as np
import pandas as pd
import requests

In [2]:
!pip install progressbar2
from progressbar import progressbar

Collecting progressbar2
  Downloading progressbar2-3.53.1-py2.py3-none-any.whl (25 kB)
Collecting python-utils>=2.3.0
  Downloading python_utils-2.4.0-py2.py3-none-any.whl (12 kB)
Installing collected packages: python-utils, progressbar2
Successfully installed progressbar2-3.53.1 python-utils-2.4.0


## Scraping

### Scraping Functions

In [3]:
# function to scrape reddit page (takes a reddit .json url)
# returns posts 

def scraper_bike(url):
    headers = {'User-Agent' : 'override this bad boy!'}
    posts = []
    after = {}

    for page in progressbar(range(40)):
        params = {'after': after}
        pagepull = requests.get(url=url, params=params, headers=headers)
        page_dict = pagepull.json()
        posts.extend(page_dict['data']['children'])
        after = page_dict['data']['after']
        time.sleep(.2)
        
    return posts

In [4]:
# function to convert posts to DataFrame - won't allow duplicate posts since unique id 'name' is set as index
# Extract: name (as index) and subreddit, selftext, title (as columns)


def posts_to_df(post_list):
    i = 0
    post_dict = {}

    for post in post_list:
        ind = post_list[i]['data']
        post_dict[ind['name']] = [ind['subreddit'], ind['title'], ind['selftext']]
        i += 1

    df_name = pd.DataFrame(post_dict)
    df_name = df_name.T
    df_name.columns = ['subreddit', 'title', 'selftext']

    return df_name

In [5]:
# takes scraper function and url - outputs dataframe

def scrape_to_df(scrape_func, url):
    
    return posts_to_df(scrape_func(url))

## Run Scrape

In [6]:
# You can also put in any 2 subreddits in as the URL and get results for those

nfltest = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/nfl.json')
nbatest = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/nba.json')

100% (40 of 40) |########################| Elapsed Time: 0:00:22 Time:  0:00:22
100% (40 of 40) |########################| Elapsed Time: 0:00:22 Time:  0:00:22


In [7]:
politics_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/politics.json')
conservative_test = scrape_to_df(scraper_bike, 'https://www.reddit.com/r/conservative.json')

100% (40 of 40) |########################| Elapsed Time: 0:00:23 Time:  0:00:23
100% (40 of 40) |########################| Elapsed Time: 0:00:21 Time:  0:00:21


In [8]:
nbatest.shape

(826, 3)

In [9]:
nfltest.head()

Unnamed: 0,subreddit,title,selftext
t3_jcche7,nfl,Free Talk Friday,"Welcome to today's open thread, where /r/nfl u..."
t3_jb73pb,nfl,Official r/NFL Week 5 Power Rankings,Welcome to the week 5 **Official R/NFL Power R...
t3_jc7m8n,nfl,"[Caplan] #Colts statement: This morning, we we...",
t3_jcarxf,nfl,More positive covid tests to be announced late...,Have an inside source - mods can delete if tha...
t3_jc9955,nfl,Ex-Washington cheerleaders shaken by lewd vide...,


In [10]:
nfltest.shape

(870, 3)

### Data Cleaning / Preprocessing

In [13]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

pd.set_option('max_colwidth', 300)

In [14]:
# drop column

nfltest = nfltest.drop(columns='selftext')
nbatest = nbatest.drop(columns='selftext')

In [15]:
# merge subreddit data

train = pd.concat([nfltest, nbatest])

In [16]:
train

Unnamed: 0,subreddit,title
t3_jcche7,nfl,Free Talk Friday
t3_jb73pb,nfl,Official r/NFL Week 5 Power Rankings
t3_jc7m8n,nfl,"[Caplan] #Colts statement: This morning, we were informed that several individuals within our organization have tested posted for COVID-19. The team is currently in the process of confirming those tests...."
t3_jcarxf,nfl,More positive covid tests to be announced later today for Patriots
t3_jc9955,nfl,Ex-Washington cheerleaders shaken by lewd videos: ‘I don’t think they viewed us as people’
...,...,...
t3_jb4fda,nba,All-Time Point Guards
t3_jaa0s4,nba,How well would Prime Scottie Pippen do at guarding prime Lebron?
t3_jau53x,nba,Celtic's &amp; Laker's 17 Titles
t3_jcdd85,nba,Danny Green has more championship rings than Kawhi Leonard.


##### Tokenize (grab only word characters)

In [44]:
word_tokenizer = RegexpTokenizer(r'\w+')

In [45]:
print(r'Hello\nWorld')
print('Hello\nWorld')

Hello\nWorld
Hello
World


Word tokenize

In [46]:
train = pd.concat([nfltest, nbatest])


In [47]:
train['title'] = train['title'].map(lambda x: ' '.join(word_tokenizer.tokenize(x.lower())))

In [48]:
train['title'][0:5]

t3_jcche7                                                                                                                                                                                        free talk friday
t3_jb73pb                                                                                                                                                                    official r nfl week 5 power rankings
t3_jc7m8n    caplan colts statement this morning we were informed that several individuals within our organization have tested posted for covid 19 the team is currently in the process of confirming those tests
t3_jcarxf                                                                                                                                      more positive covid tests to be announced later today for patriots
t3_jc9955                                                                                                                 ex washington cheerleaders shaken by l

With TweetTokenizer

In [49]:
tknzr = TweetTokenizer()

In [50]:
train['title'] = train['title'].map(lambda x: tknzr.tokenize(x))

In [51]:
train['title']

t3_jcche7                                                                                                                                                                                                                    [free, talk, friday]
t3_jb73pb                                                                                                                                                                                            [official, r, nfl, week, 5, power, rankings]
t3_jc7m8n    [caplan, colts, statement, this, morning, we, were, informed, that, several, individuals, within, our, organization, have, tested, posted, for, covid, 19, the, team, is, currently, in, the, process, of, confirming, those, tests]
t3_jcarxf                                                                                                                                                          [more, positive, covid, tests, to, be, announced, later, today, for, patriots]
t3_jc9955                       

In [52]:
# rejoin list of tokenized words into single string for each row

train['title'] = train['title'].map(lambda x: ' '.join(x))

In [53]:
train['title'][0:5]

t3_jcche7                                                                                                                                                                                        free talk friday
t3_jb73pb                                                                                                                                                                    official r nfl week 5 power rankings
t3_jc7m8n    caplan colts statement this morning we were informed that several individuals within our organization have tested posted for covid 19 the team is currently in the process of confirming those tests
t3_jcarxf                                                                                                                                      more positive covid tests to be announced later today for patriots
t3_jc9955                                                                                                                 ex washington cheerleaders shaken by l

### Train test split and converting series to list of strings then to array

In [54]:
X = train[['title']]
y = train['subreddit']

In [None]:
X

In [55]:
y.value_counts(normalize=True)

nfl    0.512972
nba    0.487028
Name: subreddit, dtype: float64

In [56]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.25,
                                                    random_state=42,
                                                    stratify=y)

In [57]:
# baseline is

y.value_counts(normalize=True)

nfl    0.512972
nba    0.487028
Name: subreddit, dtype: float64

In [58]:
y_train.value_counts(normalize=True)

nfl    0.513365
nba    0.486635
Name: subreddit, dtype: float64

In [59]:
# create our training data list - this is a list of strings, with each string being a post title

clean_train_data = []

for traindata in X_train['title']:
    clean_train_data.append(traindata)

In [60]:
len(clean_train_data)

1272

In [61]:
# create test data list

clean_test_data = []

for testdata in X_test['title']:
    clean_test_data.append(testdata)

In [62]:
len(clean_test_data)

424

In [63]:
clean_train_data

['jones at this point we understand the nfl s first option would be to delay broncos pats to monday or tuesday still don t think we are at the week 18 glass break but inching closer',
 'nfl update browns hc kevin stefanski says wr odell beckham jr was sent home with an illness this morning with this day and age we just gotta be so careful in this environment he s just feeling under the weather so an abundance of caution',
 'gordon hayward after my injury hearing from yg_trece a guy who had been through it made a huge difference dak if there s anything i can do to help don t hesitate to reach out you re not alone',
 'what is the best possible team out of these players',
 'oc highlight on nov 23 1986 mark moseley was working as an analyst for cbs on the nfl today on nov 30 moseley hit the game winning field goal for the browns in ot this is the story of how moseley went from the studio to the on field hero in the span of a week',
 'following this postseason what player s do you intend to

### Count Vectorizer

In [64]:
# instantiate our CountVectorizer. This counts the number of appearances of all the words in our training data and
# eliminates common english stop words. 5000 max features works well for our purposes (tested various numbers). Our
# data is already preprocessed and tokenized manually earlier. ngram_range is 1,3, although all or nearly all our
# features are single words

vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=5000,
#                              max_df= 0.9,
#                              min_df= 0.001,
                             ngram_range=(1, 3))

In [65]:
# fit our training data and test data lists to our count_vectorizer

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

In [66]:
train_data_features

<1272x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 16226 stored elements in Compressed Sparse Row format>

In [67]:
16670 / (1223*5000)

0.0027260834014717905

In [68]:
# convert to array

train_data_features = train_data_features.toarray()

In [69]:
train_data_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [70]:
pd.DataFrame(train_data_features, columns=vectorizer.get_feature_names())

Unnamed: 0,00,000,04,05,07,10,10 games,10 years,10 years ago,100,...,ypc,zac,zac taylor,zach,zach ertz,zach lowe,zero,zimmer,zion,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1268,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1269,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1270,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
train_data_features.shape

(1272, 5000)

In [72]:
# check shapes

train_data_features.shape, test_data_features.shape

((1272, 5000), (424, 5000))

In [73]:
# I wanted check that the features corpus was as expected - removed print statement for readability

vocab = vectorizer.get_feature_names()

In [74]:
vocab

['00',
 '000',
 '04',
 '05',
 '07',
 '10',
 '10 games',
 '10 years',
 '10 years ago',
 '100',
 '100 percent',
 '102',
 '105',
 '105 assists',
 '1053thefan',
 '11',
 '116',
 '12',
 '13',
 '13 score',
 '13 score games',
 '14',
 '14 starts',
 '15',
 '15 years',
 '15 years ago',
 '158',
 '16',
 '17',
 '17 games',
 '17 pts',
 '17 titles',
 '18',
 '18th',
 '18th week',
 '19',
 '19 list',
 '19 protocols',
 '19 reserve',
 '19 reserve list',
 '19 testing',
 '19 testing source',
 '19 transmission',
 '1949',
 '1970',
 '1970 71',
 '1975',
 '1981',
 '1984',
 '1987',
 '1998',
 '1m',
 '1m roster',
 '1m roster bonus',
 '1st',
 '1st round',
 '20',
 '2000',
 '2000s',
 '2002',
 '2003',
 '2006',
 '2007',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2017',
 '2018',
 '2019',
 '2019 20',
 '2019 kyler',
 '2019 kyler murray',
 '2019 raptors',
 '2019 season',
 '2020',
 '2020 21',
 '2020 21 season',
 '2020 lakers',
 '2020 nba',
 '2020 nba draft',
 '2020 nba season',
 '2020 season',
 '2020 sta

## MODELING

### Logistic Regression

In [75]:
from sklearn.linear_model import LogisticRegression

In [76]:
# fit logistic regression model

lr = LogisticRegression()

In [77]:
lr.fit(train_data_features, y_train)

LogisticRegression()

In [78]:
lr.score(train_data_features, y_train)

0.9944968553459119

In [79]:
lr.score(test_data_features, y_test)

0.9268867924528302

### Feature comparison

Creates a dataframe that matches features to coefficients

In [80]:
coef_list = lr.coef_.tolist()

In [81]:
coef_list = coef_list[0]

In [82]:
coef_df = pd.DataFrame({'features': vectorizer.get_feature_names(),
                        'coefs': coef_list})

In [83]:
coef_df.sort_values(by = ['coefs'])

Unnamed: 0,features,coefs
1797,nba,-2.298442
1489,lebron,-2.224275
1454,lakers,-1.921436
4446,shaq,-1.112846
316,basketball,-1.019847
...,...,...
908,falcons,1.425073
429,browns,1.587707
1193,highlight,1.785951
4866,week,2.041039


### Let's throw out these unfair words and rerun

In [84]:
stop_words = set(stopwords.words('english'))

extra_stopwords = ['nba', 'basketball', 'football', 'nfl']

stop_words.update(extra_stopwords)

In [85]:
clean_train_data[:10]

['jones at this point we understand the nfl s first option would be to delay broncos pats to monday or tuesday still don t think we are at the week 18 glass break but inching closer',
 'nfl update browns hc kevin stefanski says wr odell beckham jr was sent home with an illness this morning with this day and age we just gotta be so careful in this environment he s just feeling under the weather so an abundance of caution',
 'gordon hayward after my injury hearing from yg_trece a guy who had been through it made a huge difference dak if there s anything i can do to help don t hesitate to reach out you re not alone',
 'what is the best possible team out of these players',
 'oc highlight on nov 23 1986 mark moseley was working as an analyst for cbs on the nfl today on nov 30 moseley hit the game winning field goal for the browns in ot this is the story of how moseley went from the studio to the on field hero in the span of a week',
 'following this postseason what player s do you intend to

In [86]:
clean_test_data[0]

'with his recent finals appearance lebron james has a better chance of making the finals on any given season 58 8 than steph curry making a 3 pointer 43 5'

In [87]:
type(_)

str

In [88]:
type(stopwords)

nltk.corpus.reader.wordlist.WordListCorpusReader

In [89]:
vectorizer = CountVectorizer(stop_words=stop_words,
                             max_features=5000,
                             ngram_range=(1, 3))

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

((1272, 5000), (424, 5000))

In [90]:
lr.fit(train_data_features, y_train)

LogisticRegression()

In [91]:
lr.score(train_data_features, y_train)

0.9921383647798742

In [92]:
lr.score(test_data_features, y_test)

0.8867924528301887

In [93]:
coef_list = lr.coef_.tolist()
coef_list = coef_list[0]

coef_df = pd.DataFrame({'features' : vectorizer.get_feature_names(),
                       'coefs' : coef_list})

coef_df.sort_values(by = ['coefs'])

Unnamed: 0,features,coefs
1568,lebron,-2.298192
1518,lakers,-1.972501
4390,shaq,-1.084512
2023,player,-1.041755
543,championship,-1.036300
...,...,...
1399,jets,1.343804
463,browns,1.507184
953,falcons,1.517352
1267,highlight,1.788039


### Decision Tree

In [94]:
from sklearn.tree import DecisionTreeClassifier

In [95]:
tree = DecisionTreeClassifier()

In [96]:
tree.fit(train_data_features, y_train)

DecisionTreeClassifier()

In [97]:
tree.score(train_data_features, y_train)

0.9992138364779874

In [98]:
tree.score(test_data_features, y_test)

0.8372641509433962

### Random Forest

In [99]:
from sklearn.ensemble import RandomForestClassifier

In [100]:
forest = RandomForestClassifier(n_estimators=100)

In [101]:
forest.fit(train_data_features, y_train)

RandomForestClassifier()

In [102]:
forest.score(train_data_features, y_train)

0.9992138364779874

In [103]:
forest.score(test_data_features, y_test)

0.8867924528301887

###  Matrix on Logistic Regression

In [104]:
from sklearn.metrics import confusion_matrix

In [105]:
y_pred = lr.predict(test_data_features)

In [106]:
cm = confusion_matrix(y_test, y_pred)

In [107]:
cm_df = pd.DataFrame(cm,
                    columns=['predict_neg', 'predict_pos'],
                    index = ['actual_neg', 'actual_pos'])

In [108]:
cm_df

Unnamed: 0,predict_neg,predict_pos
actual_neg,196,11
actual_pos,37,180


## Checking where our model failed

In [109]:
comparison_df = pd.DataFrame({'y_actual' : y_test,
             'y_predicted' : y_pred})

In [110]:
mismatch_df = comparison_df[comparison_df['y_actual'] != comparison_df['y_predicted']]

In [111]:
mismatch2_df = pd.concat([mismatch_df, X_test], axis = 1)

In [112]:
# All incorrect predictions with titles

mismatches = mismatch2_df.dropna()

In [113]:
mismatches

Unnamed: 0,y_actual,y_predicted,title
t3_j9cbgl,nfl,nba,younghoe koo has his first missed onside kick of his career he was previously 5 5 on onside successes
t3_jb363j,nba,nfl,all of this happened in one hawks grizzlies game this year
t3_jc3blu,nfl,nba,rams showcase jalen ramsey s versatility see him as lebron james type player
t3_jb331b,nfl,nba,nfloncbs 231 nfl players have been targeted at least 8 times this season there is only one player who has netted a perfect passer rating 158 3 when targeted nelson agholor
t3_jbtels,nfl,nba,evolution of every nfl team s stadium video
t3_jbaomy,nfl,nba,what teams are playing above below their record
t3_jbadba,nba,nfl,three teams during the 2019 20 season had back to back number one overall picks on their roster
t3_jcdgb8,nba,nfl,highlight emotional jr smith teary eyed after winning the championship
t3_jbqosv,nba,nfl,highlight trevor booker makes one of the most insane circus shots in nba history
t3_jcche7,nfl,nba,free talk friday


### Let's try TF-IDF

Term Frequency / Inverse Document Frequency

TF(w) = (Number of times term w appears in a document) / (Total number of terms in the document)

IDF(w) = log_e(Total number of documents / Number of documents with term w in it)

In [114]:
tfidf_vec = TfidfVectorizer(stop_words=stop_words,
                            max_features=5000,
                            ngram_range=(1, 3))

In [115]:
train_data_features = tfidf_vec.fit_transform(clean_train_data)

test_data_features = tfidf_vec.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

((1272, 5000), (424, 5000))

In [116]:
lr.fit(train_data_features, y_train)

LogisticRegression()

In [117]:
lr.score(train_data_features, y_train)

0.9858490566037735

In [118]:
lr.score(test_data_features, y_test)

0.9009433962264151

### Let's try on some other subreddits

In [119]:
train = pd.concat([politics_test, conservative_test])

In [120]:
X = train[['title']]
y = train['subreddit']

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [122]:
politics_test = politics_test.drop(columns='selftext')
conservative_test = conservative_test.drop(columns='selftext')

train = pd.concat([politics_test, conservative_test])
# tokenizer = RegexpTokenizer(r'\w+')
tokenizer = TweetTokenizer()

train['title'] = train['title'].map(lambda x: tokenizer.tokenize(x.lower()))
train['title'] = train['title'].map(lambda x: ' '.join(x))

In [123]:
# create our training data list - this is a list of strings, with each string being a post title

clean_train_data = []

for traindata in X_train['title']:
    clean_train_data.append(traindata)
    
    
# create test data list

clean_test_data = []

for testdata in X_test['title']:
    clean_test_data.append(testdata)

In [124]:
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=5000,
                             ngram_range=(1, 3))

train_data_features = vectorizer.fit_transform(clean_train_data)

test_data_features = vectorizer.transform(clean_test_data)

train_data_features = train_data_features.toarray()

train_data_features.shape, test_data_features.shape

vocab = vectorizer.get_feature_names()

### Modeling

In [125]:
lr = LogisticRegression(penalty = 'l2')

In [126]:
train_data_features.shape, y_train.shape

((1224, 5000), (1224,))

In [127]:
lr.fit(train_data_features, y_train)

lr.score(train_data_features, y_train)

0.9852941176470589

In [128]:
lr.score(test_data_features, y_test)

0.7671568627450981

In [129]:
coef_list = lr.coef_.tolist()

coef_list = coef_list[0]

In [130]:
coef_df = pd.DataFrame({'features' : vectorizer.get_feature_names(),
                       'coefs' : coef_list})

coef_df = coef_df.sort_values(by = ['coefs'])
coef_df

Unnamed: 0,features,coefs
4756,twitter,-1.356453
1893,media,-1.182694
1395,hunter,-1.134343
374,big tech,-1.008090
4728,trump win,-0.963823
...,...,...
1684,mail,0.893423
3052,presidency,0.899607
3276,qanon,0.925481
31,2020,1.063377


In [131]:
coef_df.iloc[2400:2500]

Unnamed: 0,features,coefs
3996,republican skepticism,-0.004286
3997,republican skepticism big,-0.004286
696,conspiracy theory,-0.004084
1862,mcenany locked,-0.004051
1863,mcenany locked twitter,-0.004051
...,...,...
2550,nominee scrap affordable,0.010367
3386,quotidian reality,0.010367
2559,nonexistent religious,0.010367
2560,nonexistent religious bigotry,0.010367


In [132]:
coef_df.tail(20)

Unnamed: 0,features,coefs
1276,groups,0.682667
4184,rudy,0.689116
918,donald,0.69026
1446,intel,0.692348
1526,justice,0.69731
4000,republicans,0.701832
122,ahead,0.705749
641,climate,0.707644
1585,lead,0.709526
1095,fake,0.738343


In [133]:
coef_df.head(20)

Unnamed: 0,features,coefs
4756,twitter,-1.356453
1893,media,-1.182694
1395,hunter,-1.134343
374,big tech,-1.00809
4728,trump win,-0.963823
384,black,-0.946078
459,burisma,-0.928126
771,cuomo,-0.900558
4447,stop,-0.88533
159,america,-0.880326
