In [61]:
import requests
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk import FreqDist
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from xgboost import XGBClassifier
import spacy
from spacy import displacy

import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from transformers import pipeline

from nltk.sentiment import sentiment_analyzer, vader, SentimentAnalyzer, SentimentIntensityAnalyzer

### Getting data from LA Subreddit ###

In [2]:
url_submissions = 'https://api.pushshift.io/reddit/search/submission'
url_comments = 'https://api.pushshift.io/reddit/search/comment'

In [3]:
LA_posts_cols = ['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext', 'link_flair_text',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'removed_by_category', 'retrieved_on', 'score', 'selftext',
       'send_replies', 'spoiler', 'steward_reports', 'stickied', 'subreddit',
       'subreddit_id', 'subreddit_subscribers', 'subreddit_type', 'thumbnail',
       'title', 'total_awards_received', 'url', 'whitelist_status', 'wls',
       'author_flair_background_color', 'author_flair_text_color',
       'link_flair_css_class', 'author_flair_template_id', 'crosspost_parent',
       'crosspost_parent_list', 'post_hint', 'preview', 'thumbnail_height',
       'thumbnail_width', 'media_metadata', 'media', 'media_embed',
       'secure_media', 'secure_media_embed', 'suggested_sort']

In [4]:
LA_posts_df = pd.DataFrame(columns = LA_posts_cols)
before_params = [1577865600] # first timestamp is equivalent to midnight PT on 01/01/2020 
for i in range(1, 21):
    resp = requests.get(url_submissions, params = {'subreddit': 'LosAngeles','size': 100,'before': before_params[-1]})
    try: 
        batch = pd.DataFrame(resp.json()['data'])
        LA_posts_df = pd.concat([LA_posts_df, batch], axis = 0 )
        before_params.append(batch['created_utc'].min())
        time.sleep(2)
    except:
        print(resp.status_code)

502


In [5]:
LA_posts_df.shape

(1900, 80)

In [6]:
LA_posts_df['title_selftext'] = LA_posts_df['title']+LA_posts_df['selftext']
LA_posts_df[['title_selftext', 'title', 'selftext']].head()

Unnamed: 0,title_selftext,title,selftext
0,Anyone in the LA area want to hang tomorrow?[r...,Anyone in the LA area want to hang tomorrow?,[removed]
1,Father’s 1985 Toyota MR2 stolen this AM from S...,Father’s 1985 Toyota MR2 stolen this AM from SFV,My dad’s MR2 was stolen from the front of his ...
2,Why does Downtown Los Angeles have so many lux...,Why does Downtown Los Angeles have so many lux...,
3,"Mt baldythinking of heading there tomorrow,cha...",Mt baldy,"thinking of heading there tomorrow,chains need..."
4,20//20,20//20,


In [7]:
LA_comments_cols = ['all_awardings', 'associated_award', 'author',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders',
       'body', 'collapsed_because_crowd_control', 'created_utc', 'gildings',
       'id', 'is_submitter', 'link_id', 'locked', 'no_follow', 'parent_id',
       'permalink', 'retrieved_on', 'score', 'send_replies', 'steward_reports',
       'stickied', 'subreddit', 'subreddit_id', 'total_awards_received',
       'distinguished']

In [8]:
LA_comments_df = pd.DataFrame(columns = LA_comments_cols)
before_params = [1577865600] # first timestamp is equivalent to midnight PT on 01/01/2020 
for i in range(1, 21):
    resp = requests.get(url_comments, params = {'subreddit': 'LosAngeles','size': 100,'before': before_params[-1]})
    try: 
        batch = pd.DataFrame(resp.json()['data'])
        LA_comments_df = pd.concat([LA_comments_df, batch], axis = 0 )
        before_params.append(batch['created_utc'].min())
        time.sleep(2)
    except:
        print(resp.status_code)

In [9]:
LA_comments_df.shape

(2000, 35)

### Getting data from BOS Subreddit ###

In [10]:
BOS_posts_cols = ['all_awardings', 'allow_live_comments', 'author',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_text', 'author_flair_text_color',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler',
       'steward_reports', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_subscribers', 'subreddit_type', 'thumbnail', 'title',
       'total_awards_received', 'url', 'whitelist_status', 'wls',
       'link_flair_css_class', 'link_flair_template_id', 'link_flair_text',
       'author_flair_template_id', 'post_hint', 'preview', 'thumbnail_height',
       'thumbnail_width', 'removed_by_category', 'media', 'secure_media',
       'crosspost_parent', 'crosspost_parent_list', 'media_embed',
       'secure_media_embed']

In [11]:
BOS_posts_df = pd.DataFrame(columns = BOS_posts_cols)
before_params = [1577872800] # first timestamp is equivalent to midnight ET on 01/01/2020 
for i in range(1, 21):
    resp = requests.get(url_submissions, params = {'subreddit': 'boston','size': 100,'before': before_params[-1]})
    try: 
        batch = pd.DataFrame(resp.json()['data'])
        BOS_posts_df = pd.concat([BOS_posts_df, batch], axis = 0 )
        before_params.append(batch['created_utc'].min())
        time.sleep(2)
    except:
        print(resp.status_code)

502


In [12]:
BOS_posts_df.shape

(1900, 76)

In [13]:
BOS_posts_df['title_selftext'] = BOS_posts_df['title']+BOS_posts_df['selftext']
BOS_posts_df[['title_selftext', 'title', 'selftext']].head()

Unnamed: 0,title_selftext,title,selftext
0,Fireworks from the Seaport,Fireworks from the Seaport,
1,Missed the last train home from north station ...,Missed the last train home from north station :(,What are my options and I can’t afford an Uber...
2,Moving To BostonHello people of Boston I’m fro...,Moving To Boston,Hello people of Boston I’m from the south shor...
3,$71 to go from the North End to the South End,$71 to go from the North End to the South End,
4,Boston First Night Family Fireworks,Boston First Night Family Fireworks,


In [14]:
BOS_comments_cols = ['all_awardings', 'associated_award', 'author',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders',
       'body', 'collapsed_because_crowd_control', 'created_utc', 'gildings',
       'id', 'is_submitter', 'link_id', 'locked', 'no_follow', 'parent_id',
       'permalink', 'retrieved_on', 'score', 'send_replies', 'steward_reports',
       'stickied', 'subreddit', 'subreddit_id', 'total_awards_received',
       'author_cakeday']

In [15]:
BOS_comments_df = pd.DataFrame(columns = BOS_comments_cols)
before_params = [1577865600] # first timestamp is equivalent to midnight PT on 01/01/2020 
for i in range(1, 21):
    resp = requests.get(url_comments, params = {'subreddit': 'boston','size': 100,'before': before_params[-1]})
    try: 
        batch = pd.DataFrame(resp.json()['data'])
        BOS_comments_df = pd.concat([BOS_comments_df, batch], axis = 0 )
        before_params.append(batch['created_utc'].min())
        time.sleep(2)
    except:
        print(resp.status_code)

502


In [16]:
BOS_comments_df.shape

(1900, 35)

In [None]:
BOS_comments_df['body']

**SAVING DATA AS PULLED**

In [17]:
BOS_comments_df.to_csv('bos_com.csv')
BOS_posts_df.to_csv('bos_pos.csv')
LA_comments_df.to_csv('la_com.csv')
LA_posts_df.to_csv('la_pos.csv')

**RETRIEVING DATA AS PULLED**

In [109]:
BOS_comments_df = pd.read_csv('bos_com.csv')
BOS_posts_df = pd.read_csv('bos_pos.csv')
LA_comments_df = pd.read_csv('la_com.csv')
LA_posts_df = pd.read_csv('la_pos.csv')

### Dealing with stop words

In [18]:
stops = stopwords.words('english')
additions = ["!","?",".",":",";", ",", "\'", "\"","*", "'", '"', "[", ']', '(', ")", '’']
stops.extend(additions)
#len(stops)

_More to be done here later_

### Building corpora

In [19]:
BOS_comments_corpus = ''
for i in range(len(BOS_comments_df)):
    BOS_comments_corpus+=(BOS_comments_df.iloc[i,list(BOS_comments_df.columns).index('body')])

LA_comments_corpus = ''
for i in range(len(LA_comments_df)):
    LA_comments_corpus+=(LA_comments_df.iloc[i,list(LA_comments_df.columns).index('body')])

BOS_posts_corpus = ''
for i in range(len(BOS_posts_df)):
    BOS_posts_corpus+=(BOS_posts_df.iloc[i,list(BOS_posts_df.columns).index('title_selftext')])
    
LA_posts_corpus = ''
for i in range(len(LA_posts_df)):
    LA_posts_corpus+=str(LA_posts_df.iloc[i,list(LA_posts_df.columns).index('title_selftext')])

### Count Vectorizer and dumb model on just comments

In [None]:
X = pd.concat([BOS_comments_df['body'], LA_comments_df['body']], axis = 0)

In [None]:
X = pd.concat([BOS_comments_df['body'], LA_comments_df['body']], axis = 0)

In [None]:
X.shape

In [None]:
y = np.concatenate([np.zeros_like(BOS_comments_df['body']), np.ones_like(LA_comments_df['body'])]).astype(int)

In [None]:
type(y)

In [None]:
train_X, test_X, train_y, test_y = train_test_split(X,y)

In [None]:
cvec = CountVectorizer(stop_words='english', lowercase = True, max_features=10_000)
train_X_vec = cvec.fit_transform(train_X)
test_X_vec = cvec.transform(test_X)

In [None]:
train_y.mean()

In [None]:
test_y.mean()

In [None]:
mnb = MultinomialNB()

In [None]:
mnb.fit(train_X_vec, train_y)
mnb.score(train_X_vec, train_y)

In [None]:
mnb.score(test_X_vec, test_y)

### Removing location-specific references

In [21]:
en_mdl = spacy.load('en_core_web_md')
nlp = spacy.load('en_core_web_sm')

In [22]:
displacy.render(nlp(str('I miss Boston sometimes')), jupyter=True, style='ent')

In [23]:
doc = nlp(BOS_comments_corpus + BOS_posts_corpus)
bos_loc_gpe = []
for ent in doc.ents:
    if ent.label_ in ['LOC', 'GPE']:
        bos_loc_gpe.append(ent.text)

BOS_locations = list(set(bos_loc_gpe))
BOS_locations.append('Harvard') # adding some which were not detected by Spacy NER

In [34]:
poppers =  ['Airbnb', 'Aquarium', 'Atlanta', 'Austin', 'Australia', 'Automobiles',           
            'Billy','Black', 'Brno', 
           'California', 'Canada','Carolinas','Central','Charlotte', 'Chicago','Chlamydia', 'Comcast','Czech Republic',
           'DC','Davis','Dec 21.Found', 'Detroit', 'Downtown', 'Dubai',
           'East', 'England',
           'Florida','Hong Kong','Hotel','Houston','Hyde Park',
           'Jan', 'Japan', 'Kansas', 'Kansas City', 'Karaoke',
           'Meridian', 'Miami', 'Midwest','Minecraft!Just','Mississippi','Mueller','Mumbai',
            'NH','NY','Naples','Nashville',
           'States', 'Storrow', 'Suffolk County', 'Sweden','Syracuse', 'Syria', 
           'Tennessee','Texas',"The Combat Zoneit's",'Toronto',
            'U.K.','US','USA','Venmo', 'Vermont', 'Washington', 'Washington DC']
for popper in poppers:
    BOS_locations.remove(popper) #removing some which are not BOS specific

In [None]:
#BOS_locations.sort()
#BOS_locations

In [35]:
doc = nlp(LA_comments_corpus + LA_posts_corpus)
la_loc_gpe = []
for ent in doc.ents:
    if ent.label_ in ['LOC', 'GPE']:
        la_loc_gpe.append(ent.text)

LA_locations = list(set(la_loc_gpe))
la_adds = ["Westside", "Los", "Angeles"]
for add in la_adds:
    LA_locations.append(add)

In [36]:
la_poppers = ['Airbnb','Alabama', 'America', 'Alexandria', 'AmericaIt', 'Canada', 'Bernie', 'Building', 'California',
              'Central America', 'Chicago', 'Cincinnati', 'Colorado', 'Costa Rica', 'DC', 'Dallas', 'Denver','East',
              'Idaho', 'Indiana','Iowa','Japan', 'Korea', 'Las Vegas', 'London',
              'Midwest', 'Mueller', 'Texas', 'US', 'USA', 'Vermont', 'Washington', 'Washington DC', 'the United States',
              'Vancouver', 'Vegas', 'boston',  'Tokyo','Tsunamis', 'U.S.','United State','United States Of America']
for popper in la_poppers:
    LA_locations.remove(popper) #removing some which are not LA specific

In [None]:
#LA_locations.sort()
#LA_locations

In [37]:
LA_comments_df['body_np'] = LA_comments_df['body']
for loc in LA_locations:
    LA_comments_df['body_np'] = LA_comments_df['body_np'].apply(lambda x: x.replace(loc, "PLACE"))

In [38]:
BOS_comments_df['body_np'] = BOS_comments_df['body']
for loc in BOS_locations:
    BOS_comments_df['body_np'] = BOS_comments_df['body_np'].apply(lambda x: x.replace(loc, "PLACE"))

**SAVING DATA WITHOUT LOCATION REFERENCES**

In [17]:
BOS_comments_df.to_csv('bos_com.csv')
BOS_posts_df.to_csv('bos_pos.csv')
LA_comments_df.to_csv('la_com.csv')
LA_posts_df.to_csv('la_pos.csv')

### WHAT ABOUT WITHOUT LOCATION NAMES?

In [None]:
X = pd.concat([BOS_comments_df['body_np'], LA_comments_df['body_np']], axis = 0)

In [None]:
X.shape

In [None]:
y = np.concatenate([np.zeros_like(BOS_comments_df['body_np']), np.ones_like(LA_comments_df['body_np'])]).astype(int)

In [None]:
type(y)

In [None]:
train_X, test_X, train_y, test_y = train_test_split(X,y)

In [None]:
cvec = CountVectorizer(stop_words='english', lowercase = True, max_features=10_000)
train_X_vec = cvec.fit_transform(train_X)
test_X_vec = cvec.transform(test_X)

In [None]:
train_y.mean()

In [None]:
test_y.mean()

In [None]:
mnb = MultinomialNB()

In [None]:
mnb.fit(train_X_vec, train_y)
mnb.score(train_X_vec, train_y)

### NOW ON ACTUAL POSTS

In [None]:
LA_posts_df['title_selftext_np'] = LA_posts_df['title_selftext']
for loc in LA_locations:
    LA_posts_df['title_selftext_np'] = LA_posts_df['title_selftext_np'].apply(lambda x: str(x).replace(loc, "PLACE"))

In [None]:
BOS_posts_df['title_selftext_np'] = BOS_posts_df['title_selftext']
for loc in BOS_locations:
    BOS_posts_df['title_selftext_np'] = BOS_posts_df['title_selftext_np'].apply(lambda x: x.replace(loc, "PLACE"))

In [None]:
X = pd.concat([BOS_posts_df['title_selftext_np'].dropna(), LA_posts_df['title_selftext_np'].dropna()], axis = 0)

In [None]:
X.shape

In [None]:
np.ones_like(LA_posts_df['title_selftext_np'].dropna()).sum()

In [None]:
X.dropna(inplace = True)

In [None]:
y = np.concatenate([np.zeros_like(BOS_posts_df['title_selftext_np'].dropna()), np.ones_like(LA_posts_df['title_selftext_np'].dropna())]).astype(int)

In [None]:
y.shape

In [None]:
train_X, test_X, train_y, test_y = train_test_split(X,y)

In [None]:
cvec = CountVectorizer(stop_words='english', lowercase = True, ngram_range=(1,2))#max_features=10_000)
train_X_vec = cvec.fit_transform(train_X)
test_X_vec = cvec.transform(test_X)

In [None]:
train_y.mean()

In [None]:
test_y.mean()

In [None]:
mnb = MultinomialNB()

In [None]:
mnb.fit(train_X_vec, train_y)
mnb.score(train_X_vec, train_y)

In [None]:
mnb.score(test_X_vec, test_y)

**POSTS AND COMMENTS WITH LOCATIONS**

In [24]:
X = pd.concat([BOS_posts_df['title_selftext'].dropna(), BOS_comments_df['body'].dropna(), LA_posts_df['title_selftext'].dropna(), LA_comments_df['body'].dropna()], axis = 0)

In [None]:
X.shape

In [None]:
X.dropna(inplace = True)

In [25]:
y = np.concatenate([np.zeros_like(BOS_posts_df['title_selftext'].dropna()),
                    np.zeros_like(BOS_comments_df['body'].dropna()),
                    np.ones_like(LA_posts_df['title_selftext'].dropna()),
                    np.ones_like(LA_comments_df['body'].dropna())]).astype(int)

In [None]:
y.shape

In [26]:
train_X, test_X, train_y, test_y = train_test_split(X,y)

In [27]:
cvec = CountVectorizer(stop_words='english', lowercase = True, ngram_range=(1,2))#max_features=10_000)
train_X_vec = cvec.fit_transform(train_X)
test_X_vec = cvec.transform(test_X)

In [28]:
train_y.mean()

0.4974974974974975

In [29]:
test_y.mean()

0.506

In [30]:
mnb = MultinomialNB()

In [31]:
mnb.fit(train_X_vec, train_y)
mnb.score(train_X_vec, train_y)

0.968968968968969

In [32]:
mnb.score(test_X_vec, test_y)

0.774

### NOW ON POSTS and COMMENTS with no LOC GPS references

In [40]:
X = pd.concat([BOS_posts_df['title_selftext_np'].dropna(), BOS_comments_df['body_np'].dropna(), LA_posts_df['title_selftext_np'].dropna(), LA_comments_df['body_np'].dropna()], axis = 0)

In [41]:
X.shape

(3997,)

In [42]:
X.dropna(inplace = True)

In [43]:
y = np.concatenate([np.zeros_like(BOS_posts_df['title_selftext_np'].dropna()),
                    np.zeros_like(BOS_comments_df['body_np'].dropna()),
                    np.ones_like(LA_posts_df['title_selftext_np'].dropna()),
                    np.ones_like(LA_comments_df['body_np'].dropna())]).astype(int)

In [44]:
y.shape

(3997,)

In [45]:
train_X, test_X, train_y, test_y = train_test_split(X,y)

In [46]:
cvec = CountVectorizer(stop_words='english', lowercase = True, ngram_range=(1,2))#max_features=10_000)
train_X_vec = cvec.fit_transform(train_X)
test_X_vec = cvec.transform(test_X)

In [47]:
train_y.mean()

0.5028361695028362

In [48]:
test_y.mean()

0.49

In [49]:
mnb = MultinomialNB()

In [50]:
mnb.fit(train_X_vec, train_y)
mnb.score(train_X_vec, train_y)

0.9669669669669669

In [51]:
mnb.score(test_X_vec, test_y)

0.725

In [None]:
plot_confusion_matrix(mnb, test_X_vec, test_y);

In [None]:
X.iloc[[i for i in range(len(mnb.predict(test_X_vec))) if mnb.predict(test_X_vec[i]) == 1 and test_y[i] ==0]]

In [52]:
X_vec = cvec.transform(X)

In [53]:
params = {'alpha': np.linspace(0,1,10)}
mnb_gs = GridSearchCV(mnb, params, n_jobs = 5, verbose = 1, cv = 5)

mnb_gs.fit(X_vec, y)
print(f"Best CV score of {round(mnb_gs.best_score_,4)} achieved with alpha = {round(mnb_gs.best_params_['alpha'],4)}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.


Best CV score of 0.6767 achieved with alpha = 0.2222


[Parallel(n_jobs=5)]: Done  50 out of  50 | elapsed:    1.6s finished


In [54]:
cvec_mnb_pipe = Pipeline([('cvec', CountVectorizer()), 
                          ('mnb', MultinomialNB())])

cvec_mnb_params = {'cvec__stop_words': ['english', None],
                    'cvec__ngram_range': [(1,1), (1,2), (2,2), (1,3), (3,3)],
                  'cvec__max_features': [5_000, 10_000, None],
                    'cvec__max_df': [0.7, 0.8, 0.9, None],
                  'mnb__alpha': np.linspace(0,1,10)}

cvec_mnb_gs = GridSearchCV(cvec_mnb_pipe, cvec_mnb_params, cv = 5, verbose = 1, n_jobs = 5)
cvec_mnb_gs.fit(X,y)
cvec_mnb_gs.best_score_

Fitting 5 folds for each of 900 candidates, totalling 4500 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:    2.5s
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:   16.7s
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed:   52.3s
[Parallel(n_jobs=5)]: Done 790 tasks      | elapsed:  1.6min
[Parallel(n_jobs=5)]: Done 1240 tasks      | elapsed:  2.7min
[Parallel(n_jobs=5)]: Done 1790 tasks      | elapsed:  4.4min
[Parallel(n_jobs=5)]: Done 2440 tasks      | elapsed:  6.1min
[Parallel(n_jobs=5)]: Done 3190 tasks      | elapsed:  8.3min
[Parallel(n_jobs=5)]: Done 4040 tasks      | elapsed: 10.1min
[Parallel(n_jobs=5)]: Done 4500 out of 4500 | elapsed: 11.6min finished


0.6734937421777222

In [55]:
cvec_mnb_gs.best_params_

{'cvec__max_df': 0.7,
 'cvec__max_features': None,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'mnb__alpha': 0.2222222222222222}

In [56]:
tfidf = TfidfVectorizer()

In [57]:
tfidf_mnb_pipe = Pipeline([('tfidf', TfidfVectorizer()), 
                          ('mnb', MultinomialNB())])

tfidf_mnb_params = {'tfidf__stop_words': ['english', None],
                    'tfidf__ngram_range': [(1,1), (1,2), (2,2), (1,3), (3,3)],
                  'tfidf__max_features': [5_000, 10_000, None],
                    'tfidf__max_df': [0.7, 0.8, 0.9, None],
                  'mnb__alpha': np.linspace(0,1,10)}

tfidf_mnb_gs = GridSearchCV(tfidf_mnb_pipe, tfidf_mnb_params, cv = 5, verbose = 1, n_jobs = 5)
tfidf_mnb_gs.fit(X,y)
tfidf_mnb_gs.best_score_

Fitting 5 folds for each of 1200 candidates, totalling 6000 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:    5.9s
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:   22.6s
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed:   59.1s
[Parallel(n_jobs=5)]: Done 790 tasks      | elapsed:  1.7min
[Parallel(n_jobs=5)]: Done 1240 tasks      | elapsed:  2.7min
[Parallel(n_jobs=5)]: Done 1790 tasks      | elapsed:  4.1min
[Parallel(n_jobs=5)]: Done 2440 tasks      | elapsed:  5.7min
[Parallel(n_jobs=5)]: Done 3190 tasks      | elapsed:  7.5min
[Parallel(n_jobs=5)]: Done 4040 tasks      | elapsed:  9.4min
[Parallel(n_jobs=5)]: Done 4990 tasks      | elapsed: 11.6min
[Parallel(n_jobs=5)]: Done 6000 out of 6000 | elapsed: 13.8min finished


0.674241551939925

In [58]:
tfidf_mnb_gs.best_params_

{'mnb__alpha': 0.1111111111111111,
 'tfidf__max_df': 0.7,
 'tfidf__max_features': None,
 'tfidf__ngram_range': (1, 2),
 'tfidf__stop_words': 'english'}

### MESSING WITH SENTIMENT

### Sentiment analysis

In [None]:
sia = SentimentIntensityAnalyzer()

sia.polarity_scores(BOS_comments_corpus_noprop)

In [None]:
sia.polarity_scores(LA_comments_corpus_noprop)

In [None]:
sia.polarity_scores(BOS_posts_corpus+BOS_comments_corpus)

In [None]:
sia.polarity_scores(LA_posts_corpus+LA_comments_corpus)

In [None]:
BOS_posts_df['sia_neg'] = BOS_posts_df['title_selftext_np'].apply(lambda x: sia.polarity_scores(x)['neg'])
BOS_posts_df['sia_pos'] = BOS_posts_df['title_selftext_np'].apply(lambda x: sia.polarity_scores(x)['pos'])
LA_posts_df['sia_neg'] = LA_posts_df['title_selftext_np'].apply(lambda x: sia.polarity_scores(x)['neg'])
LA_posts_df['sia_pos'] = LA_posts_df['title_selftext_np'].apply(lambda x: sia.polarity_scores(x)['pos'])

BOS_comments_df['sia_neg'] = BOS_comments_df['body_np'].apply(lambda x: sia.polarity_scores(x)['neg'])
BOS_comments_df['sia_pos'] = BOS_comments_df['body_np'].apply(lambda x: sia.polarity_scores(x)['pos'])
LA_comments_df['sia_neg'] = LA_comments_df['body_np'].apply(lambda x: sia.polarity_scores(x)['neg'])
LA_comments_df['sia_pos'] = LA_comments_df['body_np'].apply(lambda x: sia.polarity_scores(x)['pos'])

In [None]:
plt.figure(figsize = (15,12))
plt.subplot(2,2,1)
plt.title('strength of negative sentiment in posts')
plt.hist(BOS_posts_df['sia_neg'], histtype = 'step', label = 'Boston')
plt.hist(LA_posts_df['sia_neg'], histtype = 'step', label = 'LA')
plt.axvline(sia.polarity_scores(BOS_posts_corpus)['neg'], ls = '--', lw = 1, color = 'blue')
plt.axvline(sia.polarity_scores(LA_posts_corpus)['neg'], ls = '--', lw = 1, color = 'orange')
plt.legend();
plt.subplot(2,2,2)
plt.title('strength of positive sentiment in posts')
plt.hist(BOS_posts_df['sia_pos'], histtype = 'step', label = 'Boston')
plt.hist(LA_posts_df['sia_pos'], histtype = 'step',label = 'LA')
plt.axvline(sia.polarity_scores(BOS_posts_corpus)['pos'], ls = '--', lw = 1,  color = 'blue')
plt.axvline(sia.polarity_scores(LA_posts_corpus)['pos'], ls = '--',lw = 1, color = 'orange')
plt.legend();
plt.subplot(2,2,3)
plt.title('strength of negative sentiment in comments')
plt.hist(BOS_comments_df['sia_neg'], histtype = 'step', label = 'Boston')
plt.hist(LA_comments_df['sia_neg'], histtype = 'step', label = 'LA')
plt.axvline(sia.polarity_scores(BOS_comments_corpus)['neg'], ls = '--', lw = 1, color = 'blue')
plt.axvline(sia.polarity_scores(LA_comments_corpus)['neg'], ls = '--', lw = 1, color = 'orange')
plt.legend();
plt.subplot(2,2,4)
plt.title('strength of positive sentiment in comments')
plt.hist(BOS_comments_df['sia_pos'], histtype = 'step', label = 'Boston')
plt.hist(LA_comments_df['sia_pos'], histtype = 'step',label = 'LA')
plt.axvline(sia.polarity_scores(BOS_comments_corpus)['pos'], ls = '--', lw = 1,  color = 'blue')
plt.axvline(sia.polarity_scores(LA_comments_corpus)['pos'], ls = '--',lw = 1, color = 'orange')
plt.legend();

In [62]:
senti = pipeline('sentiment-analysis')
senti('I have the worst boyfriend on the planet')

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertModel: ['classifier', 'pre_classifier', 'dropout_19']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
Some layers from the model checkpoint at distilbert-b

[{'label': 'NEGATIVE', 'score': 0.9997004866600037}]

In [63]:
BOS_posts_df['senti_pn'] = BOS_posts_df['title_selftext_np'].apply(lambda x: senti(x[:512])[0]['label'])
BOS_posts_df['senti_score'] = BOS_posts_df['title_selftext_np'].apply(lambda x: senti(x[:512])[0]['score'])

In [65]:
LA_posts_df['senti_pn'] = LA_posts_df['title_selftext_np'].apply(lambda x: senti(str(x)[:512])[0]['label'])
LA_posts_df['senti_score'] = LA_posts_df['title_selftext_np'].apply(lambda x: senti(str(x)[:512])[0]['score'])

In [66]:
BOS_comments_df['senti_pn'] = BOS_comments_df['body_np'].apply(lambda x: senti(str(x)[:512])[0]['label'])
BOS_comments_df['senti_score'] = BOS_comments_df['body_np'].apply(lambda x: senti(str(x)[:512])[0]['score'])

In [67]:
LA_comments_df['senti_pn'] = LA_comments_df['body_np'].apply(lambda x: senti(str(x)[:512])[0]['label'])
LA_comments_df['senti_score'] = LA_comments_df['body_np'].apply(lambda x: senti(str(x)[:512])[0]['score'])

In [79]:
len(BOS_posts_df[BOS_posts_df['senti_pn'] == 'POSITIVE'])/len(BOS_posts_df)

0.347

In [80]:
len(LA_posts_df[LA_posts_df['senti_pn'] == 'POSITIVE'])/len(LA_posts_df)

0.36

In [81]:
len(BOS_posts_df[BOS_posts_df['senti_pn'] == 'NEGATIVE'])/len(BOS_posts_df)

0.561

In [82]:
len(LA_posts_df[LA_posts_df['senti_pn'] == 'NEGATIVE'])/len(LA_posts_df)

0.572

In [118]:
BOS_posts_df.head().iloc[:,-5:]

Unnamed: 0,sia_neg,sia_pos,senti_pn,senti_score,senti_pn_bin
0,0.0,0.0,POSITIVE,0.999072,1
1,0.109,0.0,NEGATIVE,-0.999253,-1
2,0.02,0.137,NEUTRAL,0.0,0
3,0.0,0.0,POSITIVE,0.76865,1
4,0.0,0.0,POSITIVE,0.998603,1


In [111]:
LA_posts_df['senti_score'].fillna(0, inplace = True)
LA_comments_df['senti_score'].fillna(0, inplace = True)
BOS_posts_df['senti_score'].fillna(0, inplace = True)
BOS_comments_df['senti_score'].fillna(0, inplace = True)

In [113]:
LA_posts_df['senti_pn'].fillna('NEUTRAL', inplace = True)
LA_comments_df['senti_pn'].fillna('NEUTRAL', inplace = True)
BOS_posts_df['senti_pn'].fillna('NEUTRAL', inplace = True)
BOS_comments_df['senti_pn'].fillna('NEUTRAL', inplace = True)

In [115]:
BOS_posts_df['senti_pn_bin'] = BOS_posts_df['senti_pn'].map({'POSITIVE': 1, 'NEUTRAL': 0, 'NEGATIVE': -1})
BOS_comments_df['senti_pn_bin'] = BOS_comments_df['senti_pn'].map({'POSITIVE': 1, 'NEUTRAL': 0, 'NEGATIVE': -1})
LA_posts_df['senti_pn_bin'] = LA_posts_df['senti_pn'].map({'POSITIVE': 1, 'NEUTRAL': 0, 'NEGATIVE': -1})
LA_comments_df['senti_pn_bin'] = LA_comments_df['senti_pn'].map({'POSITIVE': 1, 'NEUTRAL': 0, 'NEGATIVE': -1})

In [117]:
LA_posts_df['senti_score'] = LA_posts_df['senti_pn_bin']*LA_posts_df['senti_score']
LA_comments_df['senti_score']= LA_comments_df['senti_pn_bin']*LA_comments_df['senti_score']
BOS_posts_df['senti_score'] = BOS_posts_df['senti_pn_bin']*BOS_posts_df['senti_score']
BOS_comments_df['senti_score'] = BOS_comments_df['senti_pn_bin']*BOS_comments_df['senti_score']

In [124]:
X = pd.concat([BOS_posts_df[['sia_neg', 'sia_pos', 'senti_score']], BOS_comments_df[['sia_neg', 'sia_pos', 'senti_score']],
              LA_posts_df[['sia_neg', 'sia_pos', 'senti_score']], LA_comments_df[['sia_neg', 'sia_pos', 'senti_score']]], axis = 0)

In [128]:
y = np.concatenate([np.zeros_like(BOS_posts_df.index),
                    np.zeros_like(BOS_comments_df.index),
                    np.ones_like(LA_posts_df.index),
                    np.ones_like(LA_comments_df.index)]).astype(int)

In [134]:
train_X, test_X,train_y, test_y = train_test_split(X,y)

In [135]:
logr = LogisticRegression()
logr.fit(train_X, train_y)
logr.score(train_X, train_y)

0.5223333333333333

In [136]:
logr.score(test_X, test_y)

0.518

**SAVING DATA WITH SENTIMENT SCORES**

In [101]:
BOS_comments_df.to_csv('bos_com.csv')
BOS_posts_df.to_csv('bos_pos.csv')
LA_comments_df.to_csv('la_com.csv')
LA_posts_df.to_csv('la_pos.csv')

**EXPLORATIVE ANALYSIS**

In [73]:
LA_comments_df[LA_posts_df['media_only']=='True']

  res_values = method(rvalues)


Unnamed: 0.1,Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,...,subreddit,subreddit_id,total_awards_received,distinguished,author_cakeday,body_np,sia_neg,sia_pos,senti_pn,senti_score


In [None]:
BOS_tokens = word_tokenize(BOS_posts_corpus+BOS_comments_corpus)

BOS_tokens_no_stops = [token for token in BOS_tokens if not token in stops]
BOS_tokens_no_stops_np = [token.lower() for token in BOS_tokens_no_stops if not token in BOS_locations]

FreqDist(BOS_tokens_no_stops_np).most_common(10)

In [None]:
LA_tokens = word_tokenize(LA_posts_corpus + LA_comments_corpus)

LA_tokens_no_stops = [token for token in LA_tokens if not token in stops]
LA_tokens_no_stops_np = [token.lower() for token in LA_tokens_no_stops if not token in LA_locations]

[word for word, freq in FreqDist(LA_tokens_no_stops_np).most_common(10)]

In [None]:
more_stops = [word for word,_ in FreqDist(BOS_tokens_no_stops_np).most_common(100) if word in [word for word, freq in FreqDist(LA_tokens_no_stops_np).most_common(100)]]

In [None]:
stops

In [None]:
more_stops

In [None]:
stops.extend(more_stops)

In [None]:
len(stops)

### DATA EXPORTS

In [None]:
BOS_comments_df.to_csv('bos_com.csv')
BOS_posts_df.to_csv('bos_pos.csv')
LA_comments_df.to_csv('la_com.csv')
LA_posts_df.to_csv('la_pos.csv')

### OTHER MODELS

**K-Nearest Neighbors**

In [None]:
knn = KNeighborsClassifier()
knn.fit(train_X_vec, train_y)
knn.score(train_X_vec, train_y)

In [None]:
knn.score(test_X_vec, test_y)

In [None]:
for k in range(1,11):
    knn = KNeighborsClassifier(k)
    print(f"k = {k} --> score = {round(cross_val_score(knn, X_vec, y).mean(),4)}")

In [None]:
cvec_mnb_pipe = Pipeline([('cvec', CountVectorizer()), 
                          ('mnb', MultinomialNB())])

cvec_mnb_params = {'cvec__stop_words': ['english', None],
                    'cvec__ngram_range': [(1,1), (1,2), (2,2), (1,3), (3,3)],
                  'cvec__max_features': [5_000, 10_000, None],
                    'cvec__max_df': [0.7, 0.8, 0.9],
                  'mnb__alpha': np.linspace(0,1,10)}

cvec_mnb_gs = GridSearchCV(cvec_mnb_pipe, cvec_mnb_params, cv = 5, verbose = 1, n_jobs = 5)
cvec_mnb_gs.fit(X,y)
cvec_mnb_gs.best_score_

**Random Forest Classifier**

In [None]:
RandomForestClassifier()

In [None]:
rfc = RandomForestClassifier()
rfc.fit(train_X_vec, train_y)
rfc.score(train_X_vec, train_y)

In [None]:
rfc.score(test_X_vec, test_y)

In [None]:
params = {'bootstrap': [True],
 'ccp_alpha': np.logspace(0,1,10),
 'criterion': ['gini'],
 'max_depth': [2,5,10,15],
 'min_samples_leaf': [1,2,3,4,5],
 'n_estimators': [25,50,75,100,125]}

rfc_gs = GridSearchCV(rfc, params, cv = 5, n_jobs = 5, verbose = 1)

rfc_gs.fit(X_vec, y)
rfc_gs.best_score_

In [None]:
rfc_gs.best_params_

In [None]:
cvec_mnb_pipe = Pipeline([('cvec', CountVectorizer()), 
                          ('mnb', MultinomialNB())])

cvec_mnb_params = {'cvec__stop_words': ['english', None],
                    'cvec__ngram_range': [(1,1), (1,2), (2,2), (1,3), (3,3)],
                  'cvec__max_features': [5_000, 10_000, None],
                    'cvec__max_df': [0.7, 0.8, 0.9],
                  'mnb__alpha': np.linspace(0,1,10)}

cvec_mnb_gs = GridSearchCV(cvec_mnb_pipe, cvec_mnb_params, cv = 5, verbose = 1, n_jobs = 5)
cvec_mnb_gs.fit(X,y)
cvec_mnb_gs.best_score_

### XGBC

In [None]:
xgbc = XGBClassifier(use_label_encoder=False, eval_metric = 'logloss')
xgbc.fit(train_X_vec, train_y)
xgbc.score(train_X_vec, train_y)

In [None]:
xgbc.score(test_X_vec, test_y)