In [5]:
# how to just get submissions

# /reddit/search/submission
import random
import time
from datetime import datetime

import pandas as pd
import requests
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

# custom helper functions for this project (helper_functions/nlp_helper_function.py)
from helper_functions.nlp_helper_function import get_df
from helper_functions.nlp_helper_function import pre_process


In [None]:
# gathering data for the respective subreddits 'AskPolitics' and 'Conspiracy'
# The CSVs are written to ../data/ because every later cell in this notebook
# reads from and writes to ../data/; writing to ./data/ would leave the load
# cells unable to find the freshly scraped files.
df = get_df('AskPolitics')
df.to_csv('../data/ask_politics.csv')
# NOTE: df_main holds the raw r/conspiracy scrape here; the name is reassigned
# later in the notebook to the merged, balanced frame.
df_main = get_df('conspiracy')
df_main.to_csv('../data/conspiracy.csv')

`get_df` is a helper function defined in `helper_functions/nlp_helper_function.py`. As a general overview, the function uses the `requests` library to gather data from the Reddit Pushshift API and is only able to gather the selftext and title text of up to 3,000 posts.

In [6]:
# Load the scraped r/AskPolitics submissions.
df_pol = pd.read_csv('../data/ask_politics.csv')
print('orignal political df shape: ',df_pol.shape)

# Rows whose selftext repeats an earlier row (the first occurrence is not flagged).
pol_is_repeat = df_pol['selftext'].duplicated()
# Placeholder bodies left out of the reported count. NaN compares unequal to
# every string, so repeated NaN bodies are still counted below.
pol_is_placeholder = df_pol['selftext'].isin(['', '[removed]', '[deleted]'])
print('shape of duplicates',df_pol[pol_is_repeat & ~pol_is_placeholder].shape)

# Keep only the first row for each distinct selftext. This drops ALL repeats,
# including the repeated NaN / '[removed]' / '[deleted]' bodies, which is why
# more rows disappear than the count printed above.
df_pol = df_pol.drop_duplicates(subset='selftext')
print('after removing duplicates',df_pol.shape)

orignal political df shape:  (2889, 85)
shape of duplicates (852, 85)
after removing duplicates (1984, 85)


In [7]:
# Load the scraped r/conspiracy submissions.
df_con = pd.read_csv('../data/conspiracy.csv')
print('orignal conspiracy df shape: ',df_con.shape)

# Rows whose selftext repeats an earlier row (the first occurrence is not flagged).
con_is_repeat = df_con['selftext'].duplicated()
# Placeholder bodies left out of the reported count. NaN compares unequal to
# every string, so repeated NaN bodies are still counted below.
con_is_placeholder = df_con['selftext'].isin(['', '[removed]', '[deleted]'])
print('shape of duplicates',df_con[con_is_repeat & ~con_is_placeholder].shape)

# Keep only the first row for each distinct selftext. This drops ALL repeats,
# including the repeated NaN / '[removed]' / '[deleted]' bodies, which is why
# more rows disappear than the count printed above.
df_con = df_con.drop_duplicates(subset='selftext')
print('after removing duplicates',df_con.shape)

orignal conspiracy df shape:  (2832, 79)
shape of duplicates (1678, 79)
after removing duplicates (1019, 79)


Most of these duplicates come from NaN values or from posts that were removed by the moderators. To maintain balanced classes for the future classification models to train on, both data sets are cut down to the size of the smallest one. In this case df_con, the smaller data set once the duplicated rows are dropped, will be used as the baseline so that both classes have an equal number of observations.

In [10]:
# creating target variables for each respective df
# (1 = r/conspiracy, 0 = r/AskPolitics)
df_con['target'] = 1
df_pol['target'] = 0

# isolating columns to be utilized in classification models
model_cols = ['selftext', 'title', 'target']

# balancing out these classes: keep the same number of leading rows from each
# subreddit, capped at 1,000 per class. The count is derived from the data
# instead of hard-coded tail() sizes, so the classes stay balanced even if the
# scraped CSVs change.
n_per_class = min(len(df_con), len(df_pol), 1000)
wrk_con = df_con[model_cols].head(n_per_class).copy()
wrk_pol = df_pol[model_cols].head(n_per_class).copy()
print('wrk_con columns: ',wrk_con.shape)
print('wrk_pol columns: ',wrk_pol.shape)

# storing as a clean csv for future eda (full de-duplicated frames, all columns)
df_con.to_csv('../data/clean_df_con.csv')
df_pol.to_csv('../data/clean_df_pol.csv')

# will now merge these two dfs: AskPolitics rows first, then conspiracy rows,
# with a fresh 0..n-1 index. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
df_main = pd.concat([wrk_pol, wrk_con], ignore_index=True)
df_main.head()


wrk_con columns:  (1000, 3)
wrk_pol columns:  (1000, 3)


Unnamed: 0,selftext,title,target
0,There are a lot of chauvinists in the U.S. who...,Is Kamala Harris more at risk of assassination...,0
1,Let me explain. The name of your movement shou...,"A push labeled ""Let mothers hold their babies""",0
2,I have read about some horrific pit bull attac...,Why don't we see politicians supporting pit bu...,0
3,Some international observers have argued that...,What threat does EU expansion pose towards Rus...,0
4,Every definition of the term implies cheating....,Why is gerrymandering legal?,0


---
Quick cleaning for a sentiment analysis. The cleaned data produced below will be used in the random forest exploration notebook.

In [15]:
# Work on an independent copy so that df_main keeps the uncleaned text.
df_main_cleaned = df_main.copy(deep=True)

# Preview the selftext column before any cleaning is applied.
df_main_cleaned.loc[:, 'selftext'].head()


0    There are a lot of chauvinists in the U.S. who...
1    Let me explain. The name of your movement shou...
2    I have read about some horrific pit bull attac...
3     Some international observers have argued that...
4    Every definition of the term implies cheating....
Name: selftext, dtype: object

In [25]:
# Overwrite both text columns with the output of the project's pre_process helper.
# NOTE(review): per the original note, pre_process removes emoticons, line
# breaks ('\n') and punctuation — confirm against the helper module.
for text_col in ('selftext', 'title'):
    df_main_cleaned[text_col] = df_main_cleaned[text_col].apply(pre_process)

# Save the cleaned frame without the index column, then preview it.
df_main_cleaned.to_csv('../data/cleaned_main.csv', index=False)
df_main_cleaned.head()

Unnamed: 0,selftext,title,target
0,There are a lot of chauvinists in the U S who ...,Is Kamala Harris more at risk of assassination...,0
1,Let me explain The name of your movement shoul...,A push labeled Let mothers hold their babies,0
2,I have read about some horrific pit bull attac...,Why don t we see politicians supporting pit bu...,0
3,Some international observers have argued that ...,What threat does EU expansion pose towards Russia,0
4,Every definition of the term implies cheating ...,Why is gerrymandering legal,0


In [17]:
# Plain Python list of the cleaned selftext bodies, in DataFrame row order.
corpus = df_main_cleaned['selftext'].tolist()
# Show the first entry (as a one-element list).
corpus[0:1]

['There are a lot of chauvinists in the U S who don t want to see a female president so I feel like Joe Biden is the least likely to be targeted president we ve had I would fear more for Harris s safety than Biden s since people wouldn t want her to be next in line']

In [18]:
# instantiate SIA (NLTK's VADER SentimentIntensityAnalyzer)
# Requires the NLTK 'vader_lexicon' resource to be installed, e.g. via
# nltk.download('vader_lexicon'); otherwise the constructor raises LookupError.
sia = SentimentIntensityAnalyzer()

# Get polarity scores for the second selftext post (index 1 of the corpus)
sia.polarity_scores(corpus[1])

{'neg': 0.099, 'neu': 0.775, 'pos': 0.126, 'compound': 0.4756}

In [19]:
# Build a DataFrame holding each post's text alongside its polarity scores.
# Every record is the score dict for one post with the post itself added under
# 'text', so the columns come out as neg, neu, pos, compound, text.
sentiment = [
    {**sia.polarity_scores(post), 'text': post}
    for post in corpus
]

main_sent_self = pd.DataFrame(sentiment)
main_sent_self.head()

Unnamed: 0,neg,neu,pos,compound,text
0,0.059,0.787,0.154,0.501,There are a lot of chauvinists in the U S who ...
1,0.099,0.775,0.126,0.4756,Let me explain The name of your movement shoul...
2,0.235,0.729,0.035,-0.9297,I have read about some horrific pit bull attac...
3,0.108,0.743,0.149,0.3919,Some international observers have argued that ...
4,0.146,0.854,0.0,-0.5574,Every definition of the term implies cheating ...


In [22]:
# The five posts with the highest positive-sentiment score ('pos'), largest first.
main_sent_self.sort_values('pos', ascending=False).head(5)

Unnamed: 0,neg,neu,pos,compound,text
519,0.0,0.0,1.0,0.4404,Thanks
1310,0.0,0.0,1.0,0.3182,I m curious
1066,0.0,0.169,0.831,0.5994,God Bless them t s t s
1674,0.0,0.247,0.753,0.7269,This is pretty funny k
336,0.0,0.286,0.714,0.3612,Thank you


In [23]:
# The five posts with the lowest positive-sentiment score ('pos'), smallest first.
main_sent_self.sort_values('pos').head(5)

Unnamed: 0,neg,neu,pos,compound,text
451,0.0,1.0,0.0,0.0,I don t get it
1196,0.0,1.0,0.0,0.0,Then I figured out because money laundering YAWN
1194,0.0,1.0,0.0,0.0,how to reverse it
1629,0.0,0.0,0.0,0.0,
694,0.0,1.0,0.0,0.0,Frankly I m starting to come to that conclusion


For the next stage in the process, please move to the EDA notebook.