# Reddit NLP Classification

### Import Libraries

In [42]:
import pandas as pd
import numpy as np
import scipy as stats

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import r2_score, confusion_matrix, roc_auc_score

import requests
import time
from bs4 import BeautifulSoup

import nltk
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.porter import PorterStemmer
import regex as re

### Read in dataframe

In [43]:
df = pd.read_pickle('./df.pkl')

In [44]:
df.head()

Unnamed: 0,id,title,author,created,selftext,url,subreddit
0,b98dv9,"Bibleman has been rebooted, and the villains o...",0,1554350000.0,,https://pureflix.com/series/267433510476/bible...,0
1,b9b45i,Roughly half of Americans think Christian nati...,0,1554370000.0,,https://www.lgbtqnation.com/2019/04/roughly-ha...,0
2,b9enrm,Anti-vaxxer ‘warrior mom’: If vaccines are so ...,0,1554390000.0,,http://deadstate.org/anti-vaxxer-warrior-mom-i...,0
3,b9dmqn,Megachurch preachers and their expensive sneak...,0,1554390000.0,,https://boingboing.net/2019/04/03/megachurch-p...,0
4,b95ydy,"Mormons say “Priesthood ban”, to describe thei...",0,1554340000.0,,https://www.dialoguejournal.com/wp-content/upl...,0


In [45]:
df.shape

(1914, 7)

## Classification

### Train-Test Split

In [46]:
X = df[['title', 'selftext', 'url']]
y = df['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify= y)

In [47]:
X.head()

Unnamed: 0,title,selftext,url
0,"Bibleman has been rebooted, and the villains o...",,https://pureflix.com/series/267433510476/bible...
1,Roughly half of Americans think Christian nati...,,https://www.lgbtqnation.com/2019/04/roughly-ha...
2,Anti-vaxxer ‘warrior mom’: If vaccines are so ...,,http://deadstate.org/anti-vaxxer-warrior-mom-i...
3,Megachurch preachers and their expensive sneak...,,https://boingboing.net/2019/04/03/megachurch-p...
4,"Mormons say “Priesthood ban”, to describe thei...",,https://www.dialoguejournal.com/wp-content/upl...


In [48]:
X_train.head()

Unnamed: 0,title,selftext,url
397,What bible should I get?,I have the New American Bible from the 80s and...,https://www.reddit.com/r/Catholicism/comments/...
37,Why Atheists Struggle To Come Out In Africa An...,,http://www.opinionnigeria.com/%EF%BB%BFwhy-ath...
953,Thank and Praise The Lord For All Things,The late Merlyn Carothers has written many boo...,https://www.reddit.com/r/Catholicism/comments/...
417,[Free Friday] Classic Wisconsin Friday night f...,,https://i.redd.it/tofpekn9i6p21.jpg
78,We made it : surviving cancer together,,https://www.thelifesource.org/all/we-made-it-s...


## Term-frequency Inverse Document-frequency (TF-IDF)

### Remove `stop_words`
- identify and remove additional `stop_words` that are not listed by default in `nltk stop_words` library, and other common words that may not help in classification.

In [49]:
my_stop_words = text.ENGLISH_STOP_WORDS.union([])

In [50]:
tvec = TfidfVectorizer(lowercase=True, tokenizer=None, max_features=2000)

In [51]:
X_train_tvec = tvec.fit_transform(X_train)
X_test_tvec = tvec.transform(X_test)

In [53]:
# Change sentence vectors into one array for dataframe
X_train_tvec = X_train_tvec.toarray()

In [54]:
X_train_tvec = pd.DataFrame(X_train_tvec, columns=tvec.get_feature_names())
X_train_tvec.head()

Unnamed: 0,selftext,title,url
0,0.0,1.0,0.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
