# Subreddit Classification with Natural Language Processing

## Data Gathering

*Author: Grace Campbell*

#### Project Directory
1. Data Preparation 
    - *Data Gathering*
    - [Exploratory Data Analysis](https://github.com/GraceCampbell/Fake-News-Classification-NLP/blob/master/exploratory-data-analysis.ipynb)
2. Modeling
    - [Naive Bayes](https://github.com/GraceCampbell/Fake-News-Classification-NLP/blob/master/modeling-naive-bayes.ipynb)
    - [$k$-Nearest Neighbors](https://github.com/GraceCampbell/Fake-News-Classification-NLP/blob/master/modeling-knn.ipynb)
    - [Support-Vector Machine](https://github.com/GraceCampbell/Fake-News-Classification-NLP/blob/master/modeling-svm.ipynb)

In [None]:
import requests
import time
import pandas as pd
import numpy as np

In [None]:
# HTTP headers passed to every requests.get call in this notebook; only a
# custom User-agent string is set.
# NOTE(review): presumably needed because Reddit throttles the default client
# User-agent — confirm against Reddit's API rules.
headers = {'User-agent': 'Grace'}

### Getting /r/TheOnion post titles

In [None]:
# Page through Reddit's JSON listing of hot posts from /r/TheOnion and collect
# the raw post objects. Up to 40 pages are requested; at Reddit's default page
# size of 25 that is roughly 1000 posts.
onion_posts = []
after = None
url = 'https://www.reddit.com/r/theonion.json'
for page in range(40):
    # The first request carries no cursor; later ones resume after the last
    # post of the previous page.
    params = {} if after is None else {'after': after}
    # A timeout keeps one stalled connection from hanging the notebook forever.
    res = requests.get(url, params=params, headers=headers, timeout=30)
    if res.status_code != 200:
        # Report the failing status and keep whatever was collected so far.
        print(res.status_code)
        break
    the_json = res.json()
    onion_posts.extend(the_json['data']['children'])
    after = the_json['data']['after']
    if after is None:
        # A null cursor means the listing is exhausted. Continuing would send
        # a cursor-less request and re-download the first page again.
        break
    # Pause between requests to stay polite to Reddit's servers.
    time.sleep(2)

In [None]:
# Pull the headline text out of each raw /r/TheOnion post object.
titles = [post['data']['title'] for post in onion_posts]

In [None]:
# Drop duplicate titles while keeping first-seen order. A bare set() would
# return them in an arbitrary order that can change between runs, making the
# saved dataset non-reproducible. The last line displays the unique count.
onion_titles = list(dict.fromkeys(titles))
len(onion_titles)

### Getting /r/worldnews post titles

In [None]:
# Page through Reddit's JSON listing of hot posts from /r/worldnews (the
# subreddit the URL below actually requests) and collect the raw post objects.
# Up to 40 pages are requested; at Reddit's default page size of 25 that is
# roughly 1000 posts.
news_posts = []
after = None
url = 'https://www.reddit.com/r/worldnews.json'
for page in range(40):
    # The first request carries no cursor; later ones resume after the last
    # post of the previous page.
    params = {} if after is None else {'after': after}
    # A timeout keeps one stalled connection from hanging the notebook forever.
    res = requests.get(url, params=params, headers=headers, timeout=30)
    if res.status_code != 200:
        # Report the failing status and keep whatever was collected so far.
        print(res.status_code)
        break
    the_json = res.json()
    news_posts.extend(the_json['data']['children'])
    after = the_json['data']['after']
    if after is None:
        # A null cursor means the listing is exhausted. Continuing would send
        # a cursor-less request and re-download the first page again.
        break
    # Pause between requests to stay polite to Reddit's servers.
    time.sleep(2)

In [None]:
# Pull the headline text out of each raw news post object.
titles = [post['data']['title'] for post in news_posts]

In [None]:
# Drop duplicate titles while keeping first-seen order. A bare set() would
# return them in an arbitrary order that can change between runs, making the
# saved dataset non-reproducible. The last line displays the unique count.
news_titles = list(dict.fromkeys(titles))
len(news_titles)

### Creating a DataFrame of titles and their respective classes

In [None]:
# Build one labelled frame per source: is_onion is 1 for /r/TheOnion titles
# and 0 for the news titles. Naming the column up front guarantees a 'title'
# column even when a title list is empty.
onion = pd.DataFrame({'title': onion_titles})
onion['is_onion'] = 1

news = pd.DataFrame({'title': news_titles})
news['is_onion'] = 0

# Stack the two frames (news rows first) under a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
titles = pd.concat([news, onion], ignore_index=True)

### Saving dataframe to .csv

In [None]:
# Write the labelled titles to titles.csv (path relative to the working
# directory), leaving out the row index.
titles.to_csv('titles.csv', index=False)