In [None]:
import requests
import time
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

from tokenizer import token_func

In [None]:
# User-agent header attached to every Reddit request issued by get_posts().
headers = {'User-agent': 'Grace'}

def _fetch_titles(subreddit, pages=40, pause=2):
    """Download the unique post titles from one subreddit's JSON listing.

    Parameters
    ----------
    subreddit : str
        Subreddit name as it appears in the URL (e.g. ``'theonion'``).
    pages : int, default 40
        Maximum number of listing pages to request.
    pause : int or float, default 2
        Seconds to sleep between consecutive requests.

    Returns
    -------
    list of str
        Titles in first-seen order with duplicates removed.

    Notes
    -----
    On a non-200 response the status code is printed and paging stops;
    whatever was collected up to that point is still returned.
    """
    url = f'https://www.reddit.com/r/{subreddit}.json'
    posts = []
    after = None
    for _ in range(pages):
        params = {} if after is None else {'after': after}
        res = requests.get(url, params=params, headers=headers)
        if res.status_code != 200:
            print(res.status_code)
            break
        listing = res.json()['data']
        posts.extend(listing['children'])
        after = listing['after']
        if after is None:
            # No continuation token: another request would be sent without
            # `after` and would only re-download the first page.
            break
        time.sleep(pause)

    titles = [post['data']['title'] for post in posts]
    # dict.fromkeys de-duplicates while keeping a reproducible order
    # (a plain set() yields a different order from run to run).
    return list(dict.fromkeys(titles))


def get_posts():
    """Scrape r/theonion and r/worldnews and return their titles, labelled.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``title`` (the post title) and ``is_onion``
        (1 for r/theonion, 0 for r/worldnews). The r/worldnews rows come
        first and the index is renumbered from 0.
    """
    onion_titles = _fetch_titles('theonion')
    news_titles = _fetch_titles('worldnews')

    onion = pd.DataFrame({'title': onion_titles}).assign(is_onion=1)
    news = pd.DataFrame({'title': news_titles}).assign(is_onion=0)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([news, onion], ignore_index=True)

In [None]:
def naive_bayes(X, y, train_path='./materials/titles.csv'):
    """Fit multinomial naive Bayes on the saved titles and score it on (X, y).

    The model is trained on the ``title`` / ``is_onion`` columns of the CSV
    at ``train_path``; ``X`` and ``y`` are used only for evaluation.

    Parameters
    ----------
    X : iterable of str
        Raw titles to evaluate on.
    y : array-like
        True labels for ``X`` (0 or 1, matching ``is_onion``).
    train_path : str, default './materials/titles.csv'
        CSV file holding the training titles.

    Returns
    -------
    None
        Accuracy, sensitivity and specificity are printed.
    """
    df = pd.read_csv(train_path)

    X_og = df['title']
    y_og = df['is_onion']

    cvec = CountVectorizer(tokenizer=token_func, max_features=X_og.shape[0], min_df=1, max_df=0.9)
    # Keep the matrices sparse: the estimator accepts them directly, which
    # avoids a dense copy and the get_feature_names() call that newer
    # scikit-learn releases no longer provide.
    X_og_cvec = cvec.fit_transform(X_og)
    X_cvec = cvec.transform(X)

    mnb = MultinomialNB(alpha=1)
    # Train on the labels that belong to the training titles, not on `y`.
    mnb.fit(X_og_cvec, y_og)
    print(f'Accuracy score: {mnb.score(X_cvec, y)}')

    y_pred = mnb.predict(X_cvec)
    # Pin the label order so the matrix is always 2x2, even when `y`
    # happens to contain a single class.
    tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[0, 1]).ravel()

    print(f'Sensitivity: {tp/(tp+fn)}')
    print(f'Specificity: {tn/(tn+fp)}')

In [None]:
def knn(X, y, train_path='./materials/titles.csv'):
    """Fit a k-nearest-neighbours classifier on the saved titles and score it on (X, y).

    The model is trained on the ``title`` / ``is_onion`` columns of the CSV
    at ``train_path``; ``X`` and ``y`` are used only for evaluation.

    Parameters
    ----------
    X : iterable of str
        Raw titles to evaluate on.
    y : array-like
        True labels for ``X`` (0 or 1, matching ``is_onion``).
    train_path : str, default './materials/titles.csv'
        CSV file holding the training titles.

    Returns
    -------
    None
        Accuracy, sensitivity and specificity are printed.
    """
    df = pd.read_csv(train_path)

    X_og = df['title']
    y_og = df['is_onion']

    tvec = TfidfVectorizer(tokenizer=token_func, max_features=X_og.shape[0], min_df=1, max_df=0.9)
    # Transform with the vectorizer fitted here (`tvec`); the matrices stay
    # sparse, which avoids a dense copy and the get_feature_names() call
    # that newer scikit-learn releases no longer provide.
    X_og_tvec = tvec.fit_transform(X_og)
    X_tvec = tvec.transform(X)

    # Named `model` so the local does not shadow this function's own name.
    model = KNeighborsClassifier(n_neighbors=15, weights='distance')
    # Train on the labels that belong to the training titles, not on `y`.
    model.fit(X_og_tvec, y_og)
    print(f'Accuracy score: {model.score(X_tvec, y)}')

    y_pred = model.predict(X_tvec)
    # Pin the label order so the matrix is always 2x2, even when `y`
    # happens to contain a single class.
    tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[0, 1]).ravel()

    print(f'Sensitivity: {tp/(tp+fn)}')
    print(f'Specificity: {tn/(tn+fp)}')

In [None]:
def svc(X, y, train_path='./materials/titles.csv'):
    """Fit an RBF-kernel support vector classifier on the saved titles and score it on (X, y).

    The model is trained on the ``title`` / ``is_onion`` columns of the CSV
    at ``train_path``; ``X`` and ``y`` are used only for evaluation.

    Parameters
    ----------
    X : iterable of str
        Raw titles to evaluate on.
    y : array-like
        True labels for ``X`` (0 or 1, matching ``is_onion``).
    train_path : str, default './materials/titles.csv'
        CSV file holding the training titles.

    Returns
    -------
    None
        Accuracy, sensitivity and specificity are printed.
    """
    df = pd.read_csv(train_path)

    X_og = df['title']
    y_og = df['is_onion']

    tvec = TfidfVectorizer(tokenizer=token_func, max_features=X_og.shape[0], min_df=2, max_df=0.9)
    # Transform with the vectorizer fitted here (`tvec`); the matrices stay
    # sparse, which avoids a dense copy and the get_feature_names() call
    # that newer scikit-learn releases no longer provide.
    X_og_tvec = tvec.fit_transform(X_og)
    X_tvec = tvec.transform(X)

    # Named `model` so the local does not shadow this function's own name.
    model = SVC(kernel='rbf', C=10)
    # Train on the labels that belong to the training titles, not on `y`.
    model.fit(X_og_tvec, y_og)
    print(f'Accuracy score: {model.score(X_tvec, y)}')

    y_pred = model.predict(X_tvec)
    # Pin the label order so the matrix is always 2x2, even when `y`
    # happens to contain a single class.
    tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[0, 1]).ravel()

    print(f'Sensitivity: {tp/(tp+fn)}')
    print(f'Specificity: {tn/(tn+fp)}')