In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import requests
import math

  import pandas.util.testing as tm


In [0]:
from scipy.stats import zscore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline

In [0]:
## Code for calling the Reddit API, taken from a Reddit post

PAGE_COUNT = 5  # number of listing pages passed to get_reddit_data
RANDOM_STATE = 0  # seed used when shuffling the rows of df
TEST_SIZE = 0.25  # fraction of the rows held out by train_test_split

def get_reddit_data(page_count):
    """Download all-time top posts from ten popular subreddits.

    For every subreddit, up to ``page_count`` listing pages are requested
    from the public ``top.json`` endpoint, following the ``after`` cursor
    that each response returns. Paging for a subreddit stops early when the
    API reports no further page.

    Args:
        page_count: Maximum number of listing pages to request per subreddit.

    Returns:
        A list of dicts, one per post, with the keys ``created_utc``,
        ``is_video``, ``subreddit``, ``title``, ``total_awards_received``
        and ``ups`` (in that order).

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    print('fetching your data, * = 1 request to the Reddit API')
    headers = {'User-Agent': 'Predicting Reddit Post Metadata'}
    posts = []
    top_subreddits = [ 'r/funny', 'r/gaming', 'r/pics',
            'r/aww', 'r/science', 'r/worldnews', 'r/Music',
            'r/movies', 'r/todayilearned', 'r/videos'
    ]
    for subreddit in top_subreddits:
        after = ''
        for _ in range(page_count):
            print('*', end='')
            url = 'https://www.reddit.com/' + subreddit + '/top.json?t=all&after=' + after
            response = requests.get(url, headers=headers)
            # Fail with a clear HTTP error instead of a confusing KeyError or
            # JSON decoding error further down.
            response.raise_for_status()
            # Decode the body once per response rather than once per field.
            listing = response.json()['data']
            for child in listing['children']:
                data = child['data']
                posts.append({
                    'created_utc': int(data['created_utc']),
                    'is_video': int(data['is_video']),
                    'subreddit': data['subreddit'],
                    'title': data['title'],
                    'total_awards_received': data['total_awards_received'],
                    'ups': data['ups'],
                })
            after = listing['after']
            # The cursor is None once the listing is exhausted; concatenating
            # it into the next URL would raise TypeError, so stop paging here.
            if not after:
                break
    return posts

# Column order of the frame; these are the keys of every fetched post dict.
columns = ['created_utc', 'is_video', 'subreddit', 'title', 'total_awards_received', 'ups']
# Fetch the posts, load them into a frame and shuffle the rows reproducibly.
df = (
    pd.DataFrame(get_reddit_data(PAGE_COUNT), columns=columns)
    .sample(frac=1, random_state=RANDOM_STATE)
)

fetching your data, * = 1 request to the Reddit API
**************************************************

In [0]:
# Quick sanity checks on the fetched data: summary statistics, dtypes,
# a sample of rows and the number of missing values per column.
print('df.Describe():\n', df.describe(), '\n')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line to the output.
df.info()
print()
print('df.head(10):\n', df.head(10), '\n')
print('SUM OF NA VALUES:\n', df.isna().sum(), '\n')

df.Describe():
         created_utc     is_video  total_awards_received            ups
count  1.250000e+03  1250.000000            1250.000000    1250.000000
mean   1.543572e+09     0.070400              11.338400  115126.575200
std    3.164753e+07     0.255922              35.943264   42644.825878
min    1.407798e+09     0.000000               0.000000   36094.000000
25%    1.517517e+09     0.000000               1.000000   79495.000000
50%    1.546135e+09     0.000000               3.000000  115756.500000
75%    1.571157e+09     0.000000              10.000000  142495.500000
max    1.590590e+09     1.000000             571.000000  349296.000000 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1250 entries, 711 to 684
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   created_utc            1250 non-null   int64 
 1   is_video               1250 non-null   int64 
 2   subreddit              1250

In [0]:
# Preview the first rows of the shuffled frame.
df.head()

Unnamed: 0,created_utc,is_video,subreddit,title,total_awards_received,ups
711,1582481896,0,worldnews,The family of a British teenager killed in a r...,9,105738
898,1482861138,0,movies,Carrie Fisher dead at age 60,0,96676
186,1549472005,0,gaming,"Chess counts, right?",7,131843
867,1512818266,0,Music,Steven Tyler opens home for abused girls,0,37232
18,1584351834,0,funny,Experts recommend keeping your daily rituals e...,23,181365


In [0]:
# Features are the raw post titles; the target is the subreddit name.
X = df['title'].values
y = df['subreddit'].values
# Use the shared RANDOM_STATE constant rather than a second hard-coded seed,
# so the shuffle and the split are controlled from one place.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [0]:
# Shapes of the four arrays produced by the split, in split order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((937,), (313,), (937,), (313,))

In [0]:
# Text classifier: TF-IDF features computed from the titles, fed into a
# multinomial naive Bayes model (both with default settings).
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
# Fit on the training titles; as the cell's last expression, the fitted
# pipeline is also displayed.
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('multinomialnb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [0]:
# Predict the held-out titles and measure the share of exact label matches.
y_predict = model.predict(X_test)
accuracy = (y_predict == y_test).mean()
# Count the test labels once and reuse the result. value_counts() sorts in
# descending order, so the first entry belongs to the most frequent class.
test_label_counts = pd.Series(y_test).value_counts()
prediction_count = test_label_counts.sum()
# Baseline = accuracy of always predicting the most frequent test label.
# .iloc makes the positional lookup explicit; `[0]` on a Series indexed by
# subreddit names relies on a deprecated positional fallback.
baseline = test_label_counts.iloc[0] / prediction_count

In [17]:
# Label distribution (ten most frequent values) of each label array.
for heading, labels in [
    ('VC of y_train', y_train),
    ('VC of y_test', y_test),
    ('VC of y_predict', y_predict),
]:
    print(heading)
    print(pd.Series(labels).value_counts().head(10), '\n')

# Headline numbers, followed by a length check of targets vs. predictions.
print('# PRDCTN: ', prediction_count)
print('BASELINE: ', baseline)
print('ACCURACY: ', accuracy)
print(len(y_test), len(y_predict))

VC of y_train
pics             99
movies           99
videos           97
Music            97
gaming           95
worldnews        94
aww              92
todayilearned    91
science          88
funny            85
dtype: int64 

VC of y_test
funny            40
science          37
todayilearned    34
aww              33
worldnews        31
gaming           30
Music            28
videos           28
pics             26
movies           26
dtype: int64 

VC of y_predict
todayilearned    74
pics             50
science          47
aww              35
movies           35
Music            21
worldnews        21
gaming           16
videos           11
funny             3
dtype: int64 

# PRDCTN:  313
BASELINE:  0.12779552715654952
ACCURACY:  0.5175718849840255
313 313


In [0]:
import pickle

In [0]:
# Persist the fitted pipeline. The context manager guarantees the file is
# flushed and closed as soon as the model has been written.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [0]:
def reddit(inputs, model_path="model.pkl"):
    """Predict the subreddit for a single post title.

    Args:
        inputs: The post title to classify, as one string.
        model_path: Path of the pickled model to load. Defaults to
            ``"model.pkl"`` relative to the working directory, the same
            file name this notebook writes the model to, instead of a
            hard-coded absolute ``/content`` path.

    Returns:
        The first element of the model's prediction for ``[inputs]``.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """
    # NOTE(security): pickle.load can execute arbitrary code, so only load
    # model files that were produced by a trusted source.
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)
    # The model works on a batch of documents, so wrap the single title in a list.
    return model.predict([inputs])[0]

In [45]:
# Try the saved model on an ad-hoc title.
reddit('You shall not pass here')

'gaming'