In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc, f1_score

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import sparse

import datetime
import json
import requests
import time
from collections import defaultdict
import pickle

from pymongo import MongoClient, InsertOne, DeleteOne, ReplaceOne

In [2]:
# Read the exported post dumps for both subreddits into DataFrames.
incel_df, slate_df = (
    pd.read_csv(csv_path)
    for csv_path in ('new_IncelTears_posts.csv', 'new_slatestarcodex_posts.csv')
)

In [5]:
# Connect with PyMongo's default connection settings and open the "reddit" database.
client = MongoClient()
db = client["reddit"]
# list_collection_names() is the supported replacement for collection_names(),
# which was deprecated in PyMongo 3.7 and removed in PyMongo 4.0.
db.list_collection_names()
# One-off collection setup, kept for reference (already run; do not re-run):
#db.create_collection("test_insert")
#test_collection = db.get_collection('test_insert')

#db.create_collection("titles")
#titles_collection = db.get_collection('titles')

['titles',
 'test_insert',
 'mycollection',
 'overnight_reddit',
 'reddit_overnight']

In [6]:
# Handles to the collections used below; `titles` is the one the scoring functions query.
titles_collection, overnight_reddit_collection, reddit_overnight_collection = (
    db.get_collection(collection_name)
    for collection_name in ('titles', 'overnight_reddit', 'reddit_overnight')
)
# Quick sanity query, left disabled:
#list(titles_collection.find({'subreddit':'IncelTears', 'over_18':False}).limit(2))

In [62]:
# Load the pickled vectorizer used to turn titles into model features.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you trust.
with open('fit_vect.pickle', 'rb') as handle:
    vect_word = pickle.load(handle)

# Load the pickled LR model that classifies the vectorized titles.
with open('lr_model.pickle', 'rb') as handle:
    lr = pickle.load(handle)

In [75]:
def get_top_X_submittors_to_subreddit(subreddit, X):
    """Return the X authors with the most stored titles in a subreddit.

    Queries `titles_collection` for every document whose `subreddit` field
    equals `subreddit`, drops rows whose author is '[deleted]', counts the
    remaining documents per author and returns the authors with the highest
    counts.

    Parameters
    ----------
    subreddit : str
        Value matched against the `subreddit` field of the stored documents.
    X : int
        Maximum number of authors to return.

    Returns
    -------
    list
        Author names ordered from most to fewest documents; at most X entries.
        Empty when the subreddit has no stored documents.
    """
    subreddit_list = list(titles_collection.find({'subreddit': subreddit}))
    # An empty result would build a DataFrame with no 'author' column and
    # raise KeyError below, so report "no submitters" explicitly instead.
    if not subreddit_list:
        return []

    subreddit_df = pd.DataFrame(subreddit_list)
    subreddit_df = subreddit_df[subreddit_df['author'] != '[deleted]']
    # count() gives non-null values per column; '_id' is used as the per-author document count.
    counts_by_author = subreddit_df.groupby('author').count().sort_values(by=['_id'], ascending=False)
    top_X_subreddit_submittors = list(counts_by_author[:X].index.values)
    return top_X_subreddit_submittors

In [119]:
def score_user(subreddit, user):
    """Score one user's titles in a subreddit with the toxicity model.

    Fetches every stored title by `user` in `subreddit` from
    `titles_collection`, vectorizes the titles with `vect_word`, classifies
    them with `lr`, and prints the share of titles predicted as toxic
    (label 1).

    Parameters
    ----------
    subreddit : str
        Value matched against the `subreddit` field of the stored documents.
    user : str
        Value matched against the `author` field of the stored documents.

    Returns
    -------
    toxic_percent : float
        Fraction (0-1) of the user's titles predicted as label 1.
        NaN when the user has no stored titles in the subreddit.
    toxic_sample, safe_sample : numpy.ndarray
        Up to 10 titles predicted as 1 / 0, for eyeballing the model.
    user_probs : numpy.ndarray
        `lr.predict_proba` output, one row per title (probabilities rather
        than 0/1 labels).
    user_submissions : list of dict
        The raw documents, in the same order as the rows of `user_probs`, so
        probabilities can be mapped back to their titles.

    TODO: `user_probs` and `user_submissions` are returned only to support
    exploratory analysis and should eventually be split out of this function.
    """
    user_submissions = list(titles_collection.find({'subreddit': subreddit, 'author': user}))

    # With no titles there is nothing to vectorize or classify, and the
    # percentage below would divide by zero, so return an explicit empty result.
    if not user_submissions:
        print(f'No titles found for {subreddit} user {user}')
        no_titles = np.array([], dtype=object)
        # Two probability columns assumed (binary model) -- matches how the
        # analysis cells index column 1; confirm if the model changes.
        return float('nan'), no_titles, no_titles.copy(), np.empty((0, 2)), user_submissions

    user_text = np.array([i['title'] for i in user_submissions])
    user_vect = vect_word.transform(user_text)
    user_preds = lr.predict(user_vect)
    user_probs = lr.predict_proba(user_vect)

    # Predictions are treated as 0/1 labels, so their sum is the toxic count.
    toxic_percent = user_preds.sum() / user_preds.shape[0]
    # Format with one decimal so float artefacts such as 28.999999999999996 are not printed.
    print(f'Percentage of {subreddit} user {user} titles predicted as toxic is {round(toxic_percent, 2) * 100:.1f}%')

    toxic_sample = user_text[np.isin(user_preds, 1)][:10]
    safe_sample = user_text[np.isin(user_preds, 0)][:10]

    return toxic_percent, toxic_sample, safe_sample, user_probs, user_submissions
    

In [103]:
# Three most prolific submitters in each subreddit.
slate_star_top_3 = get_top_X_submittors_to_subreddit(subreddit='slatestarcodex', X=3)
incel_tears_top_3 = get_top_X_submittors_to_subreddit(subreddit='IncelTears', X=3)

In [120]:
# Score each top submitter; every entry is the 5-tuple returned by score_user.
slate_top_3_scores_and_samples = [
    score_user('slatestarcodex', author) for author in slate_star_top_3
]
incel_top_3_scores_and_samples = [
    score_user('IncelTears', author) for author in incel_tears_top_3
]

Percentage of slatestarcodex user werttrew titles predicted as toxic is 5.0%
Percentage of slatestarcodex user gwern titles predicted as toxic is 6.0%
Percentage of slatestarcodex user dwaxe titles predicted as toxic is 5.0%
Percentage of IncelTears user RidingChad titles predicted as toxic is 35.0%
Percentage of IncelTears user BrazilianSigma titles predicted as toxic is 37.0%
Percentage of IncelTears user caspertruth666 titles predicted as toxic is 31.0%


## Predict Proba

In [135]:
# SLATE: rank the top submitter's titles by column 1 of predict_proba and list
# the five most likely to be toxic.
user_predict_proba = list(enumerate(slate_top_3_scores_and_samples[0][3]))
check_5_highest_toxicity = sorted(
    user_predict_proba,
    key=lambda indexed_probs: indexed_probs[1][1],
    reverse=True,
)[:5]
# Map each ranked row index back to the title of the matching submission.
five_highest_text = [
    slate_top_3_scores_and_samples[0][4][row_idx]['title']
    for row_idx, _probs in check_5_highest_toxicity
]
five_highest_text

In [138]:
# INCEL: rank the top submitter's titles by column 1 of predict_proba and list
# the five most likely to be toxic.
user_predict_proba = [(idx, value) for idx, value in enumerate(incel_top_3_scores_and_samples[0][3])]
check_5_highest_toxicity = sorted(user_predict_proba, reverse=True, key=lambda x: x[1][1])[:5]
# The ranked indices come from the IncelTears probabilities, so the titles must
# be read from that same user's submissions (element 4 of the same result tuple).
five_highest_text = [incel_top_3_scores_and_samples[0][4][i[0]]['title'] for i in check_5_highest_toxicity]
five_highest_text