In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc, f1_score

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import sparse

import datetime
from datetime import datetime
import json
import requests
import time
from collections import defaultdict
import pickle

from pymongo import MongoClient, InsertOne, DeleteOne, ReplaceOne



## Connect to Mongo & Load models

In [2]:
client = MongoClient()
db = client["reddit"]
titles_collection = db.get_collection('titles')
overnight_reddit_collection = db.get_collection('overnight_reddit')
reddit_overnight_collection = db.get_collection('reddit_overnight') 
#list(titles_collection.find({'subreddit':'IncelTears', 'over_18':False}).limit(2))

with open('fit_undersampled_vect.pickle', 'rb') as handle:
     vect_word = pickle.load(handle)
        
### IMPORTS PICKLED LR MODEL
with open('lr_undersampled_model.pickle', 'rb') as handle:
     lr = pickle.load(handle)

## Helper functions

In [6]:
def get_top_X_submittors_to_subreddit(subreddit, X): 
    subreddit_list = list(titles_collection.find({'subreddit':subreddit}))
    subreddit_df = pd.DataFrame(subreddit_list)
    subreddit_df = subreddit_df[subreddit_df['author'] != '[deleted]']
    top_X_subreddit_submittors = list(subreddit_df.groupby('author').count().sort_values(by=['_id'], ascending=False)[:X].index.values)
    return top_X_subreddit_submittors

In [7]:
def score_user(subreddit, user):
    """ 
    This function takes in a subreddit title and user, prints out their toxicity score
    and returns a lot of the process for further analysis.
    
    Right now what it returns are:
        toxic_percent = the users' score
        toxic_sample, safe sample = 10 sample text to eyeball the usefulness of the model
        
        These below should eventually be removed.
        user_probs = This is currently just appended into the function, eventually it should be pulled out.
        Right now what it returns is a predict_proba score instead of a 0,1 for the toxicity.
        user_submissions = This is all of the input data which helped me map the worst predict probas
        back to their titles to see what the worst predict proba's are. This definitely should also be separated 
        eventually.
    
    
    """
    #top_author = top_3_slate_submittors[0]
    user_submissions = list(titles_collection.find({'subreddit':subreddit, 'author':user}))
    user_text = np.array([i['title'] for i in user_submissions])
    user_vect = vect_word.transform(user_text)
    user_preds = lr.predict(user_vect)
    user_probs = lr.predict_proba(user_vect) 
    
    toxic_percent = user_preds.sum()/user_preds.shape[0]
    print(f'Percentage of {subreddit} user {user} titles predicted as toxic is {round(toxic_percent,2)*100}%')
    
    toxic_sample = user_text[np.isin(user_preds, 1)][:10] 
    safe_sample = user_text[np.isin(user_preds, 0)][:10] 

    return toxic_percent, toxic_sample, safe_sample, user_probs, user_submissions
    

In [8]:
slate_star_top_3 = get_top_X_submittors_to_subreddit('slatestarcodex', 3)
incel_tears_top_3 = get_top_X_submittors_to_subreddit('IncelTears', 3)
donald_top_3 = get_top_X_submittors_to_subreddit('The_Donald',3)
slate_top_3_scores_and_samples = [score_user('slatestarcodex',i) for i in slate_star_top_3]
incel_top_3_scores_and_samples = [score_user('IncelTears',i) for i in incel_tears_top_3]

Percentage of slatestarcodex user werttrew titles predicted as toxic is 7.000000000000001%
Percentage of slatestarcodex user gwern titles predicted as toxic is 10.0%
Percentage of slatestarcodex user dwaxe titles predicted as toxic is 5.0%
Percentage of IncelTears user RidingChad titles predicted as toxic is 46.0%
Percentage of IncelTears user BrazilianSigma titles predicted as toxic is 47.0%
Percentage of IncelTears user caspertruth666 titles predicted as toxic is 41.0%


## Predict Probability of toxicity to start thinking about banning criteria

In [9]:
def users_most_toxic_submissions(subreddit, user, n_submissions):
    toxic_percent, toxic_sample, safe_sample, user_probs, user_submissions = score_user(subreddit, user)
    idx_value_probs = [(idx, value) for idx, value in enumerate(user_probs)]
    check_5_highest_toxicity = sorted(idx_value_probs, reverse=True, key=lambda x: x[1][1])[:n_submissions]
    
    #add time stamps into the return for future time based toxicity analysis
    n_highest_proba_time_and_text = [(i[1][1], 
        datetime.utcfromtimestamp(user_submissions[i[0]]['created_utc']).strftime('%Y-%m-%d %H:%M:%S'), 
        user_submissions[i[0]]['title']) for i in check_5_highest_toxicity]
    n_highest_proba_and_text = list(zip(n_highest_proba_time_and_text[0], n_highest_proba_time_and_text[2]))
    n_highest_text = [i[1] for i in n_highest_proba_and_text]
    return n_highest_proba_and_text

In [10]:
users_most_toxic_submissions('slatestarcodex', slate_star_top_3[0], 10)

Percentage of slatestarcodex user werttrew titles predicted as toxic is 7.000000000000001%


[(0.9088324983834649, 0.8276116381940869),
 ('2015-11-27 22:51:41', '2017-12-03 15:48:25'),
 ('We are all confident idiots',
  '"A new definition of the nerd: a person who knows his own mind well enough to mistrust it"')]

In [11]:
users_most_toxic_submissions('IncelTears', incel_tears_top_3[0], 10)

Percentage of IncelTears user RidingChad titles predicted as toxic is 46.0%


[(0.9997579343233027, 0.9922389196878674),
 ('2017-06-14 02:39:21', '2017-06-21 02:16:04'),
 ("Incel's - Fuck the Jews", 'Incel hates gay people')]

In [12]:
users_most_toxic_submissions('The_Donald', donald_top_3[0], 10)

Percentage of The_Donald user VoteForTrump2016 titles predicted as toxic is 11.0%


[(0.9302587659678289, 0.8140475862380802),
 ('2015-12-29 21:01:57', '2015-12-20 20:22:08'),
 ("Donald Trump's supporters are not racist – they are sick of being let down",
  'Trump: Fellow Republicans are ‘jealous as hell’ of Putin’s praise')]

## Toxicity over time

In [13]:
toxic_percent, toxic_sample, safe_sample, user_probs, user_submissions = score_user('slatestarcodex', slate_star_top_3[0])


Percentage of slatestarcodex user werttrew titles predicted as toxic is 7.000000000000001%


## Consider graphing over time

This code below needs revisiting

In [15]:
def comments_by_time(subreddit, user):
    toxic_percent, toxic_sample, safe_sample, user_probs, user_submissions = score_user(subreddit, user)

    idx_value_probs = [(idx, value) for idx, value in enumerate(user_probs)]
    check_5_highest_toxicity = sorted(idx_value_probs, reverse=True, key=lambda x: x[1][1])[:10] #remove this
    
    #add time stamps into the return for future time based toxicity analysis
    n_highest_proba_time_and_text = [(i[1][1], 
        datetime.utcfromtimestamp(user_submissions[i[0]]['created_utc']).strftime('%Y-%m-%d %H:%M:%S'), 
        user_submissions[i[0]]['title']) for i in check_5_highest_toxicity]
    n_highest_probab_and_text = list(zip(n_highest_proba_time_and_text[0], n_highest_proba_time_and_text[2]))
    n_highest_text = [i[1] for i in n_highest_proba_and_text]
    return n_highest_proba_and_text

In [14]:
# time,title = user_submissions[4]['created_utc'], user_submissions[4]['title']

# datetime.utcfromtimestamp(user_submissions[i[0]]['created_utc']).strftime('%Y-%m-%d %H:%M:%S'), 

# user_submissions[4]['created_utc']

NameError: name 'i' is not defined