In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc, f1_score

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import sparse

import datetime
import json
import requests
import time
from collections import defaultdict
import pickle

from pymongo import MongoClient, InsertOne, DeleteOne, ReplaceOne

In [2]:
# Load the previously exported post tables for the two subreddits under study.
incel_df, slate_df = (
    pd.read_csv(csv_path)
    for csv_path in ('new_IncelTears_posts.csv', 'new_slatestarcodex_posts.csv')
)

In [3]:
# Connect with pymongo's default connection settings (no host/port given)
# and select the "reddit" database used by the rest of the notebook.
client = MongoClient()
db = client["reddit"]

# One-time setup, kept for reference; both collections appear in the listing below.
#db.create_collection("test_insert")
#test_collection = db.get_collection('test_insert')
#db.create_collection("titles")
#titles_collection = db.get_collection('titles')

# `Database.collection_names()` is deprecated (removed in PyMongo 4);
# `list_collection_names()` returns the same list of collection names.
db.list_collection_names()

['titles',
 'test_insert',
 'mycollection',
 'overnight_reddit',
 'reddit_overnight']

In [4]:
# Handles to the collections queried below (subscript access is equivalent
# to `db.get_collection(name)` with default options).
titles_collection = db['titles']
overnight_reddit_collection = db['overnight_reddit']
reddit_overnight_collection = db['reddit_overnight']
# Example query for eyeballing the stored documents:
#list(titles_collection.find({'subreddit':'IncelTears', 'over_18':False}).limit(2))

In [6]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Fitted text vectorizer saved by an earlier training run.
vect_word = _load_pickle('fit_undersampled_vect.pickle')

# Pickled LR model saved alongside the vectorizer.
lr = _load_pickle('lr_undersampled_model.pickle')

In [7]:
def get_top_X_submittors_to_subreddit(subreddit, X):
    """Return the X authors with the most submissions to `subreddit`.

    Reads from the module-level `titles_collection` and counts documents per
    `author`, ignoring the '[deleted]' placeholder author.

    Parameters
    ----------
    subreddit : str
        Value matched against each document's 'subreddit' field.
    X : int
        Maximum number of authors to return.

    Returns
    -------
    list
        Author names ordered by submission count, highest first. Authors with
        equal counts are ordered alphabetically. Empty when no document
        matches (the previous version raised KeyError in that case).
    """
    # Only `author` (plus the implicit `_id`) is needed for counting, so ask
    # the server for just that field instead of whole documents.
    subreddit_list = list(titles_collection.find({'subreddit': subreddit}, {'author': 1}))
    subreddit_df = pd.DataFrame(subreddit_list)

    # An empty result set produces a frame with no columns at all.
    if 'author' not in subreddit_df.columns:
        return []

    subreddit_df = subreddit_df[subreddit_df['author'] != '[deleted]']

    ranked = (
        subreddit_df.groupby('author')['_id'].count()
        .rename('submissions')
        .reset_index()
        # Secondary key makes the order of tied authors deterministic.
        .sort_values(['submissions', 'author'], ascending=[False, True])
    )
    return ranked['author'].iloc[:X].tolist()

In [8]:
def score_user(subreddit, user, n_samples=10):
    """Score one user's submission titles in a subreddit with the toxicity model.

    Fetches the user's documents from the module-level `titles_collection`,
    vectorizes their titles with `vect_word`, classifies them with `lr`, and
    prints the share of titles predicted as toxic (class 1).

    Parameters
    ----------
    subreddit : str
        Value matched against each document's 'subreddit' field.
    user : str
        Value matched against each document's 'author' field.
    n_samples : int, optional
        Maximum number of example titles returned per class (default 10).

    Returns
    -------
    toxic_percent : float
        Fraction (0-1) of the user's titles predicted as class 1; NaN when the
        user has no submissions.
    toxic_sample, safe_sample : numpy.ndarray
        Up to `n_samples` titles predicted as class 1 / class 0, for
        eyeballing how useful the model is.
    user_probs : numpy.ndarray
        `lr.predict_proba` output, one row per title, in the same order as
        `user_submissions`.
    user_submissions : list
        The raw documents, so probabilities can be mapped back to titles.

    TODO: `user_probs` and `user_submissions` are returned for ad-hoc analysis
    and should eventually be split out of this function.
    """
    user_submissions = list(titles_collection.find({'subreddit': subreddit, 'author': user}))
    user_text = np.array([i['title'] for i in user_submissions])

    # Without titles there is nothing to vectorize or predict on, and the
    # ratio below would divide by zero.
    if user_text.shape[0] == 0:
        print(f'No {subreddit} titles found for user {user}; nothing to score')
        # Two probability columns, matching the binary output indexed downstream.
        return float('nan'), user_text[:0], user_text[:0], np.empty((0, 2)), user_submissions

    user_vect = vect_word.transform(user_text)
    user_preds = lr.predict(user_vect)
    user_probs = lr.predict_proba(user_vect)

    is_toxic = user_preds == 1
    toxic_percent = is_toxic.mean()
    # Format after scaling so float noise such as 7.000000000000001 is not printed.
    print(f'Percentage of {subreddit} user {user} titles predicted as toxic is {round(toxic_percent, 2) * 100:.1f}%')

    toxic_sample = user_text[is_toxic][:n_samples]
    safe_sample = user_text[user_preds == 0][:n_samples]

    return toxic_percent, toxic_sample, safe_sample, user_probs, user_submissions
    

In [9]:
# Three most prolific (non-deleted) submitters for each subreddit.
slate_star_top_3 = get_top_X_submittors_to_subreddit(subreddit='slatestarcodex', X=3)
incel_tears_top_3 = get_top_X_submittors_to_subreddit(subreddit='IncelTears', X=3)

In [10]:
# One score_user() result tuple per top submitter, in the same order as the top-3 lists.
slate_top_3_scores_and_samples = [
    score_user('slatestarcodex', author) for author in slate_star_top_3
]
incel_top_3_scores_and_samples = [
    score_user('IncelTears', author) for author in incel_tears_top_3
]

Percentage of slatestarcodex user werttrew titles predicted as toxic is 7.000000000000001%
Percentage of slatestarcodex user gwern titles predicted as toxic is 10.0%
Percentage of slatestarcodex user dwaxe titles predicted as toxic is 5.0%
Percentage of IncelTears user RidingChad titles predicted as toxic is 46.0%
Percentage of IncelTears user BrazilianSigma titles predicted as toxic is 47.0%
Percentage of IncelTears user caspertruth666 titles predicted as toxic is 41.0%


## Predict Proba

In [11]:
# SLATE: rank the top submitter's titles by predicted probability of class 1
# (toxic) and show the five highest.
# In each score_user() result, element 3 is `user_probs` and element 4 is `user_submissions`.
user_predict_proba = list(enumerate(slate_top_3_scores_and_samples[0][3]))
check_5_highest_toxicity = sorted(
    user_predict_proba,
    key=lambda scored: scored[1][1],
    reverse=True,
)[:5]
five_highest_text = [
    slate_top_3_scores_and_samples[0][4][row]['title']
    for row, _probs in check_5_highest_toxicity
]
five_highest_text



['We are all confident idiots',
 'INTERLUDE ו: THERE’S A HOLE IN MY BUCKET (Unsong)',
 '"A new definition of the nerd: a person who knows his own mind well enough to mistrust it"',
 'Why Are Babies So Dumb If Humans Are So Smart? (The New Yorker)',
 '“The Suck Fairy”: when you reread a beloved book and it loses its charm for you (2010)']

In [15]:
# Up to 10 titles predicted as toxic for the most prolific IncelTears submitter
# (element 1 of score_user's return tuple is `toxic_sample`).
incel_top_3_scores_and_samples[0][1]


array(['Incel tells rape victim to stop using it as a crutch',
       'Women dress like sluts (how dare they) and men made them',
       'Woman compliments Incel in front of her husband. How dare she?',
       "Pedocel thinks mother's are jealous not protective of their daughters",
       'Paging Neve and Max - Incels just love that catfishing',
       'Wishing suffering and death penalty on Mom',
       'Incel asked single mom to kill her child for him',
       "Incel rants about his sister's sex life, other Incels request pictures",
       'No, Filipino women are not going to put up with your personality either',
       'Peepingtomcel wants others to die in a fire'], dtype='<U169')

In [24]:
# Inspect the coefficient array of the unpickled `lr` model
# (presumably one weight per vectorizer feature — TODO confirm).
lr.coef_

array([[ 0.37115451, -0.71185306,  0.19783052, ...,  0.90005951,
        -0.04174385, -0.03246744]])

In [19]:
# INCEL: rank the top IncelTears submitter's titles by predicted probability of
# class 1 (toxic) and keep the 20 highest.
# NOTE: the variable names still say "5"/"five" for continuity with the SLATE
# cell, but this cell keeps 20 rows.
user_predict_proba = list(enumerate(incel_top_3_scores_and_samples[0][3]))
check_5_highest_toxicity = sorted(user_predict_proba, reverse=True, key=lambda x: x[1][1])[:20]
# Titles must come from the same user's submissions the probabilities were
# computed on. This previously indexed the slatestarcodex user's submissions,
# which yields unrelated titles (or an IndexError).
five_highest_text = [incel_top_3_scores_and_samples[0][4][i[0]]['title'] for i in check_5_highest_toxicity]
# five_highest_text
check_5_highest_toxicity


[(138, array([2.42065677e-04, 9.99757934e-01])),
 (113, array([0.00557832, 0.99442168])),
 (174, array([0.00776108, 0.99223892])),
 (288, array([0.00947313, 0.99052687])),
 (202, array([0.01537138, 0.98462862])),
 (771, array([0.01684106, 0.98315894])),
 (551, array([0.0172956, 0.9827044])),
 (601, array([0.01889933, 0.98110067])),
 (230, array([0.02240764, 0.97759236])),
 (69, array([0.02244421, 0.97755579])),
 (613, array([0.02268242, 0.97731758])),
 (513, array([0.02583673, 0.97416327])),
 (32, array([0.02737394, 0.97262606])),
 (239, array([0.02925115, 0.97074885])),
 (735, array([0.03362948, 0.96637052])),
 (632, array([0.0414727, 0.9585273])),
 (559, array([0.04338125, 0.95661875])),
 (704, array([0.04880665, 0.95119335])),
 (9, array([0.05187268, 0.94812732])),
 (683, array([0.05321626, 0.94678374]))]