In [None]:
import torch
import pandas as pd
!pip install transformers
from transformers import BertTokenizer
import sys, time, datetime, random
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import numpy as np

# Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import random
import seaborn as sns



In [None]:
# Choose the torch device once: CUDA when PyTorch can see a GPU, otherwise CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


In [None]:
# Colab-only: mount Google Drive at /content/drive. Every data path used in
# this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Per-user metadata. `.item()` unwraps the single pickled object stored in the
# .npy file (presumably a dict keyed by username -- it is indexed by name and
# fed to DataFrame.from_dict below).
redditors = np.load('/content/drive/MyDrive/controversy/controversy_data/unlabeled_data/user_information.npy', allow_pickle=True).item()
print(len(redditors))
# Spot-check of one user. Not displayed (it is not the cell's last expression),
# but it raises KeyError if that user is missing.
redditors['UkraineClownPosse']

survival_data_fname = '/content/drive/MyDrive/controversy/controversy_data/unlabeled_data/survival_data_more.pickle'
if not os.path.isfile(survival_data_fname):
    # No cached frame: build one row per user. Assigning four column names
    # assumes each user record carries exactly three fields besides the name.
    survival_data = pd.DataFrame.from_dict(redditors, orient='index')
    survival_data = survival_data.reset_index()
    survival_data.columns = ['author', 'tenure_timestamp', 'is_mod', 'comment_karma']
    survival_data = survival_data.dropna()
    # NOTE(review): this writes survival_data.pickle, not the
    # survival_data_more.pickle path checked above -- confirm that is intended.
    survival_data.to_pickle('/content/drive/MyDrive/controversy/controversy_data/unlabeled_data/survival_data.pickle')
else:
    # Cached frame found: reuse it as-is.
    survival_data = pd.read_pickle(survival_data_fname)
survival_data

237330


Unnamed: 0,author,tenure_timestamp,is_mod,comment_karma,total_count,left_score,left_count,right_score,right_count
105132,IV-IVm-I,1.420902e+09,False,22359.0,6.0,6.0,6.0,0.0,0.0
105133,damejudyclench,1.548133e+09,False,3897.0,4.0,25.0,3.0,0.0,0.0
105134,aragorn841,1.388695e+09,False,254.0,4.0,-17.0,4.0,0.0,0.0
105136,stupidhoes,1.324225e+09,False,12180.0,8.0,18.0,8.0,0.0,0.0
105137,TimeBreakerBaba,1.548404e+09,False,2766.0,1.0,2.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
237325,DevSammy,1.542643e+09,True,2304.0,0.0,0.0,0.0,0.0,0.0
237326,Biguwuiscute,1.551404e+09,False,9142.0,0.0,0.0,0.0,0.0,0.0
237327,TayTai,1.439551e+09,False,349.0,0.0,0.0,0.0,0.0,0.0
237328,Empty_Sink_4597,1.603176e+09,False,2331.0,7.0,1.0,7.0,0.0,0.0


In [None]:
# Rebuild the full user table from `redditors`, then keep only the authors that
# are not already present in `survival_data`, and persist that remainder.
survival_data_more = (
    pd.DataFrame
    .from_dict(redditors, orient='index')
    .reset_index()
)
survival_data_more.columns = ['author', 'tenure_timestamp', 'is_mod', 'comment_karma']
survival_data_more = survival_data_more.dropna()
survival_data_more = survival_data_more.loc[
    ~survival_data_more['author'].isin(survival_data['author'])
]
survival_data_more.to_pickle('/content/drive/MyDrive/controversy/controversy_data/unlabeled_data/survival_data_more.pickle')
survival_data_more

Unnamed: 0,author,tenure_timestamp,is_mod,comment_karma
105132,IV-IVm-I,1.420902e+09,False,22359.0
105133,damejudyclench,1.548133e+09,False,3897.0
105134,aragorn841,1.388695e+09,False,254.0
105136,stupidhoes,1.324225e+09,False,12180.0
105137,TimeBreakerBaba,1.548404e+09,False,2766.0
...,...,...,...,...
237325,DevSammy,1.542643e+09,True,2304.0
237326,Biguwuiscute,1.551404e+09,False,9142.0
237327,TayTai,1.439551e+09,False,349.0
237328,Empty_Sink_4597,1.603176e+09,False,2331.0


In [None]:
# Continue with the newly selected users only. This rebinds the name to the
# same DataFrame object as survival_data_more (no copy is made).
survival_data = survival_data_more

In [None]:
# Make sure the per-author aggregate columns exist before the accumulation loop
# adds to them. `reindex` appends only the columns that are missing (filled with
# NaN) and leaves the rows, index and existing values untouched. The previous
# approach, concatenating an empty DataFrame, relies on pandas' deprecated
# treatment of empty entries in `concat` and creates object-dtype columns.
aggregate_columns = ['total_count', 'left_score', 'left_count', 'right_score', 'right_count']
missing_columns = [col for col in aggregate_columns if col not in survival_data.columns]
survival_data = survival_data.reindex(columns=[*survival_data.columns, *missing_columns])
survival_data

Unnamed: 0,author,tenure_timestamp,is_mod,comment_karma,total_count,left_score,left_count,right_score,right_count
105132,IV-IVm-I,1.420902e+09,False,22359.0,,,,,
105133,damejudyclench,1.548133e+09,False,3897.0,,,,,
105134,aragorn841,1.388695e+09,False,254.0,,,,,
105136,stupidhoes,1.324225e+09,False,12180.0,,,,,
105137,TimeBreakerBaba,1.548404e+09,False,2766.0,,,,,
...,...,...,...,...,...,...,...,...,...
237325,DevSammy,1.542643e+09,True,2304.0,,,,,
237326,Biguwuiscute,1.551404e+09,False,9142.0,,,,,
237327,TayTai,1.439551e+09,False,349.0,,,,,
237328,Empty_Sink_4597,1.603176e+09,False,2331.0,,,,,


In [None]:
# Subreddits whose comments are counted towards the "left" aggregates.
left_sub = [
    'politics',
    'Libertarian',
    'Political_Revolution',
    'VoteBlue',
    'VoteDEM',
    'hillaryclinton',
    'progressive',
]
# Subreddits whose comments are counted towards the "right" aggregates.
right_sub = [
    'donaldtrump',
    'ConservativesOnly',
    'Conservative',
]

In [None]:
# Category labels (not used by the cells visible here).
categories = [
    'vulgarity',
    'civility',
    'namecalling',
    'stereotype',
    'demeaning',
]
# Labels of the comment dump files; each is substituted into the
# "<label>_all_comments.pickle" file name by read_comments_df.
fnames = [
    '20200201_20200528',
    '20200528_20200713',
    '20200713_20200813',
    '20200813_20201025',
    '20201025_20201128',
    '20201129_20210201',
]

def read_comments_df(fname):
    """Load one pickled comment dump and drop unusable rows.

    Args:
        fname: Label substituted into the "<label>_all_comments.pickle" file
            name under the Drive comments folder.

    Returns:
        DataFrame of the comments that have a non-null 'body' and 'id' and
        whose 'author' and 'body' are neither '[deleted]' nor '[removed]'.
    """
    pickle_path = '/content/drive/MyDrive/controversy/controversy_data/unlabeled_data/comments/{}_all_comments.pickle'.format(fname)
    frame = pd.read_pickle(pickle_path)
    frame = frame.dropna(subset=['body', 'id'])

    # Keep rows where neither the author nor the body is a placeholder.
    is_real = (
        frame['author'].ne('[deleted]')
        & frame['author'].ne('[removed]')
        & frame['body'].ne('[deleted]')
        & frame['body'].ne('[removed]')
    )
    return frame[is_real]

# Fold the first two comment dumps into the per-author aggregates.
# NOTE(review): each pass ADDS to the values already in survival_data, so
# re-running this cell for the same files counts their comments twice.
for i, name in enumerate(fnames[0:2]):
    comments = read_comments_df(name)

    print("{} unique authors wrote {} comments in {}".format(len(set(comments.author)), len(comments), name))

    # Group by author: all comments, and the left / right subreddit subsets.
    all_groups = comments.groupby('author')
    left_comment_groups = comments[comments.subreddit.isin(left_sub)].groupby('author')
    right_comment_groups = comments[comments.subreddit.isin(right_sub)].groupby('author')
    del comments

    # Target column -> per-author value contributed by this dump.
    per_author = (
        ('total_count', all_groups['id'].count()),
        ('left_score', left_comment_groups['score'].sum()),
        ('right_score', right_comment_groups['score'].sum()),
        ('left_count', left_comment_groups['id'].count()),
        ('right_count', right_comment_groups['id'].count()),
    )
    del all_groups, left_comment_groups, right_comment_groups

    # Authors absent from this dump contribute 0; a missing running total
    # is also treated as 0.
    for column, contribution in per_author:
        survival_data[column] = survival_data['author'].map(dict(contribution)).fillna(0) + survival_data[column].fillna(0)
    del per_author

    # Checkpoint after every dump so progress survives a lost session.
    survival_data.to_pickle('/content/drive/MyDrive/controversy/controversy_data/unlabeled_data/survival_data_more.pickle')

740839 unique authors wrote 12326066 comments in 20200201_20200528
429662 unique authors wrote 4127977 comments in 20200528_20200713


In [None]:
# Display the rows that have a total_count value.
survival_data.loc[survival_data['total_count'].notna()]

Unnamed: 0,author,tenure_timestamp,is_mod,comment_karma,total_count,left_score,left_count,right_score,right_count
105132,IV-IVm-I,1.420902e+09,False,22359.0,76.0,71.0,71.0,40.0,4.0
105133,damejudyclench,1.548133e+09,False,3897.0,20.0,71.0,12.0,0.0,0.0
105134,aragorn841,1.388695e+09,False,254.0,4.0,-17.0,4.0,0.0,0.0
105136,stupidhoes,1.324225e+09,False,12180.0,8.0,18.0,8.0,0.0,0.0
105137,TimeBreakerBaba,1.548404e+09,False,2766.0,1.0,2.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
237325,DevSammy,1.542643e+09,True,2304.0,0.0,0.0,0.0,0.0,0.0
237326,Biguwuiscute,1.551404e+09,False,9142.0,22.0,22.0,22.0,0.0,0.0
237327,TayTai,1.439551e+09,False,349.0,0.0,0.0,0.0,0.0,0.0
237328,Empty_Sink_4597,1.603176e+09,False,2331.0,7.0,1.0,7.0,0.0,0.0


In [None]:
# Display the rows that have a left_count value.
survival_data.loc[survival_data['left_count'].notna()]

Unnamed: 0,author,tenure_timestamp,is_mod,comment_karma,total_count,left_score,left_count,right_score,right_count
105132,IV-IVm-I,1.420902e+09,False,22359.0,93.0,86.0,86.0,41.0,5.0
105133,damejudyclench,1.548133e+09,False,3897.0,58.0,101.0,35.0,0.0,0.0
105134,aragorn841,1.388695e+09,False,254.0,4.0,-17.0,4.0,0.0,0.0
105136,stupidhoes,1.324225e+09,False,12180.0,8.0,18.0,8.0,0.0,0.0
105137,TimeBreakerBaba,1.548404e+09,False,2766.0,1.0,2.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
237325,DevSammy,1.542643e+09,True,2304.0,25.0,9.0,9.0,15.0,14.0
237326,Biguwuiscute,1.551404e+09,False,9142.0,100.0,100.0,100.0,0.0,0.0
237327,TayTai,1.439551e+09,False,349.0,0.0,0.0,0.0,0.0,0.0
237328,Empty_Sink_4597,1.603176e+09,False,2331.0,7.0,1.0,7.0,0.0,0.0


In [None]:
# Disabled normalisation: the two lines below would turn the summed scores into
# per-comment averages. They are commented out, so left_score / right_score
# stay as sums. (If re-enabled, authors with a zero count would get NaN or inf.)
# survival_data['left_score'] = survival_data['left_score'] / survival_data['left_count']
# survival_data['right_score'] = survival_data['right_score'] / survival_data['right_count']
# NOTE(review): this writes survival_data.pickle, while the other cells save this
# frame to survival_data_more.pickle -- confirm that overwriting this file is intended.
survival_data.to_pickle('/content/drive/MyDrive/controversy/controversy_data/unlabeled_data/survival_data.pickle')

In [None]:
all_users = survival_data

# An author counts as "left" when they have more left than right comments,
# a left score above 1, and a left score higher than their right score.
leans_left = (
    (all_users['left_count'] > all_users['right_count'])
    & (all_users['left_score'] > 1)
    & (all_users['left_score'] > all_users['right_score'])
)
# Mirror-image rule for "right".
leans_right = (
    (all_users['right_count'] > all_users['left_count'])
    & (all_users['right_score'] > 1)
    & (all_users['right_score'] > all_users['left_score'])
)

# De-duplicated author names for each side (list order is arbitrary).
left = list(set(all_users.loc[leans_left, 'author']))
right = list(set(all_users.loc[leans_right, 'author']))
len(left), len(right)

(87192, 10145)

In [None]:
# Add 0/1 indicator columns marking membership in the `left` / `right` author lists.
for flag_column, members in (('is_left', left), ('is_right', right)):
    survival_data[flag_column] = survival_data['author'].isin(members).astype(int)
survival_data

Unnamed: 0,author,tenure_timestamp,is_mod,comment_karma,total_count,left_score,left_count,right_score,right_count,is_left,is_right
105132,IV-IVm-I,1.420902e+09,False,22359.0,93.0,86.0,86.0,41.0,5.0,1,0
105133,damejudyclench,1.548133e+09,False,3897.0,58.0,101.0,35.0,0.0,0.0,1,0
105134,aragorn841,1.388695e+09,False,254.0,4.0,-17.0,4.0,0.0,0.0,0,0
105136,stupidhoes,1.324225e+09,False,12180.0,8.0,18.0,8.0,0.0,0.0,1,0
105137,TimeBreakerBaba,1.548404e+09,False,2766.0,1.0,2.0,1.0,0.0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
237325,DevSammy,1.542643e+09,True,2304.0,25.0,9.0,9.0,15.0,14.0,0,1
237326,Biguwuiscute,1.551404e+09,False,9142.0,100.0,100.0,100.0,0.0,0.0,1,0
237327,TayTai,1.439551e+09,False,349.0,0.0,0.0,0.0,0.0,0.0,0,0
237328,Empty_Sink_4597,1.603176e+09,False,2331.0,7.0,1.0,7.0,0.0,0.0,0,0


In [None]:
# Persist the frame, now including the is_left / is_right flags, to survival_data_more.pickle.
survival_data.to_pickle('/content/drive/MyDrive/controversy/controversy_data/unlabeled_data/survival_data_more.pickle')