In [1]:
import pandas as pd
import numpy as np
import os
import gc
# All data directories live next to (i.e. in the parent of) the current working directory.
root_dir = os.path.abspath(os.curdir)
_parent_dir = os.path.dirname(root_dir)

# Inputs: per-year prediction CSVs for submissions and for comments.
pred_dir_subs = _parent_dir + "/SubmissionPredictions/"
pred_dir_coms = _parent_dir + "/CommentPredictions/"

# Outputs: experiment results are written here; create the directory if it is missing.
results_dir = _parent_dir + "/Results/"
os.makedirs(results_dir, exist_ok=True)

In [2]:
# Column dtypes passed to pd.read_csv when loading a yearly prediction file.
df_dtype = dict(
    Author='category',
    Probability='float',
    Label='string',
)


def getRawCounts(collection,pred_dir,years=range(2006,2008)):
    """Write per-user label counts for each yearly prediction file.

    For every year in ``years`` this reads ``<pred_dir><year>.csv`` (columns
    ``Author``, ``Probability``, ``Label``) and writes
    ``<results_dir>rawToxicityCounts-<collection>-<year>.csv`` with one row per
    author: the number of rows with a non-null ``Probability`` for each of the
    labels ``non_toxic``, ``highly_toxic`` and ``slightly_toxic``.

    Parameters
    ----------
    collection : str
        Name used in the output file name (and echoed to stdout).
    pred_dir : str
        Directory prefix (including trailing separator) of the yearly CSVs.
    years : iterable of int, optional
        Years to process; defaults to 2006 and 2007.

    Raises
    ------
    FileNotFoundError
        If a yearly prediction file does not exist (raised by ``pd.read_csv``).
    """
    fields = ['Author', 'Probability', 'Label']
    # Order matters: it defines the Class1/Class2/Class3 columns of the output.
    labels = ('non_toxic', 'highly_toxic', 'slightly_toxic')

    print(collection)
    for year in years:
        print(year)
        yearFile = pred_dir+str(year)+'.csv'
        output = results_dir+"rawToxicityCounts-"+collection+"-"+str(year)+".csv"

        # Read the input first so a missing/unreadable file does not leave a
        # header-only output behind.
        df = pd.read_csv(yearFile, usecols=fields, dtype=df_dtype)

        # 'w', not 'a': re-running must overwrite the file. Appending would add
        # a second header line and duplicate rows, corrupting later reads.
        with open(output, 'w') as out:
            out.write("Username,Class1,Count1,Class2,Count2,Class3,Count3\n")

            # observed=True: only iterate authors that actually occur in the data.
            for user, user_rows in df.groupby('Author', observed=True):
                # Non-null Probability count per label for this author.
                per_label = user_rows.groupby('Label')['Probability'].count()
                cells = [str(user)]
                for label in labels:
                    cells.append(label)
                    cells.append(str(int(per_label.get(label, 0))))
                out.write(','.join(cells)+'\n')

        # Free the yearly frame before the next (potentially large) read.
        del df
        gc.collect()

def combineJudgments(collection,years=range(2006,2008)):
    """Merge the yearly raw-count files into one per-user total.

    Reads ``<results_dir>rawToxicityCounts-<collection>-<year>.csv`` for every
    year in ``years``, sums ``Count1``/``Count2``/``Count3`` per ``Username``
    and writes the result to ``<results_dir><collection>Judgments-rawCounts.csv``.

    Parameters
    ----------
    collection : str
        Name used in the input and output file names.
    years : iterable of int, optional
        Years to combine; defaults to 2006 and 2007.
    """
    count_cols = ['Count1', 'Count2', 'Count3']

    # Collect the yearly frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and is quadratic when called in a loop.
    frames = []
    for year in years:
        print(year)
        frames.append(pd.read_csv(results_dir+"rawToxicityCounts-"+collection+"-"+str(year)+".csv"))
    merged = pd.concat(frames, ignore_index=True)
    del frames

    # Sum only the numeric count columns; the Class* label columns are constant
    # and are re-attached below.
    myGroup = merged.groupby('Username',as_index=False)[count_cols].sum()
    myGroup['Class1'] = 'non_toxic'
    myGroup['Class2'] = 'highly_toxic'
    myGroup['Class3'] = 'slightly_toxic'
    myGroup = myGroup[['Username','Class1','Count1','Class2','Count2','Class3','Count3']]
    print(myGroup.head())
    print(len(myGroup))

    output = results_dir+collection+'Judgments-rawCounts.csv'
    myGroup.to_csv(output,index=False)

    del myGroup
    del merged
    gc.collect()

def computeQuartiles(collection):
    """Print a quartile breakdown of per-user toxicity percentages.

    Reads ``<results_dir><collection>Judgments-rawCounts.csv``, computes for
    each user the rounded percentage ``(Count2 + Count3) / (Count1 + Count2 +
    Count3) * 100``, keeps users whose percentage is above zero, bins them into
    (up to) four quantile groups and prints each group, its size and its share
    of the retained users.

    Parameters
    ----------
    collection : str
        Name used in the input file name.
    """
    ddf = pd.read_csv(results_dir+collection+'Judgments-rawCounts.csv')
    ddf['totals'] = ddf['Count1']+ddf['Count2']+ddf['Count3']
    ddf['Toxicity'] = (((ddf['Count2']+ddf['Count3'])/ddf['totals'])*100).round()
    print(ddf.head())

    ddf = ddf[['Username','Toxicity']]

    # Keep users with a non-zero toxicity percentage. The comparison also drops
    # NaN (users whose total is 0), which qcut/groupby would ignore anyway.
    df = ddf[ddf['Toxicity'] > 0]
    if df.empty:
        # Nothing to bin; qcut cannot work on an empty series.
        print("No users with non-zero toxicity for "+collection)
        return
    print(df.describe())

    # duplicates='drop': rounded percentages often make quantile edges coincide
    # (e.g. many users at 100), which would otherwise raise
    # "Bin edges must be unique". Fewer than four bins may result.
    quartile_bins = pd.qcut(df['Toxicity'],4,duplicates='drop')
    # observed=False keeps empty bins in the size table.
    grouped = df.groupby(quartile_bins,observed=False)
    print(grouped.size())

    totals = []
    for key, item in grouped:
        # item.to_csv(collection+"-"+str(key)+".csv",index=False)
        totals.append(len(item))
        print(item, "\n\n")

    finSum = sum(totals)
    for i in totals:
        print(i,format((i/finSum)*100,'.2f'))

In [None]:
# Run the three pipeline stages for submissions first, then for comments.
for collection, pred_dir in (("Submission", pred_dir_subs), ("Comment", pred_dir_coms)):
    getRawCounts(collection, pred_dir)
    combineJudgments(collection)
    computeQuartiles(collection)

Submission
2006
2007
2006
2007
  Username     Class1  Count1        Class2  Count2          Class3  Count3
0   1Dunya  non_toxic       2  highly_toxic       0  slightly_toxic       0
1  3n7r0py  non_toxic      12  highly_toxic       0  slightly_toxic       0
2   60secs  non_toxic       5  highly_toxic       0  slightly_toxic       0
3   7thton  non_toxic       2  highly_toxic       0  slightly_toxic       0
4   9jack9  non_toxic     114  highly_toxic       3  slightly_toxic       7
844
  Username     Class1  Count1        Class2  Count2          Class3  Count3  \
0   1Dunya  non_toxic       2  highly_toxic       0  slightly_toxic       0   
1  3n7r0py  non_toxic      12  highly_toxic       0  slightly_toxic       0   
2   60secs  non_toxic       5  highly_toxic       0  slightly_toxic       0   
3   7thton  non_toxic       2  highly_toxic       0  slightly_toxic       0   
4   9jack9  non_toxic     114  highly_toxic       3  slightly_toxic       7   

   totals  Toxicity  
0       2   