In [183]:
import os
import pickle
import time

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from matplotlib import pyplot as plt
from torch import nn
from torch.utils.data import DataLoader, Dataset

In [187]:
#
# Execution parameters
#
# Every knob of the experiment lives in this cell; the later cells read these
# names as notebook-level globals.

# Folder holding the preprocessed CSV. Note the trailing "/": the loader cell
# inserts another "/" when it builds the CSV path.
FILES_PATH="preprocessed_datasets/"
# Dataset prefix; the loader cell appends "_preprocessed_toxicity.csv" to it.
DATASET_NAME="coronavirus_2021q1_all_preprocessed"
# Mini-batch size handed to both DataLoaders (2**14 = 16384 samples).
BATCH_SIZE=2**14
# Strategy used by train_test_split to pick the test rows.
TRAIN_TEST_METHOD="controlled_both" # random, controlled_users, controlled_subreddits, controlled_both
# Which model class train_toxicity_model instantiates.
MODEL_ARCHITECTURE="toxicity_simple" # toxicity_simple, toxicity_NCF, toxicity_BOW, toxicity_BERT_simple, toxicity_BERTwithembeddings
# When True, train rows whose user or subreddit never appears in the test set are dropped.
DROP_UNTESTED_USERS= False
# Use CUDA when it is available (see `device` below).
TRAIN_WITH_GPU = True
# When True, train_test_split keeps the same number of toxic and non-toxic
# rows per user (this also applies the DROP_UNTESTED_USERS filter).
BALANCE_TRAIN_USERS = True
# "regression" trains with MSELoss on the mean toxicity; "classification"
# binarises the labels and trains with BCEWithLogitsLoss.
TRAINING_GOAL="regression" #regression, classification

# Grid search parameters: every (n_factors, learning_rate, l2_reg) combination is trained.
learning_rates=[1e-3,1e-4,1e-5]
n_factors=[32,128,512]
l2_reg=[5e-4,1e-5,0]

# Number of epochs for each grid-search configuration.
epochs=100

# Falls back to the CPU when CUDA is missing or TRAIN_WITH_GPU is False.
device = torch.device("cuda" if torch.cuda.is_available() and TRAIN_WITH_GPU else "cpu")

In [189]:
#
# Read dataset and obtain the basic model inputs and outputs
#

# One row per comment; only author_id, subreddit_id and Toxicity are used here.
df=pd.read_csv(f'{FILES_PATH}/{DATASET_NAME}_preprocessed_toxicity.csv',encoding='UTF_8')

# Collapse comments into one row per (user, subreddit) interaction whose
# Toxicity is the mean over that pair's comments.
df=df.groupby(['author_id','subreddit_id'],as_index=False)['Toxicity'].mean()

# An interaction is (roughly) defined as toxic when its mean toxicity is > 0.
# The binarisation uses that same strict threshold so that it agrees with the
# ">0" / "<=0" tests used by the split, the metrics and the print below
# (a mean of exactly 0 is therefore non-toxic).
if TRAINING_GOAL=="classification":
    df['Toxicity']=(df['Toxicity']>0).astype(int)

# Percentage of toxic interactions (not comments!)
print(f"Toxic interactions: {(100*(df[df['Toxicity']>0].shape[0]/len(df['Toxicity']))):.2f}%")

# Primary inputs (author_id, subreddit_id) and outputs (toxicity)
x=df.drop(['Toxicity'],axis=1).to_numpy().astype(int)
y=df['Toxicity'].to_numpy()

# Sizes of the embedding tables built by the model classes.
# NOTE(review): using nunique() as the table size presumes the ids are
# contiguous in 0..n-1 - confirm against the preprocessing step.
n_users=df['author_id'].nunique()
n_subreddits=df['subreddit_id'].nunique()

Toxic interactions: 9.35%


In [190]:
#
# Data analysis and plots
#
# Only runs for the classification goal, where Toxicity has been binarised, so
# a per-user mean is the fraction of that user's interactions that are toxic.

if TRAINING_GOAL=="classification":

    # Note: this changes the default figure size for the rest of the session.
    plt.rcParams["figure.figsize"]=(3,3)

    #Plot histogram of Avg. Toxicity per user
    avg_toxicity_per_user=df.groupby(['author_id'],as_index=False)['Toxicity'].mean()
    avg_toxicity_per_user['Toxicity'].plot.hist()
    plt.show()

    #Obtain dataframe with (author_id, size, mean_toxicity). `size` counts the
    #rows of df per user, i.e. (user, subreddit) interactions rather than comments.
    no_comments_per_user=df.groupby(['author_id'],as_index=False).size()
    #Both frames come from the same groupby on author_id, so rows line up by position.
    no_comments_per_user["mean_toxicity"]=avg_toxicity_per_user["Toxicity"]

    #** Obtain dataframe with (interaction count, avg toxicity of the users with that interaction count)
    avg_toxicity_per_interaction_count=no_comments_per_user.groupby(['size'],as_index=False)['mean_toxicity'].mean()

    #Obtain dataframe with (interaction count, number of users with that interaction count)
    no_users_per_comment_count=no_comments_per_user.rename(columns={'size':'comment_count'}).groupby(['comment_count'],as_index=False).size()

    #Least-squares line fitted on the individual user points (interaction count vs. avg toxicity)
    m, b = np.polyfit(no_comments_per_user["size"].to_numpy(), avg_toxicity_per_user['Toxicity'].to_numpy(), 1)

    #Plot everything:

    #Left y-axis
    plt.plot(no_comments_per_user["size"].to_numpy(), avg_toxicity_per_user['Toxicity'].to_numpy(), 'o', alpha=0.005, c='black',label="Individual user data points\n(no. of interactions, avg. toxicity)")
    plt.plot(avg_toxicity_per_interaction_count["size"].to_numpy(), avg_toxicity_per_interaction_count['mean_toxicity'].to_numpy(), 'o', alpha=.8, c='red', label="Avg. toxicity of users with n interactions")
    #Regression line; it has no label, so it does not appear in the legend.
    plt.plot(no_comments_per_user["size"].to_numpy(), m*no_comments_per_user["size"].to_numpy() + b)

    #Hard-coded view: users with more than 25 interactions are cut off.
    plt.ylim((0,1))
    plt.xlim((0,25))

    plt.xlabel("(user,subreddit) interactions")
    plt.ylabel("Average interaction toxicity of user")

    #Legend of the left axis, anchored outside the plot area.
    plt.legend(bbox_to_anchor=(1.3, 1), loc="upper left")

    #Switch axis: from here on pyplot calls target the new right-hand y-axis.
    plt.twinx()

    #Right y-axis
    plt.plot(no_users_per_comment_count["comment_count"].to_numpy(),no_users_per_comment_count["size"].to_numpy(),color='green',label="No. of users per interaction count")

    #Hard-coded upper bound for the user count axis.
    plt.ylim((0,3000))

    plt.ylabel("No. of Users")

    #Legend of the right axis, placed below the first one.
    plt.legend(bbox_to_anchor=(1.3, 0.7), loc="upper left")

    plt.show()

In [191]:
#
# Define and perform train/test split, generate datasets
#

def train_test_split(df, random_state=None):
    """Split (author, subreddit) interactions into a train and a test frame.

    Throughout, an interaction is "toxic" when ``Toxicity > 0`` and non-toxic
    when ``Toxicity <= 0``; this works for both the binarised classification
    labels and the continuous regression labels.

    The test rows are chosen according to the notebook-level
    ``TRAIN_TEST_METHOD``:

    * ``"random"``: for every user with at least 10 interactions, 15% of
      their interactions (rounded down).
    * ``"controlled_users"``: for every user with at least 10 interactions and
      more than one interaction of each class, one toxic and one non-toxic row.
    * ``"controlled_subreddits"``: same as above, grouped by subreddit.
    * ``"controlled_both"``: users and subreddits with at least 10 interactions
      and more than two of each class are intersected; from the surviving rows
      as many toxic rows, and as many non-toxic rows, as there are surviving
      users are sampled. pandas raises ``ValueError`` if either class has
      fewer rows than that.

    Any other value selects no test rows.

    The train frame is ``df`` minus the test rows. It is then optionally
    restricted to users/subreddits present in the test set
    (``DROP_UNTESTED_USERS`` or ``BALANCE_TRAIN_USERS``) and, for
    ``BALANCE_TRAIN_USERS``, balanced per user and resampled with replacement
    to ten times its size.

    Args:
        df: frame with ``author_id``, ``subreddit_id`` and ``Toxicity`` columns.
        random_state: optional seed for reproducible sampling. ``None`` (the
            default) keeps the previous behaviour of drawing from NumPy's
            global random state.

    Returns:
        ``(train, test)`` DataFrames with the same columns as ``df``.
    """
    # None is forwarded untouched so that pandas keeps using NumPy's global RNG.
    rng = None if random_state is None else np.random.RandomState(random_state)

    def qualifies(group, min_per_class):
        """At least 10 rows and more than `min_per_class` rows of each class."""
        toxic = (group["Toxicity"]>0).sum()
        non_toxic = group.shape[0]-toxic
        return group.shape[0]>=10 and toxic>min_per_class and non_toxic>min_per_class

    def sample_records(frame, n):
        """Draw `n` rows without replacement and return them as a list of dicts."""
        return frame.sample(n, random_state=rng).to_dict(orient="records")

    def one_per_class(group):
        """One toxic and one non-toxic row of `group`."""
        return (sample_records(group[group["Toxicity"]>0], 1)
                + sample_records(group[group["Toxicity"]<=0], 1))

    user_groups=df.groupby('author_id')
    subreddit_groups=df.groupby('subreddit_id')

    test_records=[]

    # Approach "random": users with at least 10 interactions contribute 15% of
    # their interactions to the test set.
    if TRAIN_TEST_METHOD=="random":
        for _,group in user_groups:
            if group.shape[0]>=10:
                test_records += sample_records(group, int(group.shape[0]*0.15))

    # Approach "controlled_users": one toxic and one non-toxic interaction from
    # every user that has enough interactions of both classes.
    elif TRAIN_TEST_METHOD=="controlled_users":
        for _,group in user_groups:
            if qualifies(group, 1):
                test_records += one_per_class(group)

    # Approach "controlled_subreddits": same as above, per subreddit.
    elif TRAIN_TEST_METHOD=="controlled_subreddits":
        for _,group in subreddit_groups:
            if qualifies(group, 1):
                test_records += one_per_class(group)

    # Approach "controlled_both": restrict to rows whose user AND subreddit
    # both have enough interactions of each class, then sample a balanced set.
    elif TRAIN_TEST_METHOD=="controlled_both":
        valid_users=[user for user,group in user_groups if qualifies(group, 2)]
        valid_subreddits=[subreddit for subreddit,group in subreddit_groups if qualifies(group, 2)]

        print(f"Found {len(valid_users)} users  and {len(valid_subreddits)} that meet criteria")

        valid_rows = df[(df["author_id"].isin(valid_users)) & (df["subreddit_id"].isin(valid_subreddits))]

        print(f"Intersecting these users and subreddits, {valid_rows['author_id'].nunique()} and {valid_rows['subreddit_id'].nunique()} are preserved")

        per_class = valid_rows["author_id"].nunique()
        test_records += sample_records(valid_rows[valid_rows["Toxicity"]>0], per_class)
        test_records += sample_records(valid_rows[valid_rows["Toxicity"]<=0], per_class)

    # An empty selection still yields a frame with df's columns so that the
    # column lookups below keep working.
    test = pd.DataFrame(test_records) if test_records else df.iloc[0:0].copy()
    print(f"Total test samples: {test.shape[0]}")

    # Anti-join: rows present in both df and test appear twice in the
    # concatenation and keep=False removes every copy of them.
    train = pd.concat([df, test]).drop_duplicates(keep=False)

    if DROP_UNTESTED_USERS or BALANCE_TRAIN_USERS:
        train=train[(train["author_id"].isin(test["author_id"])) & (train["subreddit_id"].isin(test["subreddit_id"]))]
    if BALANCE_TRAIN_USERS:
        balanced_records=[]
        for _,group in train.groupby('author_id'):
            positives = (group["Toxicity"]>0).sum()
            negatives = group.shape[0]-positives
            per_class = min(positives,negatives)
            balanced_records += sample_records(group[group["Toxicity"]>0], per_class)
            balanced_records += sample_records(group[group["Toxicity"]<=0], per_class)

        train = pd.DataFrame(balanced_records, columns=train.columns)
        if not train.empty:
            # Bootstrap the balanced rows up to ten times their number.
            train = train.sample(train.shape[0]*10, replace=True, random_state=rng)
    return train, test

#Split train and test sets

train,test=train_test_split(df)

# Ratio of non-toxic to toxic training interactions; used as pos_weight of the
# classification loss.
toxic_labels_weight=(len(train['Toxicity'])-(train["Toxicity"]>0).sum())/(train["Toxicity"]>0).sum()
print(f"Applying a weight of {toxic_labels_weight:.2f} for positive samples in training loss")

# Column 0 is author_id and column 1 is subreddit_id (the order of df's columns).
X_train=train.drop(['Toxicity'],axis=1).to_numpy().astype(int)
X_test=test.drop(['Toxicity'],axis=1).to_numpy().astype(int)

y_train=train['Toxicity'].to_numpy()
y_test=test['Toxicity'].to_numpy()

# Convert the integer ids directly to int32. torch.Tensor(...) would first
# build a float32 tensor, which cannot represent every integer above 2**24 and
# would silently alter large ids.
X_train, X_test = torch.as_tensor(X_train, dtype=torch.int32).to(device), torch.as_tensor(X_test, dtype=torch.int32).to(device)
y_train, y_test = torch.as_tensor(y_train, dtype=torch.float32).to(device), torch.as_tensor(y_test, dtype=torch.float32).to(device)

Found 1125 users  and 259 that meet criteria
Intersecting these users and subreddits, 1125 and 255 are preserved
Total test samples: 2250
Applying a weight of 1.00 for positive samples in training loss


In [192]:
#
# Load additional data depending on architecture
#
# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load these files if they were produced by a trusted preprocessing run.


if MODEL_ARCHITECTURE=="toxicity_BOW":

    #Load users' and subreddits' Bag of Words (files are closed right after reading)
    with open("preprocessed_datasets/coronavirus_2021q1_all_preprocessed_USERS_BAG_OF_WORDS","rb") as bow_file:
        user_bows=pickle.load(bow_file)
    with open("preprocessed_datasets/coronavirus_2021q1_all_preprocessed_SUBREDDIT_BAG_OF_WORDS","rb") as bow_file:
        subreddit_bows=pickle.load(bow_file)

    #We're gonna work with binary vectors for now: any count above 1 is clipped to 1
    user_bows[user_bows>1]=1
    subreddit_bows[subreddit_bows>1]=1

    user_bows=torch.Tensor(user_bows).float().to(device)
    subreddit_bows=torch.Tensor(subreddit_bows).float().to(device)

if MODEL_ARCHITECTURE in ["toxicity_BERT_simple","toxicity_BERTwithembeddings"]:
    #Comment-level data is needed again because df was aggregated per (user, subreddit)
    df_raw=pd.read_csv(f'{FILES_PATH}/{DATASET_NAME}_preprocessed_toxicity.csv',encoding='UTF_8')

    #Drop every comment that belongs to a test (user, subreddit) pair so that the
    #averaged embeddings are built without the held-out interactions
    df_raw = df_raw[~(df_raw[['author_id','subreddit_id']].apply(tuple,axis=1).isin(test[['author_id','subreddit_id']].apply(tuple,axis=1)))]
    with open("BERT_EMBEDDINGS/MARCH_21",'rb') as embeddings_file:
        embeddings=pickle.load(embeddings_file)

    #Sanity check: the counts after filtering should match n_users / n_subreddits.
    #NOTE(review): the models index bertavg_users / bertavg_subreddits by id, so a
    #user or subreddit that lost all its comments above would shift the rows - confirm
    print(n_users)
    print(n_subreddits)

    print(df_raw["author_id"].nunique(),df_raw["subreddit_id"].nunique())

    #Average the embeddings of each user's comments (rows follow the sorted author_id order)
    commentlists_user=df_raw.groupby('author_id')['comment_id'].apply(list).reset_index(name="comment_ids")
    bertavg_users=np.stack(commentlists_user["comment_ids"].apply(lambda x: np.average(embeddings[x,:],axis=0)).to_numpy(),axis=0)

    #Same for each subreddit (rows follow the sorted subreddit_id order)
    commentlists_subreddit=df_raw.groupby('subreddit_id')['comment_id'].apply(list).reset_index(name="comment_ids")
    bertavg_subreddits=np.stack(commentlists_subreddit["comment_ids"].apply(lambda x: np.average(embeddings[x,:],axis=0)).to_numpy(),axis=0)

    bertavg_users=torch.Tensor(bertavg_users).float().to(device)
    bertavg_subreddits=torch.Tensor(bertavg_subreddits).float().to(device)

    print(bertavg_subreddits.shape)
    print(bertavg_users.shape)
    

In [194]:
#
# Define the model architecture
#

class ToxicitySimple(nn.Module):
    """Matrix-factorisation scorer with one embedding per user and per subreddit.

    The score of a (user, subreddit) pair is the dot product of their two
    ``d``-dimensional embeddings. No activation is applied, so the output is a
    raw score that the training loop feeds to MSELoss or BCEWithLogitsLoss.
    The table sizes come from the notebook-level ``n_users`` / ``n_subreddits``.
    """

    def __init__(self,d):
        super().__init__()

        # The only trainable parameters are the two embedding tables.
        self.u = nn.Embedding(n_users,d)        # author_id    -> d factors
        self.m = nn.Embedding(n_subreddits,d)   # subreddit_id -> d factors

        # Replace the default initialisation with small uniform values.
        for table in (self.u, self.m):
            table.weight.data.uniform_(-0.1, 0.1)

    def forward(self, x):
        """Score a batch; column 0 of `x` holds author ids, column 1 subreddit ids."""
        user_vectors = self.u(x[:,0])
        subreddit_vectors = self.m(x[:,1])

        # Row-wise dot product -> one score per sample, shape (batch,).
        return torch.sum(user_vectors*subreddit_vectors, dim=1)

#References: https://towardsdatascience.com/paper-review-neural-collaborative-filtering-explanation-implementation-ea3e031b7f96
class ToxicityNCF(nn.Module):
    """Neural collaborative filtering scorer with an MF branch and an MLP branch.

    Each branch has its own user and subreddit embeddings of size ``d``:

    * MF branch: element-wise product of the embeddings, then two FC layers
      (d -> d//2 -> d//4).
    * MLP branch: concatenated embeddings, then three FC layers
      (2d -> d -> d//2 -> d//4).

    Both branch outputs are concatenated and mapped to a single raw score by
    ``neumf``. The embedding table sizes come from the notebook-level
    ``n_users`` / ``n_subreddits``.
    """

    def __init__(self,d):
        super(ToxicityNCF,self).__init__()

        # Separate embedding tables for the two branches.
        self.u_mf = nn.Embedding(n_users,d)                 #Embedding author_id -> d factors (MF branch)
        self.u_mlp = nn.Embedding(n_users,d)                #Embedding author_id -> d factors (MLP branch)
        self.m_mf = nn.Embedding(n_subreddits,d)            #Embedding subreddit_id -> d factors (MF branch)
        self.m_mlp = nn.Embedding(n_subreddits,d)           #Embedding subreddit_id -> d factors (MLP branch)

        #Initialize weights
        self.u_mf.weight.data.uniform_(-0.5, 0.5)
        self.u_mlp.weight.data.uniform_(-0.5, 0.5)

        self.m_mf.weight.data.uniform_(-0.5, 0.5)
        self.m_mlp.weight.data.uniform_(-0.5, 0.5)

        self.fc_mf_1 = nn.Linear(d,d//2)
        self.fc_mf_2 = nn.Linear(d//2,d//4)

        self.fc_mf_1.weight.data.uniform_(-0.5, 0.5)
        self.fc_mf_2.weight.data.uniform_(-0.5, 0.5)

        self.fc_mlp_1 = nn.Linear(d*2,d)
        self.fc_mlp_2 = nn.Linear(d,d//2)
        self.fc_mlp_3 = nn.Linear(d//2,d//4)

        self.fc_mlp_1.weight.data.uniform_(-0.5, 0.5)
        self.fc_mlp_2.weight.data.uniform_(-0.5, 0.5)
        self.fc_mlp_3.weight.data.uniform_(-0.5, 0.5)

        # The head consumes both branch outputs, each d//4 wide. Using
        # 2*(d//4) instead of d//2 keeps the shapes consistent when d is not a
        # multiple of 4 (the two expressions are equal when it is).
        self.neumf = nn.Linear(2*(d//4),1)
        self.neumf.weight.data.uniform_(-0.5, 0.5)

        # Module dropout follows train()/eval(), so it is inactive during evaluation.
        self.dropout=nn.Dropout(0.1)

    def forward(self, x):
        """Score a batch; column 0 of `x` holds author ids, column 1 subreddit ids.

        Returns a tensor of shape (batch, 1) with one raw score per sample.
        """
        users, subreddits = x[:,0] , x[:,1] #Get author_id and subreddit_id from input

        u_mf, m_mf = self.u_mf(users), self.m_mf(subreddits)       #Embeddings for the MF branch
        u_mlp, m_mlp = self.u_mlp(users), self.m_mlp(subreddits)   #Embeddings for the MLP branch

        mf = self.dropout(F.relu(self.fc_mf_1(u_mf*m_mf)))
        mf = self.dropout(F.relu(self.fc_mf_2(mf)))

        mlp = self.dropout(F.relu(self.fc_mlp_1(torch.cat((u_mlp,m_mlp),1))))
        mlp = self.dropout(F.relu(self.fc_mlp_2(mlp)))
        mlp = self.dropout(F.relu(self.fc_mlp_3(mlp)))

        neumf = self.neumf(torch.cat((mf,mlp),1))

        return neumf

class ToxicityBOW(nn.Module):
    """Dot-product scorer that augments id embeddings with bag-of-words features.

    Every user and every subreddit is described by the concatenation of a
    learned ``d``-dimensional id embedding and a ``d``-dimensional linear
    projection of its bag-of-words vector. The score is the dot product of the
    two ``2d``-dimensional descriptions, returned as a raw value.

    The bag-of-words matrices are the notebook-level ``user_bows`` and
    ``subreddit_bows``, looked up by id, and the table sizes are the
    notebook-level ``n_users`` / ``n_subreddits``.
    """

    def __init__(self,d):
        super().__init__()

        # Two embedding tables and two linear projections are trained.
        self.u = nn.Embedding(n_users,d)                  # author_id     -> d factors
        self.m = nn.Embedding(n_subreddits,d)             # subreddit_id  -> d factors
        self.u1 = nn.Linear(user_bows.shape[1],d)         # author bow    -> d factors
        self.m1 = nn.Linear(subreddit_bows.shape[1],d)    # subreddit bow -> d factors

        # Small uniform initialisation for every weight matrix (biases keep their defaults).
        for layer in (self.u, self.m, self.u1, self.m1):
            layer.weight.data.uniform_(-0.1, 0.1)

    def forward(self, x):
        """Score a batch; column 0 of `x` holds author ids, column 1 subreddit ids.

        Returns a tensor of shape (batch, 1).
        """
        user_ids = x[:,0]
        subreddit_ids = x[:,1]

        # Bag-of-words rows are fetched by id from the global matrices.
        user_words = user_bows[user_ids.long()]
        subreddit_words = subreddit_bows[subreddit_ids.long()]

        # [id embedding | projected bag of words] for each side.
        user_repr = torch.cat((self.u(user_ids), self.u1(user_words)), 1)
        subreddit_repr = torch.cat((self.m(subreddit_ids), self.m1(subreddit_words)), 1)

        # Row-wise dot product, kept as a column vector.
        scores = torch.sum(user_repr * subreddit_repr, dim=1)
        return scores.view(-1, 1)

class ToxicityBERTSimple(nn.Module):
    """MLP scorer over the averaged BERT embeddings of a user and a subreddit.

    The averaged embeddings are read by id from the notebook-level
    ``bertavg_users`` / ``bertavg_subreddits`` tensors, projected to ``d``
    dimensions each, concatenated and passed through a two-layer head that
    outputs one raw score per sample.

    The id embeddings ``u`` and ``m`` are created but not used by ``forward``;
    they are kept so that the module's parameters and state dict stay the same.
    """

    def __init__(self,d):
        super(ToxicityBERTSimple,self).__init__()

        self.u1 = nn.Linear(bertavg_users.shape[1],d)       #FC        author avg. BERT embedding -> d factors
        self.m1 = nn.Linear(bertavg_subreddits.shape[1],d)  #FC        subreddit avg. BERT embedding -> d factors

        self.u = nn.Embedding(n_users,d)                #Embedding author_id -> d factors (unused in forward)
        self.m = nn.Embedding(n_subreddits,d)           #Embedding subreddit_id-> d factors (unused in forward)

        self.fc1 = nn.Linear(2*d, d//2)
        self.fc2 = nn.Linear(d//2, 1)

    def forward(self, x):
        """Score a batch; column 0 of `x` holds author ids, column 1 subreddit ids.

        Returns a tensor of shape (batch, 1).
        """
        ubert, sbert = bertavg_users[x[:,0].long()], bertavg_subreddits[x[:, 1].long()] #Get averaged BERT embeddings by id
        u1, m1 = self.u1(ubert), self.m1(sbert) #Reduce their dimensionality to d

        # F.dropout defaults to training=True, which would keep dropping units
        # after model.eval(); tie it to the module's mode instead.
        conc = torch.cat((u1,m1),1)
        conc = F.dropout(F.relu(conc),0.1,training=self.training)
        conc = F.dropout(F.relu(self.fc1(conc)),0.1,training=self.training)
        conc = self.fc2(conc)

        return conc
class ToxicityBERTWithEmbeddings(nn.Module):
    """Scorer combining averaged BERT embeddings with learned id embeddings.

    Two branches, each producing ``d//2`` features:

    * BERT branch: the averaged embeddings (read by id from the notebook-level
      ``bertavg_users`` / ``bertavg_subreddits``) are projected to ``d``
      dimensions each, concatenated and reduced by ``fc1``.
    * Embedding branch: the element-wise product of the user and subreddit id
      embeddings is reduced by ``fc_11``.

    The concatenated branches are mapped to one raw score by ``final``.
    """

    def __init__(self,d):
        super(ToxicityBERTWithEmbeddings,self).__init__()

        self.u1 = nn.Linear(bertavg_users.shape[1],d)       #FC        author avg. BERT embedding -> d factors
        self.m1 = nn.Linear(bertavg_subreddits.shape[1],d)  #FC        subreddit avg. BERT embedding -> d factors

        self.u = nn.Embedding(n_users,d)                #Embedding author_id -> d factors
        self.m = nn.Embedding(n_subreddits,d)           #Embedding subreddit_id-> d factors

        self.fc1 = nn.Linear(2*d, d//2)

        self.fc_11 = nn.Linear(d, d//2)

        # The head sees two branches of d//2 features each. 2*(d//2) equals d
        # for even d and keeps the shapes consistent for odd d.
        self.final = nn.Linear(2*(d//2),1)

    def forward(self, x):
        """Score a batch; column 0 of `x` holds author ids, column 1 subreddit ids.

        Returns a tensor of shape (batch, 1).
        """
        ubert, sbert = bertavg_users[x[:,0].long()], bertavg_subreddits[x[:, 1].long()] #Get averaged BERT embeddings by id
        u1, m1 = self.u1(ubert), self.m1(sbert) #Reduce their dimensionality to d

        # F.dropout defaults to training=True, which would keep dropping units
        # after model.eval(); tie it to the module's mode instead.
        conc = torch.cat((u1,m1),1)
        conc = F.dropout(F.relu(conc),0.1,training=self.training)
        conc = F.dropout(F.relu(self.fc1(conc)),0.1,training=self.training)

        u, m = self.u(x[:,0]), self.m(x[:, 1])
        mul = F.dropout(F.relu(self.fc_11(u*m)),0.1,training=self.training)

        conc = self.final(torch.cat((conc,mul),1))

        return conc


In [166]:
#
# One-class SVM tests
#

# from sklearn.svm import OneClassSVM

# x_trainusers_bert = bertavg_users[X_train.cpu().numpy()[:,0]]
# x_trainsubs_bert = bertavg_subreddits[X_train.cpu().numpy()[:,1]]

# svm_train_set = np.concatenate((x_trainusers_bert.cpu().numpy(),x_trainsubs_bert.cpu().numpy()),axis=1)



In [167]:
from sklearn.feature_selection import SelectPercentile, mutual_info_regression

# print(svm_train_set.shape)
# #Ref: https://medium.com/analytics-vidhya/feature-selection-using-scikit-learn-5b4362e0c19b
# #Ref: https://towardsdatascience.com/5-feature-selection-method-from-scikit-learn-you-should-know-ed4d116e4172
# selector = SelectPercentile(mutual_info_regression, percentile=20)
# selector.fit(svm_train_set,y_train.cpu().numpy())


In [168]:

# #Reference: https://www.datatechnotes.com/2020/04/anomaly-detection-with-one-class-svm.html
# selected_chars = np.squeeze(svm_train_set[:,np.where(selector.get_support()==True)])
# print(selected_chars.shape)

# svm = OneClassSVM(kernel='rbf', gamma='scale', nu=1/toxic_labels_weight, verbose=True, cache_size=1600)
# svm.fit(selected_chars)

In [169]:
# predictions = svm.predict(selected_chars)

In [170]:
# predictions[predictions==1]=0
# predictions[predictions==-1]=1

# labels=y_train.cpu().numpy()

# tp = np.sum(np.logical_and(predictions,labels))
# tn = np.sum(np.logical_and(np.logical_not(predictions),np.logical_not(labels)))
# fn = np.sum(np.logical_and(np.logical_not(predictions),labels))
# fp = np.sum(np.logical_and(predictions,np.logical_not(labels)))

# print(tp, fp, fn, tn)

# print(f"Accuracy: {(tn+tp)/(tn+tp+fn+fp)}")

In [171]:
# x_testusers_bert = bertavg_users[X_test.cpu().numpy()[:,0]]
# x_testsubs_bert = bertavg_subreddits[X_test.cpu().numpy()[:,1]]

# svm_test_set = (x_testusers_bert.cpu()*x_testsubs_bert.cpu()).numpy()
# selected_chars_test = np.squeeze(svm_test_set[:,np.where(selector.get_support()==True)])

# predictions_test = svm.predict(selected_chars_test)


In [172]:
# predictions=predictions_test.copy()


# print(predictions.shape,labels.shape)
# predictions[predictions==1]=0
# predictions[predictions==-1]=1

# labels=y_test.cpu().numpy()

# tp = np.sum(np.logical_and(predictions,labels))
# tn = np.sum(np.logical_and(np.logical_not(predictions),np.logical_not(labels)))
# fn = np.sum(np.logical_and(np.logical_not(predictions),labels))
# fp = np.sum(np.logical_and(predictions,np.logical_not(labels)))

# print(tp, fp, fn, tn)

# print(f"Accuracy: {(tn+tp)/(tn+tp+fn+fp)}")

In [197]:
#
# Create the Dataset and Dataloader objects for train and test sets
#

class ToxicityDataset(Dataset):
    """Training set of (inputs, label) pairs for the DataLoader.

    Args:
        x: optional tensor/array of inputs, one row per sample. Defaults to
            the notebook-level ``X_train``.
        y: optional tensor/array of labels. Defaults to the notebook-level
            ``y_train``.

    Raises:
        ValueError: if ``x`` and ``y`` do not hold the same number of samples.
    """
    def __init__(self, x=None, y=None):
        # Calling ToxicityDataset() with no arguments keeps using the globals.
        self.x=X_train if x is None else x
        self.y=y_train if y is None else y
        if len(self.x)!=len(self.y):
            raise ValueError(f"x has {len(self.x)} samples but y has {len(self.y)}")
        self.n_samples=len(self.x)
    def __getitem__(self, index):
        return self.x[index],self.y[index]
    def __len__(self):
        return self.n_samples

class TestDataset(Dataset):
    """Test set of (inputs, label) pairs for the DataLoader.

    Args:
        x: optional tensor/array of inputs, one row per sample. Defaults to
            the notebook-level ``X_test``.
        y: optional tensor/array of labels. Defaults to the notebook-level
            ``y_test``.

    Raises:
        ValueError: if ``x`` and ``y`` do not hold the same number of samples.
    """
    def __init__(self, x=None, y=None):
        # Calling TestDataset() with no arguments keeps using the globals.
        self.x=X_test if x is None else x
        self.y=y_test if y is None else y
        if len(self.x)!=len(self.y):
            raise ValueError(f"x has {len(self.x)} samples but y has {len(self.y)}")
        self.n_samples=len(self.x)
    def __getitem__(self, index):
        return self.x[index],self.y[index]
    def __len__(self):
        return self.n_samples

# Training loader: reshuffled every epoch. The underlying tensors were already
# moved to `device` in the split cell, so batches need no further transfer.
train_data=ToxicityDataset()
dataloader=DataLoader(dataset=train_data,batch_size=BATCH_SIZE,shuffle=True)

# Test loader. Shuffling is not needed here: the evaluation only accumulates
# sums over all batches, so the order of the samples does not matter.
test_data=TestDataset()
test_dataloader=DataLoader(dataset=test_data,batch_size=BATCH_SIZE,shuffle=True)

# Share of samples with a label > 0 in each split (the tensors are copied to the CPU first).
print(f"Toxic interactions (positive samples) in train test: {np.average((y_train>0).cpu())*100:.2f}%")
print(f"Toxic interactions (positive samples) in test test: {np.average((y_test>0).cpu())*100:.2f}%")


Toxic interactions (positive samples) in train test: 49.99%
Toxic interactions (positive samples) in test test: 50.00%


In [200]:
#
# Define the training cycle for the model
#
# Report the runtime environment (GPU availability, PyTorch build and its CUDA version).
print(f"CUDA availability: {torch.cuda.is_available()}")
print(f"Using pytorch version {torch.__version__}, {torch.version.cuda}")
def train_toxicity_model(n_factors=64,learning_rate=1e-3,l2_reg=1e-5,epochs=10):
    """Train one model configuration and record per-epoch train/test metrics.

    The architecture, loss and data come from notebook-level names:
    ``MODEL_ARCHITECTURE`` selects the model class, ``TRAINING_GOAL`` the loss
    (MSELoss for "regression", BCEWithLogitsLoss weighted by
    ``toxic_labels_weight`` for "classification"), and ``dataloader`` /
    ``test_dataloader`` provide the batches. Samples are counted as positive
    when their value is > 0, for both the model outputs and the labels.

    Losses and accuracies are averaged over the samples actually yielded by
    the loaders.

    Args:
        n_factors: embedding / hidden size ``d`` passed to the model class.
        learning_rate: Adam learning rate.
        l2_reg: Adam ``weight_decay`` (acts as L2 regularisation).
        epochs: number of passes over the training loader.

    Returns:
        A tuple ``(losses, losses_test, accuracies, accuracies_test, TPRs,
        TPRs_test)`` of lists with one entry per epoch; accuracies and true
        positive rates are percentages.

    Raises:
        ValueError: if ``MODEL_ARCHITECTURE`` or ``TRAINING_GOAL`` holds an
            unknown value.
    """
    print(f"Current parameters are d:{n_factors} | L.Rate:{learning_rate} | L2 Reg:{l2_reg}")

    def percentage(part, whole):
        """100 * part / whole, or NaN when `whole` is zero."""
        return 100 * part / whole if whole else float("nan")

    def confusion(scores, targets):
        """Return (tp, fp, fn, tn) treating values > 0 as the positive class."""
        predicted = scores.detach() > 0.0
        actual = targets.detach() > 0.0
        return (
            int((predicted & actual).sum()),
            int((predicted & ~actual).sum()),
            int((~predicted & actual).sum()),
            int((~predicted & ~actual).sum()),
        )

    losses=[]
    losses_test=[]
    accuracies=[]
    accuracies_test=[]
    TPRs=[]
    TPRs_test=[]

    #Initialize model
    if MODEL_ARCHITECTURE=="toxicity_simple":
        model = ToxicitySimple(n_factors).to(device)
    elif MODEL_ARCHITECTURE=="toxicity_NCF":
        model = ToxicityNCF(n_factors).to(device)
    elif MODEL_ARCHITECTURE=="toxicity_BOW":
        model = ToxicityBOW(n_factors).to(device)
    elif MODEL_ARCHITECTURE=="toxicity_BERT_simple":
        model = ToxicityBERTSimple(n_factors).to(device)
    elif MODEL_ARCHITECTURE=="toxicity_BERTwithembeddings":
        model = ToxicityBERTWithEmbeddings(n_factors).to(device)
    else:
        raise ValueError(f"Unknown MODEL_ARCHITECTURE: {MODEL_ARCHITECTURE!r}")

    #Model configuration
    optimizer=torch.optim.Adam(model.parameters(), learning_rate, weight_decay=l2_reg) #weight_decay acts as L2 regularization

    # The loss objects do not change between batches, so they are built once.
    #References: https://stackoverflow.com/questions/71462326/pytorch-bcewithlogitsloss-calculating-pos-weight
    # https://discuss.pytorch.org/t/bceloss-vs-bcewithlogitsloss/33586 (apparently more numerically stable than BCELoss)
    if TRAINING_GOAL=="classification":
        criterion_train = torch.nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([toxic_labels_weight]).to(device))
        if TRAIN_TEST_METHOD in ["controlled_users","controlled_subreddits","controlled_both"]:
            # The controlled splits sample one row per class, so no class weighting on the test loss.
            criterion_test = torch.nn.BCEWithLogitsLoss()
        else:
            criterion_test = criterion_train
    elif TRAINING_GOAL=="regression":
        criterion_train = torch.nn.MSELoss()
        criterion_test = torch.nn.MSELoss()
    else:
        raise ValueError(f"Unknown TRAINING_GOAL: {TRAINING_GOAL!r}")

    print("EPOCH\tLOSS_TRAIN\tLOSS_TEST\tACC_TRAIN\tACC_TEST\tTP\tFP\tFN\tTN\tTPR_TRAIN\tTPR_TEST")
    for epoch in range(int(epochs)):

        #
        # Training pass
        #
        model.train()

        loss_sum=0.0
        n_train=0
        tp=fp=fn=tn=0

        for inputs,labels in dataloader:
            optimizer.zero_grad()  # Setting our stored gradients equal to zero

            # reshape(-1) flattens (batch,) and (batch, 1) outputs alike and,
            # unlike squeeze, keeps a 1-sample batch one-dimensional.
            outputs = model(inputs).reshape(-1)

            loss = criterion_train(outputs, labels)
            loss_sum += loss.item()*inputs.size(0)
            n_train += inputs.size(0)

            loss.backward()  # Computes the gradient of the given tensor w.r.t. the weights/bias
            optimizer.step()  # Updates weights and biases with the optimizer (Adam)

            #https://discuss.pytorch.org/t/bcewithlogitsloss-and-model-accuracy-calculation/59293
            batch_tp,batch_fp,batch_fn,batch_tn = confusion(outputs, labels)
            tp += batch_tp
            fp += batch_fp
            fn += batch_fn
            tn += batch_tn

        TPRs.append(percentage(tp, tp+fn))
        accuracy = percentage(tp+tn, n_train)
        loss_train = loss_sum/n_train if n_train else float("nan")

        losses.append(loss_train)
        accuracies.append(accuracy)

        #
        # Evaluation pass
        #
        with torch.no_grad():
            model.eval()

            loss_sum=0.0
            n_test=0
            tp=fp=fn=tn=0

            for test_inputs, test_labels in test_dataloader:
                outputs_test = model(test_inputs).reshape(-1)

                loss_sum += criterion_test(outputs_test, test_labels).item()*test_inputs.size(0)
                n_test += test_inputs.size(0)

                batch_tp,batch_fp,batch_fn,batch_tn = confusion(outputs_test, test_labels)
                tp += batch_tp
                fp += batch_fp
                fn += batch_fn
                tn += batch_tn

            accuracy_test = percentage(tp+tn, n_test)
            loss_test = loss_sum/n_test if n_test else float("nan")

            losses_test.append(loss_test)
            accuracies_test.append(accuracy_test)
            TPRs_test.append(percentage(tp, tp+fn))

            # Progress line, overwritten in place; TP/FP/FN/TN are the test counts.
            print(f"{epoch}\t{loss_train:.7f}\t{loss_test:.7f}\t{accuracy:.2f}\t\t{accuracy_test:.2f}\t\t{tp}\t{fp}\t{fn}\t{tn}\t{TPRs[-1]:.2f}\t\t{TPRs_test[-1]:.2f}",end="\r")

    # Repeat the last epoch's line with a newline so that it stays visible.
    if losses:
        print(f"{epoch}\t{loss_train:.7f}\t{loss_test:.7f}\t{accuracy:.2f}\t\t{accuracy_test:.2f}\t\t{tp}\t{fp}\t{fn}\t{tn}\t{TPRs[-1]:.2f}\t{TPRs_test[-1]:.2f}")
    return losses,losses_test,accuracies,accuracies_test, TPRs, TPRs_test



CUDA availability: True
Using pytorch version 1.12.1+cu113, 11.3


In [201]:
#
# Perform grid search
#

# Figures are saved in pdf format in a folder specific to the configuration. One figure is created
# for each *d* used, with one subfigure for each (lr, L2 reg) combination.

# The timestamp makes every run write to its own folder.
directory_path = f"grid_search/{MODEL_ARCHITECTURE}_split{TRAIN_TEST_METHOD}_balancetrain{BALANCE_TRAIN_USERS}_{time.time()}"
print(directory_path)
os.makedirs(directory_path,exist_ok=True)

for d in n_factors:
    # 1-based index of the current subplot within this figure.
    i=1

    # 20 wide; the height is scaled by the ratio between the shorter and the longer grid dimension.
    plt.figure(figsize=(20,(20*min(len(learning_rates),len(l2_reg)))/max(len(learning_rates),len(l2_reg))))

    for lr in learning_rates:
        for reg in l2_reg:
            # Trains a fresh model for this configuration.
            losses,losses_test,accuracies,accuracies_test,TPRs,TPRs_test = train_toxicity_model(n_factors=d,learning_rate=lr,l2_reg=reg,epochs=epochs)

            #Plot current training iteration: rows are learning rates, columns are L2 values
            plt.subplot(len(learning_rates),len(l2_reg),i)
            plt.title(f"d={d} | lr={lr} | l2-reg={reg}",fontdict={'fontsize': 12})

            plt.xticks(np.arange(0,epochs+1,20),fontsize=12)
            plt.yticks(np.arange(0,5,0.5),fontsize=12)

            # Left axis (loss) is fixed to [0, 5]; larger losses fall outside the plot.
            plt.ylim(0,5)
            
            plt.plot(np.arange(0,epochs,1),losses, color="red",alpha=.25,label="Train Loss") #Train loss evolution
            plt.plot(np.arange(0,epochs,1),losses_test, color="blue",alpha=.25,label="Test Loss") #Test loss evolution

            # Legends are drawn on the first subplot only.
            if i==1: plt.legend(loc="upper left")

            plt.twinx() #Swap axis: the right axis shows percentages

            plt.yticks(np.arange(30,101,10),fontsize=12)

            # Right axis (accuracy / TPR) is fixed to [30, 100] percent.
            plt.ylim(30,100)

            plt.plot(np.arange(0,epochs,1),accuracies, color="red", label="Train Acc")         #Train acc evolution
            plt.plot(np.arange(0,epochs,1),accuracies_test, color="blue", label="Test Acc")   #Test acc evolution

            plt.plot(np.arange(0,epochs,1),TPRs, '--', color="red", label="Train TPR (%)" , alpha=.3)   # Train TPR (%) evolution
            plt.plot(np.arange(0,epochs,1),TPRs_test, '--', color="blue", label="Test TPR (%)", alpha=.3)   #Test TPR (%) evolution

            if i==1: plt.legend(loc="center left")
            i+=1

    # One pdf for each value of d.
    plt.tight_layout()
    plt.savefig(f"{directory_path}/d_{d}.pdf")

grid_search/toxicity_simple_splitcontrolled_both_balancetrainTrue_1666218970.7809894
Current parameters are d:32 | L.Rate:0.001 | L2 Reg:0.0005
EPOCH	LOSS_TRAIN	LOSS_TEST	ACC_TRAIN	ACC_TEST	TP	FP	FN	TN	TPR_TRAIN	TPR_TEST
77	6.6244010	18.5400620	91.82		49.47		578	590	547	535	88.26		51.383

KeyboardInterrupt: 

<Figure size 2000x2000 with 0 Axes>