In [6]:
import pandas as pd
import numpy as np
import os, sys
import warnings
import random

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

# f = os.path.dirname(__file__)
sys.path.append(os.path.join(os.getcwd(), "../.."))
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, auc, roc_curve, \
    balanced_accuracy_score, precision_recall_curve, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from detection.detection_utils.factory import create_dir_if_missing
from config.detection_config import user_level_execution_config, user_level_conf, post_level_execution_config

sns.set(rc={'figure.figsize': (10, 10)}, font_scale=1.4)
from scipy.optimize import minimize
from utils.my_timeit import timeit
from utils.general import init_log

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = optuna.samplers.TPESampler(**optuna.samplers.TPESampler.hyperopt_parameters())

logger = init_log("user_level_simple_models")

def expect_f1(y_true, y_prob, thres):
    idxs = np.where(y_prob >= thres)[0]
    tp = y_prob[idxs].sum()
    fp = len(idxs) - tp
    idxs = np.where(y_prob < thres)[0]
    fn = y_prob[idxs].sum()
    return 2*tp / (2*tp + fp + fn)

def optimal_threshold(y_true, y_prob):
    y_prob = np.sort(y_prob)[::-1]
    f1s = [expect_f1(y_true, y_prob, p) for p in y_prob]
    thres = y_prob[np.argmax(f1s)]
    return thres #, f1s 

def get_hs_count(current_preds, threshold=0.5):
    return len(current_preds[current_preds >= threshold])

os.chdir('/sise/home/tommarz/hate_speech_detection/')
os.getcwd()

'/sise/home/tommarz/hate_speech_detection'

In [9]:
dataset = user_level_execution_config["inference_data"]
logger.info(f"executing dataset {dataset}...")
model_name = post_level_execution_config["kwargs"]["model_name"] # new_bert_fine_tuning
user2pred = pd.read_parquet(f"detection/outputs/{dataset}/{model_name}/user_level/split_by_posts/no_text/")
user2pred['user_id'] = user2pred['user_id'].astype(int)
user2label_path = user_level_conf[dataset]["data_path"]
sep = ","
if user2label_path.endswith("tsv"):
    sep = "\t"
y = pd.read_csv(user2label_path, sep=sep, index_col=[0]).squeeze()
# user2pred = user2pred[user2pred['user_id'].isin(labeled_users.index)]
X = user2pred[user2pred['user_id'].isin(y.index)]
seed = random.randrange(2 ** 32)
# seed = 2334642105 #42 #338761188

predictions_output_path = os.path.join(post_level_execution_config["evaluation"]["output_path"], 'predictions.tsv')
predictions_df = pd.read_csv(predictions_output_path, sep='\t')
y_true = predictions_df['y_true']
y_prob = predictions_df['y_score']
y_pred = predictions_df['y_pred']

print("Seed is:", seed)
# fixed_threshold_num_of_posts(user2pred, labeled_users, output_path, dataset, test_ratio=0.2, random_state=seed)
# relational_threshold(user2pred, labeled_users, output_path, dataset, test_ratio=0.2, random_state=seed)
# dynamic_threshold_hs_score(user2pred, labeled_users, output_path, test_ratio=0.2, random_state=seed)

[32m2023-02-13 00:39:22,830 - INFO     - user_level_simple_models - executing dataset gab...[0m
Seed is: 3238357237


# Fixed Threshold Method

In [19]:
def fixed_threshold_method(X: pd.DataFrame, y: pd.Series, post_threshold=0.5, test_ratio=0.2, random_state=None, min_post_th=1, max_post_th=300):
    
    y_train, y_test = train_test_split(y, test_size=0.2, random_state=random_state, stratify=y)
    X_train = X[X['user_id'].isin(y_train.index)]
    X_test = X[X['user_id'].isin(y_test.index)]
    print(f'Train Percent HS Users: {y_train.mean()}')
    print(f'Test Percent HS Users: {y_test.mean()}')
    
    args = [post_threshold]
    train_hs_count_df = X_train.groupby('user_id').predictions.agg(get_hs_count, *args)
    min_num_of_posts_thresholds = range(max(min_post_th, train_hs_count_df.min()), min(max_post_th, train_hs_count_df.max())+1)
    
    train_preds = np.expand_dims(train_hs_count_df, axis=1) >= min_num_of_posts_thresholds
    train_f1_scores = [f1_score(y_train, p) for p in train_preds.T]
    best_f1_train, best_th = np.max(train_f1_scores), min_num_of_posts_thresholds[np.argmax(train_f1_scores)]
    
    test_hs_count_df = X_test.groupby('user_id').predictions.agg(get_hs_count, *args)
    test_preds = test_hs_count_df >= best_th
    test_f1_score = f1_score(y_test, test_preds)
    
    return best_th, best_f1_train, test_f1_score

In [20]:
post_threshold = optimal_threshold(y_true, y_prob)
fixed_threshold_method(X, y, post_threshold=post_threshold, test_ratio=0.2)

Train Percent HS Users: 0.2475
Test Percent HS Users: 0.25


(8, 0.4016309887869521, 0.388663967611336)

# Relational Threshold Method

In [29]:
train_df

Unnamed: 0,user_id,predictions
0,35105,0.269087
1,35105,0.031074
2,35105,0.769190
3,35105,0.026870
4,35105,0.068792
...,...,...
19840422,123832,0.189174
19840423,123832,0.299337
19840424,123832,0.504460
19840425,123832,0.231645


In [31]:
filtered_mentions_df

Unnamed: 0,source,dest,weight
0,1175,4735,90
1,1175,491,236
2,1175,31,117
3,1175,1,118
4,1175,341,79
...,...,...,...
68002,373817,8367,5
68003,373817,19632,12
68004,373817,276224,13
68005,379860,36481,8


In [33]:
user2pred.groupby('user_id').predictions.agg(get_hs_count, *args).rename('hs_count')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd49c3cf370>

In [57]:
user_hs_count = user2pred.query('`user_id` in @y.index').groupby('user_id').predictions.agg(get_hs_count, *args).rename('hs_count')
mentions_hs_count = user2pred.query('`user_id` in @filtered_mentions_df.source').groupby('user_id').predictions.agg(get_hs_count, *args).rename('hs_count').reset_index()
mentioned_by_hs_count =  user2pred.query('`user_id` in @filtered_mentions_df.dest').groupby('user_id').predictions.agg(get_hs_count, *args).rename('hs_count').reset_index()

In [53]:
filtered_mentions_hs_count_df = pd.merge(
    pd.merge(filtered_mentions_df, mentions_hs_count, left_on='source', right_on='user_id', how='left'),
    mentioned_by_hs_count, left_on='dest', right_on='user_id', how='left', suffixes=('_source', '_dest')
).fillna(0).astype(int).drop(columns=['source', 'dest', 'weight'])
filtered_mentions_hs_count_df

Unnamed: 0,user_id_source,hs_count_source,user_id_dest,hs_count_dest
0,1175,1366,4735,473
1,1175,1366,491,3092
2,1175,1366,31,1319
3,1175,1366,1,188
4,1175,1366,341,569
...,...,...,...,...
68002,373817,55,8367,3939
68003,373817,55,19632,1696
68004,373817,55,276224,224
68005,379860,1,36481,161


In [71]:
network_dir = f"hate_networks/outputs/{dataset.split('_')[0]}_networks/network_data/"
edges_dir = os.path.join(network_dir, "edges")
mentions_df = pd.read_csv(os.path.join(edges_dir, "data_users_mention_edges_df.tsv"), sep='\t')
retweets_df = pd.read_csv(os.path.join(edges_dir, "data_users_retweet_edges_df.tsv"), sep='\t')

In [72]:
def relational_threshold_method(X: pd.DataFrame, y: pd.DataFrame, min_mention_threshold=3, test_ratio=0.2, post_threshold=0.5, random_state=None):
    """
    Here we consider the assumption that relation to followers/followees effect the users' behaviour.
    For each user - get his average HS score, and the average HS scores of his followers and followees.
    then search for the optimal relational threshold to yield the best f1-score.
    This threshold will be combined from a self-TH + followers-TH + followees-TH.

    :param user2pred:
    :param labeled_users:
    :return:
    """

    min_post_th, max_post_th = 1, 300

    y_train, y_test = train_test_split(y, test_size=test_ratio, random_state=random_state, stratify=y)
    train_df = X[X['user_id'].isin(y_train.index)]
    test_df = X[X['user_id'].isin(y_test.index)]
    # keep only mentions above the minimal threshold
    filtered_mentions_df = mentions_df[mentions_df["weight"] >= min_mention_threshold].reset_index(drop=True)
    mentions_dict = {}  # users mentioned by the observed user
    mentioned_by_dict = {}  # users mentioning the observed user
    # mentions_df = mentions_df[(mentions_df['source'].isin(y.index)) | (mentions_df['dest'].isin(y.index))]
    # retweets_df = retweets_df[(retweets_df['source'].isin(y.index)) | (retweets_df['dest'].isin(y.index))]
    # mentions_dict = filtered_mentions_df.groupby('source')['dest'].apply(list) #.to_dict()
    # mentioned_by_dict = filtered_mentions_df.groupby('dest')['source'].apply(list) #.to_dict()

    args = [post_threshold]
    user_hs_count = user2pred.query('`user_id` in @y.index').groupby('user_id').predictions.agg(get_hs_count, *args).rename('hs_count')
    mentions_hs_count = user2pred.query('`user_id` in @filtered_mentions_df.source').groupby('user_id').predictions.agg(get_hs_count, *args).rename('hs_count').reset_index()
    mentioned_by_hs_count =  user2pred.query('`user_id` in @filtered_mentions_df.dest').groupby('user_id').predictions.agg(get_hs_count, *args).rename('hs_count').reset_index()

    filtered_mentions_hs_count_df = pd.merge(
        pd.merge(filtered_mentions_df, mentions_hs_count, left_on='source', right_on='user_id', how='left'),
        mentioned_by_hs_count, left_on='dest', right_on='user_id', how='left', suffixes=('_source', '_dest')
    ).fillna(0).astype(int).drop(columns=['source', 'dest', 'weight'])
    # filtered_mentions_hs_count_df

    following_hs_df = filtered_mentions_hs_count_df.groupby('user_id_source').agg({'hs_count_dest': ['mean', 'count', 'median']})
    following_hs_df.columns = [f'following_hs_{x[1]}' for x in following_hs_df.columns.to_flat_index()]
    followers_hs_df = filtered_mentions_hs_count_df.groupby('user_id_dest').agg({'hs_count_source': ['mean', 'count', 'median']})
    followers_hs_df.columns = [f'followers_hs_{x[1]}' for x in followers_hs_df.columns.to_flat_index()]

    followees_mean_hs_count_df =  filtered_mentions_hs_count_df.groupby('user_id_source')['hs_count_dest'].mean().rename('following_mean_hs_count')
    followers_mean_hs_count_df = filtered_mentions_hs_count_df.groupby('user_id_dest')['hs_count_source'].mean().rename('followers_mean_hs_count')

    user_hs_count_followees_followers_mean_hs_count = pd.merge(
        pd.merge(
            user_hs_count.rename('hs_count'), following_hs_df, left_index=True, right_index=True, how='left'
        ), followers_hs_df, left_index=True, right_index=True, how='left'
    ).fillna(0)#.astype(int) #.sum(axis=1)
    # user_hs_count_followees_followers_mean_hs_count

    # user_hs_count_followees_followers_mean_hs_count.sum(axis=1).mean().astype(int)

    cols = ['hs_count', 'following_hs_mean', 'followers_hs_mean']
    X_train = user_hs_count_followees_followers_mean_hs_count.loc[y_train.index, cols]
    X_test = user_hs_count_followees_followers_mean_hs_count.loc[y_test.index, cols]

    def get_relational_model_preds(X, y, self_weight, followers_weight, following_weight, threshold):
        preds = np.dot(X, [self_weight, followers_weight, following_weight]) >= threshold
        return f1_score(y, preds)

    def objective(trial, X, y):
        self_weight = trial.suggest_float('self_weight', 0, 1)
        # followers_weight = trial.suggest_discrete_uniform('followers_weight', 0, 1, 0.05)
        followers_weight = trial.suggest_float('followers_weight', 0, 1)
        following_weight = trial.suggest_float('following_weight', 0, 1)
        # threshold = trial.suggest_float('threshold', 1, X_train.sum(axis=1).mean().astype(int))
        threshold = trial.suggest_float('threshold', 1, 300)
        return get_relational_model_preds(X, y, self_weight, followers_weight, following_weight, threshold)

    study = optuna.create_study(direction="maximize", sampler=sampler)  # Create a new study.
    study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=1000, show_progress_bar=True)

    test_f1 = get_relational_model_preds(X_test, y_test, **study.best_params)
    best_f1 = study.best_value
    print(study.best_params)
    print( best_f1, test_f1)

In [81]:
post_threshold = optimal_threshold(y_true, y_prob)
relational_threshold_method(X, y, post_threshold=0.5, min_mention_threshold=3)

100%|██████████| 1000/1000 [00:20<00:00, 49.35it/s]

{'self_weight': 0.9413367253455346, 'followers_weight': 0.5802410616285082, 'following_weight': 0.0018075924492777506, 'threshold': 10.743408265861822}
0.44579780755176607 0.4307692307692308





# Dynamic Threshold

In [79]:
def dynamic_threshold_method(X: pd.DataFrame, y: pd.DataFrame, test_ratio=0.2, random_state=None, proba_threshold=0.5):

    hs_count_and_avg_score_per_user = X.groupby('user_id').agg(
        avg_hs_score=("predictions", "mean"),
        hs_count=("predictions", lambda p: get_hs_count(p, proba_threshold)))

    y_train, y_test = train_test_split(y, test_size=test_ratio, random_state=random_state, stratify=y)
    X_train = hs_count_and_avg_score_per_user.loc[y_train.index]
    X_test = hs_count_and_avg_score_per_user.loc[y_test.index]
    print(f'Train Percent HS Users: {y_train.mean()}')
    print(f'Test Percent HS Users: {y_test.mean()}')

    def calc_soft_threshold(arr, lower_bound, higher_bound, low_th, medium_th, high_th):
        return arr[:, 1] >= np.where(arr[:, 0] < lower_bound, high_th, np.where(arr[:, 0] < higher_bound, medium_th, low_th))

    def objective(trial):
        lower_bound = trial.suggest_float("lower_bound", 0.01, 0.15)
        higher_bound = trial.suggest_float("higher_bound", lower_bound + 0.01, 0.3)
        low_th = trial.suggest_int("low_th", 1, np.percentile(X_train['hs_count'].values, 20).astype(int)+1)
        medium_th = trial.suggest_int("medium_th", low_th+1, np.percentile(X_train['hs_count'].values, 40).astype(int)+1)
        high_th = trial.suggest_int("high_th", medium_th + 1, np.percentile(X_train['hs_count'].values, 60).astype(int)+1)

        c0 = float(0.01 + lower_bound - higher_bound)
        c1 = float(1 + medium_th - high_th)
        c2 = float(1 + low_th - medium_th)

        # Store the constraints as user attributes so that they can be restored after optimization.
        trial.set_user_attr("constraint", (c0, c1, c2))

        y_pred = calc_soft_threshold(X_train.values, lower_bound, higher_bound, low_th, medium_th, high_th)

        f1 = f1_score(y_train, y_pred)
        return f1

    def constraints(trial):
        return trial.user_attrs["constraint"]

    sampler = optuna.samplers.NSGAIISampler(
        constraints_func=constraints
    )
    study = optuna.create_study(
        direction='maximize',
        sampler=sampler,
    )
    study.optimize(objective, n_trials=1000, show_progress_bar=True)
    best_f1 = study.best_value
    logger.info(f"Max f1-score: {best_f1}")

    # y_true = test_g_df["label"].values
    y_pred = calc_soft_threshold(X_test.values, **study.best_params)
    print(study.best_params)
    print(best_f1, f1_score(y_test, y_pred))

In [80]:
dynamic_threshold_method(X, y)

Train Percent HS Users: 0.2475
Test Percent HS Users: 0.25


100%|██████████| 1000/1000 [00:05<00:00, 173.33it/s]

[32m2023-02-13 01:21:51,409 - INFO     - user_level_simple_models - Max f1-score: 0.5355371900826447[0m
{'lower_bound': 0.1384709495203148, 'higher_bound': 0.20958431397356525, 'low_th': 3, 'medium_th': 12, 'high_th': 20}
0.5355371900826447 0.5228758169934641



