In [None]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; presumably this is where the
# notebook's pickles under HOME_PATH live (confirm against the working dir).
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Directory prefix used for every pickle this notebook reads or writes.
# NOTE(review): the path is relative, so it only points into the Drive mounted
# at /content/gdrive while the working directory is /content — confirm.
HOME_PATH = "./gdrive/MyDrive/magisterka/"

In [None]:
%%capture
!pip install pickle5

# 1. Prepare data

In [None]:
import pickle5 as pickle
import pandas as pd
from os.path import exists

def read_embeddings():
  """Load the comment embeddings with one column per embedding dimension.

  If the cache file ``df_embeddings_splitted.pkl`` exists under ``HOME_PATH``
  it is returned as is. Otherwise ``df_comments_embeddings.pkl`` is loaded,
  its ``comment`` column (renamed to ``embedding``) is expanded into the
  columns ``0..99`` next to ``rev_id``, and the result is written to the
  cache file before being returned.

  Returns:
    DataFrame with ``rev_id`` followed by the embedding columns ``0..99``.
  """
  cache_path = f"{HOME_PATH}df_embeddings_splitted.pkl"
  if exists(cache_path):
    return pd.read_pickle(cache_path)

  # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
  with open(f"{HOME_PATH}df_comments_embeddings.pkl", 'rb') as f:
    df_embedding = pickle.load(f)

  df_embedding = df_embedding.rename(columns={'comment': 'embedding'})[['rev_id', 'embedding']]

  # `join` aligns on the index, so the expanded frame must carry the same index
  # as the source rows; a default RangeIndex would mis-pair embeddings with
  # rev_ids whenever the loaded frame is not indexed 0..n-1.
  df_components = pd.DataFrame(df_embedding['embedding'].to_list(),
                               columns=list(range(100)),
                               index=df_embedding.index)
  df_embedding_splited = df_embedding[['rev_id']].join(df_components)
  df_embedding_splited.to_pickle(cache_path)
  return df_embedding_splited


def read_annotation_comment(n_clusters, name, variant, algo):
  """Read one split's annotations and clustered comments, merged on ``rev_id``.

  Args:
    n_clusters: cluster count used in the comments file name.
    name: split name used in both file names (callers pass "dev", "train", "test").
    variant: clustering variant used in the comments file name.
    algo: algorithm infix of the comments file name ("" or e.g. "-hdbscan").

  Returns:
    Inner merge of the annotation frame with the comment frame on ``rev_id``.
  """
  annotations_path = f"{HOME_PATH}df_anno_{name}.pkl"
  comments_path = f"{HOME_PATH}df-comments-clusters{algo}-{variant}-{n_clusters}-{name}.pkl"

  annotations = pd.read_pickle(annotations_path)
  comments = pd.read_pickle(comments_path)
  return annotations.merge(comments, on="rev_id")


In [None]:
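# METRICS
# Output columns of every metric: worker_id, then one column per topic id (e.g. 0, 1, ..., 13);
# metric 4 returns two columns per topic (mean and std).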

# metric 1
def avg_agression_per_topic(df_annotations_comments):
  """Mean ``aggression`` of every worker per topic, in wide form.

  Returns a frame with a ``worker_id`` column followed by one column per
  topic; worker/topic pairs without annotations are filled with 0.5.
  """
  per_worker_topic = pd.pivot_table(
      df_annotations_comments,
      values="aggression",
      index="worker_id",
      columns="topic",
      aggfunc="mean",
      fill_value=0.5,
  )
  # Apply .stack() to the pivot instead if a row-wise (long) layout is needed.
  return per_worker_topic.reset_index()

# metric 2
def avg_agression_score_per_topic(df_annotations_comments):
  """Mean ``aggression_score`` of every worker per topic, in wide form.

  Returns a frame with a ``worker_id`` column followed by one column per
  topic; worker/topic pairs without annotations are filled with 0.
  """
  pivot_kwargs = dict(
      values="aggression_score",
      index="worker_id",
      columns="topic",
      aggfunc="mean",
      fill_value=0,
  )
  per_worker_topic = pd.pivot_table(df_annotations_comments, **pivot_kwargs)
  return per_worker_topic.reset_index()

# metric 3
def avg_agression_score_per_topic_normalized(df_annotations_comments):
  """Mean ``aggression_score`` per worker and topic, min-max scaled per topic.

  The wide frame (``worker_id`` plus one column per topic, missing pairs
  filled with 0) is built first; every topic column is then rescaled to
  [0, 1] across workers. A topic column whose values are all equal has no
  spread to scale by and is mapped to 0.0 instead of the NaN that a
  division by zero would produce.

  Returns:
    DataFrame with ``worker_id`` followed by the scaled topic columns.
  """
  df = df_annotations_comments.pivot_table(index='worker_id',
                                      columns='topic',
                                      values="aggression_score",
                                      aggfunc='mean',
                                      fill_value=0)\
                              .reset_index()

  def _min_max(col):
    low = col.min()
    span = col.max() - low
    if span == 0:
      # Constant column: avoid 0/0 and return an all-zero float column.
      return col * 0.0
    return (col - low) / span

  topic_cols = list(df.columns[1:])
  # Column-wise replacement (not an in-place iloc write) so the float result
  # never has to be squeezed into an integer-typed column.
  df[topic_cols] = df[topic_cols].apply(_min_max, axis=0)
  return df

# metric 4
def mean_std_agression_score_per_topic(df_annotations_comments):
  """Per-worker mean and standard deviation of ``aggression_score`` per topic.

  Two wide pivots (mean and std, missing cells filled with 0) are built and
  inner-merged on ``worker_id``; the mean columns come first, followed by the
  std columns, with pandas' default merge suffixes on clashing names.
  """
  def _pivot(aggfunc):
    # One row per worker, one column per topic, for the given aggregation.
    wide = df_annotations_comments.pivot_table(index='worker_id',
                                               columns='topic',
                                               values="aggression_score",
                                               aggfunc=aggfunc,
                                               fill_value=0)
    return wide.reset_index()

  # NOTE(review): a worker whose std is undefined for every topic may be
  # missing from the std pivot and thus from this inner merge — confirm.
  return pd.merge(_pivot('mean'), _pivot('std'), on="worker_id")


In [None]:

def join_metric_to_annotation(n_clusters, metric_func, variant, algo):
  """Compute worker metrics on the dev split and attach them to train and test.

  Args:
    n_clusters: cluster count; also the number of integer topic columns
      (0..n_clusters-1) renamed to ``topic_<i>``.
    metric_func: callable taking the dev annotation/comment frame and
      returning one row per ``worker_id``.
    variant, algo: forwarded to ``read_annotation_comment``.

  Returns:
    ``(train, test)`` frames, each an inner merge of the split with the
    worker metrics on ``worker_id``.
  """
  dev_frame = read_annotation_comment(n_clusters, "dev", variant, algo)
  worker_metrics = metric_func(dev_frame)
  topic_names = {topic: f"topic_{topic}" for topic in range(n_clusters)}
  worker_metrics.rename(columns=topic_names, inplace=True)

  def attach_metrics(split_name):
    # Inner merge: only annotations of workers present in the metrics remain.
    split_frame = read_annotation_comment(n_clusters, split_name, variant, algo)
    return pd.merge(split_frame, worker_metrics, on="worker_id")

  train_with_metrics = attach_metrics("train")
  test_with_metrics = attach_metrics("test")
  return train_with_metrics, test_with_metrics

def join_embedding(df):
  """Return ``df`` inner-merged with the embedding columns on ``rev_id``."""
  embeddings = read_embeddings()
  return df.merge(embeddings, on="rev_id")

def prepare_data(n_clusters, metric_func, variant, is_baseline=False, algo=""):
  """Build the train/test feature matrices and label vectors.

  Worker metrics are joined to the train and test annotations, embeddings are
  merged in, and features/labels are then taken by column position.

  Args:
    n_clusters, metric_func, variant, algo: forwarded to
      ``join_metric_to_annotation``.
    is_baseline: when True the feature slice starts ``n_clusters`` columns
      later — presumably to leave out the per-topic worker metrics (confirm).

  Returns:
    ``X_train, y_train, X_test, y_test`` as NumPy arrays.
  """
  train_joined, test_joined = join_metric_to_annotation(n_clusters, metric_func, variant, algo)
  train_frame = join_embedding(train_joined)
  test_frame = join_embedding(test_joined)

  # NOTE(review): positional offsets assume the first 10 columns are
  # non-feature columns and that column 2 is the label — verify against the
  # schema of the pickled frames.
  first_feature_col = 10 + n_clusters if is_baseline else 10

  def to_arrays(frame):
    features = frame.iloc[:, first_feature_col:].to_numpy()
    labels = frame.iloc[:, 2].to_numpy()
    return features, labels

  X_train, y_train = to_arrays(train_frame)
  X_test, y_test = to_arrays(test_frame)
  return X_train, y_train, X_test, y_test


# 2. Modeling - logistic regression

**baseline**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Baseline run: is_baseline=True makes prepare_data start the feature slice
# n_clusters columns later, presumably leaving out the per-topic worker metrics.
X_train, y_train, X_test, y_test = prepare_data(2, avg_agression_score_per_topic, 'micro', True)
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

print(f"TRAIN: {clf.score(X_train, y_train)}")
print(f"TEST: {clf.score(X_test, y_test)}") # same as reported in the kocon2021offensive paper

# Per-class precision/recall/F1 as a frame (one row per class and average).
report = classification_report(y_test, clf.predict(X_test), output_dict=True)
df = pd.DataFrame(report).transpose()


TRAIN: 0.8547325269088633
TEST: 0.8481622036086226


In [None]:
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.859181,0.973485,0.912768,252230.0
1.0,0.713048,0.292264,0.414594,56863.0
accuracy,0.848162,0.848162,0.848162,0.848162
macro avg,0.786114,0.632874,0.663681,309093.0
weighted avg,0.832298,0.848162,0.821121,309093.0


In [None]:
# Start the result collection with the baseline report. The rows are tagged
# with model='baseline' so they are identifiable after concatenation and so
# the later string parsing of the 'model' column does not hit missing values.
df_all = df.copy()
df_all['model'] = 'baseline'

**HDBSCAN**


In [None]:
# Fit one logistic regression per worker-metric variant on the HDBSCAN
# clustering and append each classification report to df_all.
k = 3
algo = "-hdbscan"
metric_funcs = [
    avg_agression_per_topic,
    avg_agression_score_per_topic,
    avg_agression_score_per_topic_normalized,
    mean_std_agression_score_per_topic,
]
reports = []
for func in metric_funcs:
  X_train, y_train, X_test, y_test = prepare_data(k, func, 'micro', False, algo)
  clf = LogisticRegression(max_iter=1000)
  clf.fit(X_train, y_train)
  print(f"K = {k}, {func.__name__}")
  print(f"TEST: {clf.score(X_test, y_test):.4}")

  report = classification_report(y_test, clf.predict(X_test), output_dict=True)
  df = pd.DataFrame(report).T
  df['model'] = f'lr-hdbscan-micro-{k}-{func.__name__}'
  reports.append(df)

# Single concatenation after the loop; row order matches per-iteration appends.
df_all = pd.concat([df_all, *reports])


K = 3, avg_agression_per_topic
TEST: 0.8561
K = 3, avg_agression_score_per_topic
TEST: 0.8511
K = 3, avg_agression_score_per_topic_normalized
TEST: 0.8511
K = 3, mean_std_agression_score_per_topic
TEST: 0.8516


**topic-based MACRO**

In [None]:
# Fit one logistic regression per (cluster count, worker metric) pair on the
# 'macro' clustering variant and append each classification report to df_all.
metric_funcs = [
    avg_agression_per_topic,
    avg_agression_score_per_topic,
    avg_agression_score_per_topic_normalized,
    mean_std_agression_score_per_topic,
]
reports = []
for k in [2, 4, 6]:
  for func in metric_funcs:
    X_train, y_train, X_test, y_test = prepare_data(k, func, 'macro', False)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    print(f"K = {k}, {func.__name__}")
    print(f"TEST: {clf.score(X_test, y_test):.4}")

    report = classification_report(y_test, clf.predict(X_test), output_dict=True)
    df = pd.DataFrame(report).T
    df['model'] = f'lr-macro-{k}-{func.__name__}'
    reports.append(df)

# Single concatenation after the loops; row order matches per-iteration appends.
df_all = pd.concat([df_all, *reports])

**topic-based MICRO**

In [None]:
# Fit one logistic regression per (cluster count, worker metric) pair on the
# 'micro' clustering variant and append each classification report to df_all.
metric_funcs = [
    avg_agression_per_topic,
    avg_agression_score_per_topic,
    avg_agression_score_per_topic_normalized,
    mean_std_agression_score_per_topic,
]
reports = []
for k in [2, 5, 13]:
  for func in metric_funcs:
    X_train, y_train, X_test, y_test = prepare_data(k, func, 'micro', False)
    clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    # Include the metric name (as the HDBSCAN and MACRO cells do); otherwise
    # the four scores printed for one K cannot be told apart.
    print(f"K = {k}, {func.__name__}")
    print(f"TEST: {clf.score(X_test, y_test):.4}")

    report = classification_report(y_test, clf.predict(X_test), output_dict=True)
    df = pd.DataFrame(report).transpose()
    df['model'] = f'lr-micro-{k}-{func.__name__}'
    reports.append(df)

# Concatenate once instead of re-copying df_all on every iteration.
df_all = pd.concat([df_all, *reports])


In [None]:
# Persist every collected classification report (baseline + all variants).
df_all.to_pickle(f"{HOME_PATH}lr_results.pkl")

In [None]:
import pandas as pd
# Reload the saved reports; presumably so the analysis cells below can be run
# without re-fitting the models (HOME_PATH must still be defined).
df_all = pd.read_pickle(f"{HOME_PATH}lr_results.pkl")

In [None]:
df_all

Unnamed: 0,precision,recall,f1-score,support,model
0.0,0.859181,0.973485,0.912768,252230.000000,baseline
1.0,0.713048,0.292264,0.414594,56863.000000,baseline
accuracy,0.848162,0.848162,0.848162,0.848162,baseline
macro avg,0.786114,0.632874,0.663681,309093.000000,baseline
weighted avg,0.832298,0.848162,0.821121,309093.000000,baseline
...,...,...,...,...,...
0.0,0.862732,0.972969,0.914541,252230.000000,lr-hdbscan-micro-3-mean_std_agression_score_pe...
1.0,0.723228,0.313314,0.437219,56863.000000,lr-hdbscan-micro-3-mean_std_agression_score_pe...
accuracy,0.851614,0.851614,0.851614,0.851614,lr-hdbscan-micro-3-mean_std_agression_score_pe...
macro avg,0.792980,0.643142,0.675880,309093.000000,lr-hdbscan-micro-3-mean_std_agression_score_pe...


In [None]:
# Long-form report table: the report row label becomes a 'metric' column and
# a coarse 'model_type' is derived from the model name.
df_lr = pd.read_pickle(f"{HOME_PATH}lr_results.pkl")
report_all = df_lr.reset_index().rename(columns={"index": "metric"})
# `n` must be passed by keyword: recent pandas accepts only `pat` positionally.
# NOTE(review): splitting on 'r-' turns every 'lr-...' name into 'l'; if 'lr'
# was intended, split on '-' instead — confirm before relying on this label.
report_all['model_type'] = report_all['model'].str.split('r-', n=2).str[0]
report_all['model_type'].unique()

array(['baseline', 'l'], dtype=object)

In [None]:
# For every model_type keep the 'macro avg' row with the highest F1 score.
is_macro_avg = report_all['metric'] == 'macro avg'
report_accuracy = report_all.loc[is_macro_avg, :]
best_rows = report_accuracy.groupby(['model_type'])['f1-score'].idxmax()
report_accuracy = report_accuracy.loc[best_rows]
report_accuracy

Unnamed: 0,metric,precision,recall,f1-score,support,model,model_type
3,macro avg,0.786114,0.632874,0.663681,309093.0,baseline,baseline
103,macro avg,0.798491,0.663789,0.698416,309093.0,lr-micro-5-avg_agression_per_topic,l


In [None]:
# Show all report rows (per class and averages) of one model.
report_all.loc[report_all['model'] == 'lr-micro-5-avg_agression_per_topic']

Unnamed: 0,metric,precision,recall,f1-score,support,model,model_type
100,0.0,0.870108,0.969682,0.917201,252230.0,lr-micro-5-avg_agression_per_topic,l
101,1.0,0.726873,0.357895,0.479631,56863.0,lr-micro-5-avg_agression_per_topic,l
102,accuracy,0.857134,0.857134,0.857134,0.857134,lr-micro-5-avg_agression_per_topic,l
103,macro avg,0.798491,0.663789,0.698416,309093.0,lr-micro-5-avg_agression_per_topic,l
104,weighted avg,0.843757,0.857134,0.836702,309093.0,lr-micro-5-avg_agression_per_topic,l


In [None]:
# Print tab-separated recall values for pasting into a LaTeX table: one line
# per clustering variant, one cell per worker metric.
for variant in ["macro-2", "macro-4", "macro-6", "micro-2", "micro-5", "micro-13"]:
    cells = []
    for func in [avg_agression_per_topic, avg_agression_score_per_topic, mean_std_agression_score_per_topic]:
      model_name = f"lr-{variant}-{func.__name__}"
      recalls = df_all.loc[df_all["model"] == model_name, "recall"]
      # Row 1 of each report is the second class row (label 1.0 in the outputs).
      # Fixed-point ':.3f' keeps trailing zeros (0.310, not 0.31) so all table
      # cells have the same number of decimals.
      cells.append(f'\t{recalls.iloc[1]:.3f}')
    print(''.join(cells))


	0.352	0.312	0.321
	0.355	0.315	0.327
	0.356	0.315	0.328
	0.354	0.31	0.323
	0.358	0.32	0.332
	0.358	0.319	0.334
	0.35	0.309	0.313
