In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import tqdm
from scipy.stats import pearsonr as pcorr
import itertools
import re
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing

# from src.features import metric_exploration

In [2]:
# Root directory of the SOTA checkout; the following cell changes the working directory to it.
PATH_ROOT = "/home/shaul/workspace/GitHub/SOTA/"

In [3]:
# IPython automagic (not plain Python): change the working directory to PATH_ROOT.
cd {PATH_ROOT}

/home/shaul/workspace/GitHub/SOTA


In [4]:
# Load the combined dataset; the first CSV column is used as the row index.
# The path is built from PATH_ROOT so the checkout location is configured in one place only.
df = pd.read_csv(os.path.join(PATH_ROOT, 'data/combined/with_annotators/combined_dataset.csv'), index_col=0)

In [5]:
# get_corr filters on df.annotator, so back-fill that column from the older export when it is missing.
if "annotator" not in df.columns:
    # Older version of the combined dataset with the annotator
    df2 = pd.read_csv(os.path.join(PATH_ROOT, 'data/combined_dataset.csv.1'), index_col=0)
    # Column assignment aligns on the row index, so both files must share the same index.
    df['annotator'] = df2.annotator

In [7]:
# Columns of the combined dataset that are NOT similarity metrics; every other column is treated as a metric.
non_metric_columns = [
    'text1', 'text2', 'label', 'dataset', 'random', 'duration',
    'total_seconds', 'pair_id', 'reduced_label', 'annotator',
]

def get_corr(df: pd.DataFrame, bad_annotator: list) -> dict:
    '''
    Correlate every metric column with the human labels, optionally after dropping "bad annotators".

    Rows are first averaged per (pair_id, dataset, random), so each text pair contributes one
    row to the correlations regardless of how many annotators labelled it.

    parameters:
        df -- {pd.DataFrame} -- combined dataset; any column not listed in the module-level
                                non_metric_columns is treated as a metric
        bad_annotator -- {list} -- annotator ID's to filter out; None or an empty list disables
                                   the filtering. When filtering is active, pairs left with
                                   fewer than two annotators are removed as well

    return:
        {dict} with the keys
            'label_by_dataset'          -- {pd.DataFrame} -- metric vs. label, one row per dataset
            'reduced_label_by_dataset'  -- {pd.DataFrame} -- metric vs. reduced label, one row per dataset
            'label_by_random'           -- {pd.DataFrame} -- metric vs. label, one row per value of 'random'
            'reduced_label_by_random'   -- {pd.DataFrame} -- metric vs. reduced label, one row per value of 'random'
            'label_by_combined'         -- {pd.Series}    -- metric vs. label over all pairs
            'reduced_label_by_combined' -- {pd.Series}    -- metric vs. reduced label over all pairs
    '''

    if bad_annotator:
        keep_rows = ~df.annotator.isin(bad_annotator)
        df = df[keep_rows]
        # A pair with a single remaining annotator is dropped entirely
        df = df.groupby('pair_id').filter(lambda pair: pair.annotator.count() >= 2)

    metrics = [col for col in df.columns if col not in non_metric_columns]

    # One row per text pair: average the metrics and both labels over the annotators
    averaged = (
        df.groupby(['pair_id', 'dataset', 'random'])[metrics + ['label', 'reduced_label']]
        .mean()
        .reset_index()
    )

    def corr_table(group_col, target):
        # Correlation of each metric with `target`, computed separately inside every group of `group_col`
        per_group = {
            name: group[metrics].corrwith(group[target])
            for name, group in averaged.groupby(group_col)
        }
        return pd.DataFrame.from_dict(per_group).T

    return {
        'label_by_dataset': corr_table('dataset', 'label'),
        'reduced_label_by_dataset': corr_table('dataset', 'reduced_label'),
        'label_by_random': corr_table('random', 'label'),
        'reduced_label_by_random': corr_table('random', 'reduced_label'),
        'label_by_combined': pd.Series(averaged[metrics].corrwith(averaged['label'])),
        'reduced_label_by_combined': pd.Series(averaged[metrics].corrwith(averaged['reduced_label'])),
    }

In [8]:
# Baseline correlations: passing None skips the annotator filtering inside get_corr.
test = get_corr(df,None)

In [9]:
# Display the correlation of each metric with the human label, one row per dataset.
test['label_by_dataset']

Unnamed: 0,bleu,bleu1,glove_cosine,fasttext_cosine,BertScore,chrfScore,POS Dist score,1-gram_overlap,ROUGE-1,ROUGE-2,ROUGE-l
bible_human,0.301781,0.361418,-0.258113,-0.30352,0.343436,0.34991,-0.249164,0.343448,0.359654,0.341609,0.362266
bible_random_human,0.018227,0.081158,-0.058559,-0.0743,0.068136,0.045148,-0.036966,0.049009,0.048639,0.028168,0.059865
gyafc_formal_human,0.269344,0.345601,-0.263208,-0.261788,0.372827,0.315667,-0.161177,0.33523,0.334441,0.290246,0.336569
gyafc_formal_random_human,0.087416,0.064611,-0.064726,-0.059007,-0.019708,0.116585,0.007046,0.040632,0.054774,0.036777,0.054641
gyafc_informal_human,0.105411,0.252342,-0.137173,-0.145949,0.272675,0.237491,-0.138669,0.258827,0.242551,0.194514,0.25964
gyafc_informal_random_human,0.119732,0.052296,0.012553,-0.01048,0.015643,0.067276,-0.035363,0.049867,0.04488,0.070819,0.041311
gyafc_rewrites_human,0.257822,0.352715,-0.256443,-0.248679,0.397966,0.361496,-0.211251,0.33594,0.350455,0.30524,0.372787
gyafc_rewrites_random_human,0.006209,0.000246,-0.015786,-0.039174,0.053218,0.012654,0.03268,-0.007742,-0.008345,0.00394,-0.016364
paralex_human,0.146934,0.230694,-0.206681,-0.211279,0.352156,0.251824,-0.045839,0.222752,0.236598,0.169609,0.24192
paralex_random_human,0.036077,0.047361,0.000411,-0.022915,0.01493,0.048666,-0.023046,0.059672,0.057712,0.070155,0.06078


In [10]:
# Annotator IDs to exclude, one per line.
# Opened read-only ('r' instead of 'r+'): the file is only read here, so write access is not required.
with open(os.path.join(PATH_ROOT, 'data/other/ba_all.txt'), 'r') as f:
    list_ba = f.read().splitlines()

In [11]:
# Correlations with every annotator kept (baseline) vs. with the annotators listed in list_ba removed.
dict_baseline = get_corr(df,None)
dict_filtered = get_corr(df,list_ba)

In [12]:
def compare_correlations(dict_baseline, dict_filtered):
    '''
    Subtract the baseline correlations from the filtered correlations, entry by entry.

    parameters:
        dict_baseline -- {dict} -- results for the unfiltered data; its keys drive the comparison
        dict_filtered -- {dict} -- results for the filtered data; must contain every key of dict_baseline

    return:
        {dict} -- for each key of dict_baseline, dict_filtered[key] - dict_baseline[key]
    '''
    return {
        name: dict_filtered[name] - dict_baseline[name]
        for name in dict_baseline.keys()
    }

In [13]:
# Change in every correlation caused by the annotator filtering (filtered minus baseline).
ab_dict = compare_correlations(dict_baseline,dict_filtered)

In [14]:
# Display the change in the reduced-label correlations, one row per value of 'random'.
ab_dict['reduced_label_by_random']

Unnamed: 0,bleu,bleu1,glove_cosine,fasttext_cosine,BertScore,chrfScore,POS Dist score,1-gram_overlap,ROUGE-1,ROUGE-2,ROUGE-l
0,-0.008881,0.007736,-0.018534,-0.037464,-0.003839,0.000465,0.030275,-0.004488,0.00318,0.007254,0.007926
1,-0.051604,-0.043984,0.034127,0.045167,0.006353,-0.050304,0.005076,-0.048799,-0.038395,-0.049176,-0.030126


### Look at the Non-Linear and Linear Models

In [17]:
# Every column that is not bookkeeping (see non_metric_columns) is a metric.
metrics = [col for col in df.columns if col not in non_metric_columns]
all_labels = metrics + ['label', 'reduced_label']

# Collapse to one row per text pair by averaging the metrics and both labels.
# NOTE: this rebinds df, so the per-annotator rows are no longer available in later cells.
df = (
    df.groupby(['pair_id', 'dataset', 'random'])[all_labels]
    .mean()
    .reset_index()
)

In [19]:
# Feature matrix: everything except the grouping keys and the two label columns.
data = df.drop(columns=["label","pair_id","dataset","random","reduced_label"]).copy()

# Targets: the human label and its reduced version.
labels, labels_reduced = df["label"], df['reduced_label']

column_names = data.columns.tolist()
x = data.values  # plain numpy array of the feature values

# Rescale every feature column to the range [-1, 1], then rebuild a frame with the original column names.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
x_scaled = min_max_scaler.fit_transform(x)
data = pd.DataFrame(x_scaled, columns=column_names)


In [None]:
def run_RF(df: pd.DataFrame, metrics: list, random_state=None):
    '''
    Fit shallow random-forest regressors that predict the human label and the reduced label
    from the metric scores, recording the test-set mean squared error of each model and the
    feature importances of the label model.

    parameters:
        df -- {pd.DataFrame} -- combined dataset; must contain 'pair_id', 'dataset', 'random',
                                'label', 'reduced_label' and every column named in metrics
        metrics -- {list} -- names of the metric columns used as features
        random_state -- {int or None} -- seed forwarded to both train/test splits and both
                                         forests; None (the default) leaves the runs unseeded.
                                         With a seed, both targets use the same split

    NOTE(review): the visible part of this cell has no return statement; the results are
    collected in the local dicts `scores` and `feature_importance_df`.
    '''

    all_labels = metrics + ['label'] + ['reduced_label']

    # One row per text pair: average the metrics and both labels
    df = df.groupby(['pair_id','dataset','random'])[all_labels].mean().reset_index()
    # Fixed: 'pair_id' and 'dataset' were fused into one mis-quoted string, which made drop() fail
    data = df.drop(['label','pair_id','dataset','random','reduced_label'], axis=1).copy()

    #TODO: This hasn't been written to explore the scores grouped by dataset and random

    labels = df["label"]
    labels_reduced = df['reduced_label']

    column_names = list(data.columns)
    x = data.values #returns a numpy array

    #scale the data values
    # NOTE(review): the scaler is fit on all rows, i.e. before the train/test split
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    x_scaled = min_max_scaler.fit_transform(x)
    data = pd.DataFrame(x_scaled, columns=column_names)

    X_train, X_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.2, random_state=random_state)
    X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(
        data, labels_reduced, test_size=0.2, random_state=random_state)

    scores = dict()
    # Fixed: this container was assigned into below without ever being created
    feature_importance_df = dict()

    #Get the score from the models

    model1 = RandomForestRegressor(max_depth=3, random_state=random_state)
    model1.fit(X_train, y_train)
    y_pred = model1.predict(X_test)
    scores['combined_RF_label'] = mean_squared_error(y_test, y_pred)

    model2 = RandomForestRegressor(max_depth=3, random_state=random_state)
    model2.fit(X_train_reduced, y_train_reduced)
    y_pred_reduced = model2.predict(X_test_reduced)
    scores['combined_RF_label_reduced'] = mean_squared_error(y_test_reduced, y_pred_reduced)

    # Importances come from the label model (model1) only
    feature_importance_dict = {'feature': data.columns.values, 'importance':model1.feature_importances_}
    feature_importance_df['combined_RF_features'] = pd.DataFrame(feature_importance_dict).sort_values('importance', ascending=False)

    # TODO(review): return scores / feature_importance_df once the per-group runs below are written
    #Repeat for each of the datasets and by random