<a href="https://colab.research.google.com/github/JesseTNRoberts/AAAI-paper-2024/blob/main/Structural_Priming_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from tqdm import tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from pandas._libs.lib import is_timedelta_or_timedelta64_array
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import ast
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive at /content/drive so the result CSVs stored there can be read.
drive.mount('/content/drive')

# Root folder of the structural-priming data; CSV paths are built by appending to it.
loc = "/content/drive/MyDrive/Data/Structural Priming/"

Mounted at /content/drive


In [None]:
!pip install nbformat --upgrade
!pip install seaborn --upgrade




In [None]:
# helper functions

def undo_reduction(df, column, select_reduction=1):
  """Keep one component of every item stored in the cells of `column`, in place.

  Each cell of `df[column]` is iterated and every item `t` in it is replaced
  by `t[select_reduction]`, so a cell holding a list of tuples becomes a plain
  list of values.

  Args:
    df: DataFrame to modify in place.
    column: Name of the column whose cells are rewritten.
    select_reduction: Position to keep from each item. The original code
      labelled position 0 "sum" and position 1 "mean"; the default of 1 is the
      value that used to be hard-coded.

  Returns:
    None. `df[column]` is overwritten with the extracted lists.
  """
  # Only one column is read, so map over that column directly instead of
  # applying a row-wise function across the whole frame.
  df[column] = df[column].map(
    lambda cell: [item[select_reduction] for item in cell])

def str_num(df, column):
  """Parse every cell of `column` from its string form into a Python literal.

  The column is overwritten in place with the values produced by
  `ast.literal_eval`; nothing is returned.
  """
  parsed_cells = df[column].map(ast.literal_eval)
  df[column] = parsed_cells

def explode_and_label(df, cols):
  """Explode the list-valued `cols` into rows and label each resulting row.

  Two columns are added to the exploded frame:
    'member id'  - 1-based position of the row among the rows that share its
                   index label.
    'experiment' - the row's index label plus one.

  The input frame is not modified; the exploded, labelled frame is returned.
  """
  exploded = df.explode(cols)
  # Rows coming from the same source row keep that row's index label, so
  # counting within each label numbers the members of one experiment.
  position_in_group = exploded.groupby(level=0).cumcount()
  exploded['member id'] = position_in_group + 1
  exploded['experiment'] = exploded.index.values + 1
  return exploded

In [None]:
# Read in all data for all models

models = ['bert-base-uncased', 'bert-large-uncased', 'distilbert-base-uncased', 'distilgpt2',
          'gpt2-medium', 'gpt2', 'openai-gpt', 'roberta-base', 'roberta-large']

result_columns = ["tx results", "ty results", "px-tx results", "px-ty results", "py-tx results", "py-ty results"]

# One dict per (model kind, construction) pair, keyed by model name.
base_model_trans_df_s = {}
pop_trans_df_s = {}
base_model_dative_df_s = {}
pop_dative_df_s = {}


def _load_result_frame(experiment_dir, model):
  """Read one model's result CSV from `experiment_dir` and return it exploded.

  Args:
    experiment_dir: Sub-folder of `loc`, including the trailing slash.
    model: Model name; the file read is `<loc><experiment_dir><model>.csv`.

  Returns:
    DataFrame with one row per exploded entry of the result columns, labelled
    by `explode_and_label`.
  """
  frame = pd.read_csv(loc + experiment_dir + model + ".csv")
  # The first four columns are dropped by position.
  # NOTE(review): presumably index/metadata columns - confirm against the CSV schema.
  frame = frame.drop(frame.columns[[0, 1, 2, 3]], axis=1)
  for col in result_columns:
    # Cells are stored as strings; parse them, then keep one value per item.
    str_num(frame, col)
    undo_reduction(frame, col)
  return explode_and_label(frame, result_columns)


for model in tqdm(models):

  # The per-iteration frames stay bound to module-level names because other
  # cells may read them after the loop (e.g. `pop_trans_df`).
  base_model_trans_df = _load_result_frame("CORE_transitive_1500sampled_recreate/", model)
  base_model_trans_df_s[model] = base_model_trans_df

  pop_trans_df = _load_result_frame("CORE_transitive_1500sampled_popLM/", model)
  pop_trans_df_s[model] = pop_trans_df

  base_model_dative_df = _load_result_frame("CORE_dative_1500sampled_recreate/", model)
  base_model_dative_df_s[model] = base_model_dative_df

  pop_dative_df = _load_result_frame("CORE_dative_1500sampled_popLM/", model)
  pop_dative_df_s[model] = pop_dative_df


100%|██████████| 9/9 [11:36<00:00, 77.41s/it]


In [None]:
# Test whether structural priming is preferred and, if so, whether its effect is
# distinct (in mean or in correlation) from that of an alternate treatment.

import plotly.graph_objects as go
from scipy.stats import mannwhitneyu, wilcoxon, kstest, ttest_rel
from scipy.stats import hmean
import scipy

def conf(df, conf=0.95):
  """Student-t confidence interval for the mean of the sample `df`.

  Args:
    df: One-dimensional sample of numbers (e.g. a DataFrame column).
    conf: Confidence level of the interval; defaults to 0.95.

  Returns:
    The (lower, upper) pair from `scipy.stats.t.interval`, centred on the
    sample mean and scaled by the standard error of the mean.
  """
  n_obs = len(df)
  sample_mean = np.mean(df)
  standard_error = scipy.stats.sem(df)
  return scipy.stats.t.interval(conf,
                                df=n_obs - 1,
                                loc=sample_mean,
                                scale=standard_error)

# Columns used by this cell. Restricting the frames to them keeps the label
# columns ('member id', 'experiment') out of the element-wise sum below.
analysis_columns = ['tx results', 'px-tx results', 'py-tx results']

# One-sided paired comparisons (first column minus second), in print order.
comparisons = [('px-tx results', 'tx results'),
               ('py-tx results', 'tx results'),
               ('px-tx results', 'py-tx results')]

for model in models:

  # Element-wise sum of the transitive and dative population results.
  # Exploded columns are object dtype, so cast to float before handing the
  # data to NumPy/SciPy.
  df = (pop_trans_df_s[model][analysis_columns]
        + pop_dative_df_s[model][analysis_columns]).astype(float)

  # Largest possible rank sum for n pairs, n(n+1)/2, taken from the frame being
  # tested rather than from a variable left over from the loading loop. It is
  # not called `max`, so the builtin stays usable.
  # NOTE(review): wilcoxon's default zero_method discards zero differences, so
  # the attainable maximum is smaller whenever exact ties occur.
  n_pairs = len(df)
  max_rank_sum = n_pairs * (n_pairs + 1) / 2

  print('\n#' + model)
  print('max = ', max_rank_sum)

  # With alternative='greater' the statistic is the rank sum of the positive
  # differences, so statistic / max_rank_sum is the share of rank mass
  # favouring the first column.
  for minuend, subtrahend in comparisons:
    signed_rank = wilcoxon(df[minuend] - df[subtrahend], alternative='greater')
    print("- ", signed_rank.statistic / max_rank_sum, signed_rank.pvalue)

  # Confidence-interval bounds of each condition's mean; the ratio compares the
  # placebo shift (py-tx) to the treatment shift (px-tx), both relative to control.
  control = np.array(conf(df['tx results']))
  placebo = np.array(conf(df['py-tx results']))
  treat = np.array(conf(df['px-tx results']))
  print('\n## ----- confidence trans py-tx px-tx')
  print('- ', (placebo - control) / (treat - control))

  # Squared Pearson correlation between the two shifts, with its p-value.
  correlation = scipy.stats.pearsonr(df['px-tx results'] - df['tx results'],
                                     df['py-tx results'] - df['tx results'])
  print(correlation[0]**2, correlation[1])



max =  2812537500.0

#bert-base-uncased
-  0.273812058683662 1.0
-  0.352764803491509 1.0
-  0.3775605948365133 1.0

## ----- confidence trans py-tx px-tx
-  [0.6274854  0.63070723]
0.5376026606590216 0.0

#bert-large-uncased
-  0.4696474041679444 1.0
-  0.5560376935418638 5.614045323682645e-156
-  0.36578964920467727 1.0

## ----- confidence trans py-tx px-tx
-  [-3.53537089 -3.08856221]
0.6510125812799225 0.0

#distilbert-base-uncased
-  0.6661731608911882 0.0
-  0.5932271941263005 0.0
-  0.601153203823949 0.0

## ----- confidence trans py-tx px-tx
-  [0.54523449 0.54067282]
0.5232654479873846 0.0

#distilgpt2
-  0.9969429049390452 0.0
-  0.9950551085985521 0.0
-  0.6646819601516424 0.0

## ----- confidence trans py-tx px-tx
-  [0.9315008  0.93140543]
0.8046147590227647 0.0

#gpt2-medium
-  0.9999998339577695 0.0
-  0.9999999598227579 0.0
-  0.6416382826895641 0.0

## ----- confidence trans py-tx px-tx
-  [0.97525934 0.9752379 ]
0.8922768195949555 0.0

#gpt2
-  0.9999898685795301 0.0

In [None]:
# Evaluate each population against the base model via Kolmogorov-Smirnov test

for model in models:
  print('\n'+model+'\n')

  # Element-wise sum of the transitive and dative 'tx results', once for the
  # population models and once for the base model.
  population_tx = pop_trans_df_s[model]['tx results'] + pop_dative_df_s[model]['tx results']
  base_tx = base_model_trans_df_s[model]['tx results'] + base_model_dative_df_s[model]['tx results']

  # Two-sample Kolmogorov-Smirnov test between the two summed samples.
  ks_result = kstest(population_tx, base_tx)
  print(ks_result, '\n')


bert-base-uncased

KstestResult(statistic=0.03454666666666667, pvalue=0.05833742509704809, statistic_location=-10.202858262591892, statistic_sign=-1) 


bert-large-uncased

KstestResult(statistic=0.047746666666666715, pvalue=0.0023623477532004, statistic_location=-9.531695577833387, statistic_sign=-1) 


distilbert-base-uncased

KstestResult(statistic=0.037733333333333285, pvalue=0.029553696456558254, statistic_location=-10.007999049292671, statistic_sign=1) 


distilgpt2

KstestResult(statistic=0.4493866666666667, pvalue=1.380799007307408e-271, statistic_location=-15.60033655166626, statistic_sign=-1) 


gpt2-medium

KstestResult(statistic=0.5112933333333334, pvalue=0.0, statistic_location=-17.346627712249756, statistic_sign=-1) 


gpt2

KstestResult(statistic=0.2940533333333334, pvalue=3.350859531207987e-113, statistic_location=-16.396191596984863, statistic_sign=-1) 


openai-gpt

KstestResult(statistic=0.08375999999999995, pvalue=1.9956759460398536e-09, statistic_location=-11.0928

In [None]:
# Generate validation on alternate 1500 y data points

import plotly.graph_objects as go
from scipy.stats import mannwhitneyu, wilcoxon, kstest, ttest_rel
from scipy.stats import hmean
import scipy

def conf(df, conf=0.95):
  """Student-t confidence interval for the mean of the sample `df`.

  Args:
    df: One-dimensional sample of numbers (e.g. a DataFrame column).
    conf: Confidence level of the interval; defaults to 0.95.

  Returns:
    The (lower, upper) pair from `scipy.stats.t.interval`, centred on the
    sample mean and scaled by the standard error of the mean.
  """
  n_obs = len(df)
  sample_mean = np.mean(df)
  standard_error = scipy.stats.sem(df)
  return scipy.stats.t.interval(conf,
                                df=n_obs - 1,
                                loc=sample_mean,
                                scale=standard_error)

# Columns used by this cell. Restricting the frames to them keeps the label
# columns ('member id', 'experiment') out of the element-wise sum below.
analysis_columns = ['ty results', 'px-ty results', 'py-ty results']

# One-sided paired comparisons (first column minus second), in print order.
comparisons = [('py-ty results', 'ty results'),
               ('px-ty results', 'ty results'),
               ('py-ty results', 'px-ty results')]

for model in models:

  # Element-wise sum of the transitive and dative population results.
  # Exploded columns are object dtype, so cast to float before handing the
  # data to NumPy/SciPy.
  df = (pop_trans_df_s[model][analysis_columns]
        + pop_dative_df_s[model][analysis_columns]).astype(float)

  # Largest possible rank sum for n pairs, n(n+1)/2, taken from the frame being
  # tested rather than from a variable left over from the loading loop. It is
  # not called `max`, so the builtin stays usable.
  # NOTE(review): wilcoxon's default zero_method discards zero differences, so
  # the attainable maximum is smaller whenever exact ties occur.
  n_pairs = len(df)
  max_rank_sum = n_pairs * (n_pairs + 1) / 2

  print('\n#' + model)
  print('max = ', max_rank_sum)

  # With alternative='greater' the statistic is the rank sum of the positive
  # differences, so statistic / max_rank_sum is the share of rank mass
  # favouring the first column.
  for minuend, subtrahend in comparisons:
    signed_rank = wilcoxon(df[minuend] - df[subtrahend], alternative='greater')
    print("- ", signed_rank.statistic / max_rank_sum, signed_rank.pvalue)

  # Confidence-interval bounds of each condition's mean; the ratio compares the
  # placebo shift (px-ty) to the treatment shift (py-ty), both relative to control.
  control = np.array(conf(df['ty results']))
  placebo = np.array(conf(df['px-ty results']))
  treat = np.array(conf(df['py-ty results']))
  print('\n## ----- confidence trans px-ty py-ty')
  print('- ', (placebo - control) / (treat - control))

  # Squared Pearson correlation between the two shifts, with its p-value.
  correlation = scipy.stats.pearsonr(df['py-ty results'] - df['ty results'],
                                     df['px-ty results'] - df['ty results'])
  print(correlation[0]**2, correlation[1])




max =  2812537500.0

#bert-base-uncased
-  0.40885031168473307 1.0
-  0.4565637382612676 1.0
-  0.43248545041621667 1.0

## ----- confidence trans px-ty py-ty
-  [0.41560044 0.42214692]
0.49178266134255183 0.0

#bert-large-uncased
-  0.43254453549508226 1.0
-  0.5610080459371652 1.943746182933203e-184
-  0.32222314102478633 1.0

## ----- confidence trans px-ty py-ty
-  [-1.18515437 -1.09981606]
0.5858795964024965 0.0

#distilbert-base-uncased
-  0.8550128837393279 0.0
-  0.8337512776274094 0.0
-  0.5578891840553237 2.6869636151108007e-166

## ----- confidence trans px-ty py-ty
-  [0.8950765 0.8959155]
0.5195603219162974 0.0

#distilgpt2
-  0.9996895769389742 0.0
-  0.9992065124465007 0.0
-  0.7403876342982093 0.0

## ----- confidence trans px-ty py-ty
-  [0.90990689 0.90994315]
0.7837810441958102 0.0

#gpt2-medium
-  0.9999999960889411 0.0
-  0.9999998784016213 0.0
-  0.843592387479278 0.0

## ----- confidence trans px-ty py-ty
-  [0.93590823 0.93595146]
0.8685510235078653 0.0

#gpt2
-