In [4]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

import numpy as np


In [5]:
%run 02_Daten_laden.ipynb

# Functions

In [61]:
def calculateBootstrap(y_true, y_pred_base, y_pred_pe):

    
    # Sicherstellen, dass alles integer ist
    y_true = y_true.astype(int)
    y_pred_base = y_pred_base.astype(int)
    y_pred_pe = y_pred_pe.astype(int)

    # Bootstrap-Parameter
    n_iterations = 1000
    n_size = len(y_true)
    f1_diffs = []
    
    # Schritt 2: Bootstrap-Schleife
    for _ in range(n_iterations):
        idx = np.random.choice(range(n_size), size=n_size, replace=True)
        f1_base = f1_score(y_true[idx], y_pred_base[idx])
        f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx])
        f1_diffs.append(f1_one_shot - f1_base)
    
    # Schritt 3: Konfidenzintervall berechnen
    ci_lower = np.percentile(f1_diffs, 2.5)
    ci_upper = np.percentile(f1_diffs, 97.5)
    
    print(f"95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")
    
    # Schritt 4: Interpretation
    if ci_lower > 0:
        print("✅ OneShot signifikant besser als Base (F1).")
    elif ci_upper < 0:
        print("❌ Base signifikant besser als OneShot (F1).")
    else:
        print("➖ Kein signifikanter Unterschied beim F1.")

In [63]:
def calculateBootstrapMulti(y_true, y_pred_base, y_pred_pe):

    
    # Sicherstellen, dass alles integer ist
    y_true = y_true.astype(int)
    y_pred_base = y_pred_base.astype(int)
    y_pred_pe = y_pred_pe.astype(int)

    # Bootstrap-Parameter
    n_iterations = 1000
    n_size = len(y_true)
    f1_diffs = []
    
    # Schritt 2: Bootstrap-Schleife
    for _ in range(n_iterations):
        idx = np.random.choice(range(n_size), size=n_size, replace=True)
        f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')
        f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx], average='weighted')
        f1_diffs.append(f1_one_shot - f1_base)
    
    # Schritt 3: Konfidenzintervall berechnen
    ci_lower = np.percentile(f1_diffs, 2.5)
    ci_upper = np.percentile(f1_diffs, 97.5)
    
    print(f"95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")
    
    # Schritt 4: Interpretation
    if ci_lower > 0:
        print("✅ OneShot signifikant besser als Base (F1).")
    elif ci_upper < 0:
        print("❌ Base signifikant besser als OneShot (F1).")
    else:
        print("➖ Kein signifikanter Unterschied beim F1.")

# SST2

## Baseline

### GPT

In [7]:

# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SST2_BASELINE_GPT.csv',sep=';')
# sst2 gemini
#df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SST2_BASELINE_GEMINI.csv',sep=';')

df = df[df['generated_label'].isin(['0', '1',0,1])]

df['generated_label'] = df['generated_label'].astype(int)

df

Unnamed: 0.1,Unnamed: 0,index,sentence,generated_label
0,0,0,hide new secretions from the parental units,0
1,1,1,"contains no wit , only labored gags",0
2,2,2,that loves its characters and communicates som...,1
3,3,3,remains utterly satisfied to remain the same t...,1
4,4,4,on the worst revenge-of-the-nerds clichés the ...,0
...,...,...,...,...
992,995,995,you wish you were at home watching that movie ...,0
993,996,996,'s no point in extracting the bare bones of by...,0
994,997,997,underdeveloped,0
995,998,998,the jokes are flat,0


In [8]:
# Bilde ein Subset mit 1.000 Einträgen
sst2_subset = sst2_combined[:1000]
sst2_subset

# Beide DataFrames anhand der 'index'-Spalte mergen
df_combined_base_sst2_gpt = pd.merge(sst2_subset, df, on='index')


In [18]:
df_combined_base_sst2_gpt.head()

Unnamed: 0.1,index,sentence_x,label,Unnamed: 0,sentence_y,generated_label
0,0,hide new secretions from the parental units,0,0,hide new secretions from the parental units,0
1,1,"contains no wit , only labored gags",0,1,"contains no wit , only labored gags",0
2,2,that loves its characters and communicates som...,1,2,that loves its characters and communicates som...,1
3,3,remains utterly satisfied to remain the same t...,0,3,remains utterly satisfied to remain the same t...,1
4,4,on the worst revenge-of-the-nerds clichés the ...,0,4,on the worst revenge-of-the-nerds clichés the ...,0


### Gemini

In [9]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SST2_BASELINE_GEMINI.csv',sep=';')

df = df[df['generated_label'].isin(['0', '1',0,1])]

df['generated_label'] = df['generated_label'].astype(int)

df

Unnamed: 0,index,sentence,generated_label
0,0,hide new secretions from the parental units,0
1,1,"contains no wit , only labored gags",0
2,2,that loves its characters and communicates som...,1
3,3,remains utterly satisfied to remain the same t...,0
4,4,on the worst revenge-of-the-nerds clichés the ...,0
...,...,...,...
984,995,you wish you were at home watching that movie ...,0
985,996,'s no point in extracting the bare bones of by...,0
986,997,underdeveloped,0
987,998,the jokes are flat,0


In [10]:
# Beide DataFrames anhand der 'index'-Spalte mergen
df_combined_base_sst2_gemini = pd.merge(sst2_subset, df, on='index')
df_combined_base_sst2_gemini

Unnamed: 0,index,sentence_x,label,sentence_y,generated_label
0,0,hide new secretions from the parental units,0,hide new secretions from the parental units,0
1,1,"contains no wit , only labored gags",0,"contains no wit , only labored gags",0
2,2,that loves its characters and communicates som...,1,that loves its characters and communicates som...,1
3,3,remains utterly satisfied to remain the same t...,0,remains utterly satisfied to remain the same t...,0
4,4,on the worst revenge-of-the-nerds clichés the ...,0,on the worst revenge-of-the-nerds clichés the ...,0
...,...,...,...,...,...
980,995,you wish you were at home watching that movie ...,0,you wish you were at home watching that movie ...,0
981,996,'s no point in extracting the bare bones of by...,0,'s no point in extracting the bare bones of by...,0
982,997,underdeveloped,0,underdeveloped,0
983,998,the jokes are flat,0,the jokes are flat,0


## One-Shot

### GPT

In [46]:

# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SST2_ONE_SHOT_GPT.csv',sep=',')


df = df[df['generated_label'].isin(['0', '1',0,1])]

df['generated_label'] = df['generated_label'].astype(int)

df

Unnamed: 0.1,Unnamed: 0,index,sentence,generated_label
0,0,0,hide new secretions from the parental units,0
1,1,1,"contains no wit , only labored gags",0
2,2,2,that loves its characters and communicates som...,1
3,3,3,remains utterly satisfied to remain the same t...,1
4,4,4,on the worst revenge-of-the-nerds clichés the ...,0
...,...,...,...,...
995,995,995,you wish you were at home watching that movie ...,0
996,996,996,'s no point in extracting the bare bones of by...,0
997,997,997,underdeveloped,0
998,998,998,the jokes are flat,0


In [26]:
# Beide DataFrames anhand der 'index'-Spalte mergen
df_one_shot_sst2_gpt = pd.merge(sst2_subset, df, on='index')



### Gemini

In [138]:

# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SST2_ONE_SHOT_GEMINI.csv',sep=',')


df = df[df['generated_label'].isin(['0', '1',0,1])]

df['generated_label'] = df['generated_label'].astype(int)

df

Unnamed: 0.1,Unnamed: 0,index,sentence,generated_label
0,0,0,hide new secretions from the parental units,0
1,1,1,"contains no wit , only labored gags",0
2,2,2,that loves its characters and communicates som...,1
3,3,3,remains utterly satisfied to remain the same t...,0
4,4,4,on the worst revenge-of-the-nerds clichés the ...,0
...,...,...,...,...
977,977,995,you wish you were at home watching that movie ...,0
978,978,996,'s no point in extracting the bare bones of by...,0
979,979,997,underdeveloped,0
980,980,998,the jokes are flat,0


In [140]:
# Beide DataFrames anhand der 'index'-Spalte mergen
df_one_shot_sst2_gemini = pd.merge(sst2_subset, df, on='index')



Accuracy: 0.89
Precision: 0.99
Recall: 0.81
F1-Score: 0.89
Confusion Matrix:
[[436   5]
 [101 440]]


## McNemar Base - One Shot GPT

In [28]:
df_merged_sst2_gpt_base_oneShot = df_combined_base_sst2_gpt[['sentence_x', 'label', 'generated_label']].merge(
    df_one_shot_sst2_gpt[['sentence_x', 'generated_label']],
    on='sentence_x',
    suffixes=('_base', '_one_shot')
)

In [110]:
from statsmodels.stats.contingency_tables import mcnemar
import numpy as np

y_true = df_merged_sst2_gpt_base_oneShot['label'].values
y_pred_base = df_merged_sst2_gpt_base_oneShot['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gpt_base_oneShot['generated_label_one_shot'].values

b = np.sum((y_pred_base != y_true) & (y_pred_one_shot == y_true)) # Base falsch, OneShot korrekt
c = np.sum((y_pred_base == y_true) & (y_pred_one_shot != y_true)) # Base richtig, OneShot falsch

table = [[0, b], 
         [c, 0]]

result = mcnemar(table, exact=True)
print(f"P-Wert: {result.pvalue}")

P-Wert: 0.07894069368148846


In [130]:
y_true = df_merged_sst2_gpt_base_oneShot['label'].values
y_pred_base = df_merged_sst2_gpt_base_oneShot['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gpt_base_oneShot['generated_label_one_shot'].values

b = np.sum((y_pred_base != y_true) & (y_pred_one_shot == y_true)) # Base falsch, OneShot korrekt
c = np.sum((y_pred_base == y_true) & (y_pred_one_shot != y_true)) # Base richtig, OneShot falsch

table = [[0, b], 
         [c, 0]]

result = mcnemar(table, exact=True)
print(f"P-Wert: {result.pvalue}")



## Results

contingency_table = [[0, b],
                     [c, 0]]

print("Kontingenztabelle:")
print(pd.DataFrame([[0, b], [c, 0]], 
                   columns=["OneShot Falsch", "OneShot richtig"], 
                   index=["Base Falsch", "Base richtig"]))

# Schritt 4: McNemar-Test
result = mcnemar(contingency_table, exact=True)
print("\nMcNemar Test:")
print(f"Teststatistik: {result.statistic}")
print(f"P-Wert: {result.pvalue}")

# Interpretation
alpha = 0.05
if result.pvalue < alpha:
    print("✅ Signifikanter Unterschied zwischen Baseline und One-Shot.")
else:
    print("Kein signifikanter Unterschied.")

Kontingenztabelle:
              OneShot Falsch  OneShot richtig
Base Falsch                0               17
Base richtig              30                0

McNemar Test:
Teststatistik: 17.0
P-Wert: 0.07894069368148846
Kein signifikanter Unterschied.


### def Bootstrap GPT function

In [36]:
def calculateBootstrap(y_true, y_pred_base, y_pred_pe):

    
    # Sicherstellen, dass alles integer ist
    y_true = y_true.astype(int)
    y_pred_base = y_pred_base.astype(int)
    y_pred_pe = y_pred_pe.astype(int)

    # Bootstrap-Parameter
    n_iterations = 1000
    n_size = len(y_true)
    f1_diffs = []
    
    # Schritt 2: Bootstrap-Schleife
    for _ in range(n_iterations):
        idx = np.random.choice(range(n_size), size=n_size, replace=True)
        f1_base = f1_score(y_true[idx], y_pred_base[idx])
        f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx])
        f1_diffs.append(f1_one_shot - f1_base)
    
    # Schritt 3: Konfidenzintervall berechnen
    ci_lower = np.percentile(f1_diffs, 2.5)
    ci_upper = np.percentile(f1_diffs, 97.5)
    
    print(f"95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")
    
    # Schritt 4: Interpretation
    if ci_lower > 0:
        print("✅ OneShot signifikant besser als Base (F1).")
    elif ci_upper < 0:
        print("❌ Base signifikant besser als OneShot (F1).")
    else:
        print("➖ Kein signifikanter Unterschied beim F1.")

In [38]:
import numpy as np
from sklearn.metrics import f1_score
import pandas as pd

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sst2_gpt_base_oneShot['label'].values
y_pred_base = df_merged_sst2_gpt_base_oneShot['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gpt_base_oneShot['generated_label_one_shot'].values


calculateBootstrap(y_true
,y_pred_base
,y_pred_one_shot)


95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [-0.0953, -0.0536]
❌ Base signifikant besser als OneShot (F1).


## McNemar Gemini

In [172]:

## Merge data

df_merged_sst2_gemini_base_oneShot = df_combined_base_sst2_gemini[['sentence_x', 'label', 'generated_label']].merge(
    df_one_shot_sst2_gemini[['sentence_x', 'generated_label']],
    on='sentence_x',
    suffixes=('_base', '_one_shot')
)
df_merged_sst2_gemini_base_oneShot



Unnamed: 0,sentence_x,label,generated_label_base,generated_label_one_shot
0,hide new secretions from the parental units,0,0,0
1,"contains no wit , only labored gags",0,0,0
2,that loves its characters and communicates som...,1,1,1
3,remains utterly satisfied to remain the same t...,0,0,0
4,on the worst revenge-of-the-nerds clichés the ...,0,0,0
...,...,...,...,...
962,you wish you were at home watching that movie ...,0,0,0
963,'s no point in extracting the bare bones of by...,0,0,0
964,underdeveloped,0,0,0
965,the jokes are flat,0,0,0


In [176]:
y_true = df_merged_sst2_gemini_base_oneShot['label'].values
y_pred_base = df_merged_sst2_gemini_base_oneShot['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gemini_base_oneShot['generated_label_one_shot'].values

b = np.sum((y_pred_base != y_true) & (y_pred_one_shot == y_true)) # Base falsch, OneShot korrekt
c = np.sum((y_pred_base == y_true) & (y_pred_one_shot != y_true)) # Base richtig, OneShot falsch

table = [[0, b], 
         [c, 0]]

result = mcnemar(table, exact=True)
print(f"P-Wert: {result.pvalue}")



## Results

contingency_table = [[0, b],
                     [c, 0]]

print("Kontingenztabelle:")
print(pd.DataFrame([[0, b], [c, 0]], 
                   columns=["OneShot Falsch", "OneShot richtig"], 
                   index=["Base Falsch", "Base richtig"]))

# Schritt 4: McNemar-Test
result = mcnemar(contingency_table, exact=True)
print("\nMcNemar Test:")
print(f"Teststatistik: {result.statistic}")
print(f"P-Wert: {result.pvalue}")

# Interpretation
alpha = 0.05
if result.pvalue < alpha:
    print("✅ Signifikanter Unterschied zwischen Baseline und One-Shot (Gemini).")
else:
    print("Kein signifikanter Unterschied.")

P-Wert: 1.709326186236787e-09
Kontingenztabelle:
              OneShot Falsch  OneShot richtig
Base Falsch                0               54
Base richtig               8                0

McNemar Test:
Teststatistik: 8.0
P-Wert: 1.709326186236787e-09
✅ Signifikanter Unterschied zwischen Baseline und One-Shot (Gemini).


### Bootstrap Gemini

In [189]:


# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sst2_gemini_base_oneShot['label'].values
y_pred_base = df_merged_sst2_gemini_base_oneShot['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gemini_base_oneShot['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx])
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx])
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ OneShot signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als OneShot (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [0.0389, 0.0751]
✅ OneShot signifikant besser als Base (F1).


## Few-Shot

### GPT

In [284]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SST2_FEW_SHOT_GPT.csv',sep=',')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_fewShot_sst2_gpt = pd.merge(sst2_subset, df, on='index')
# df_fewShot_sst2_gpt

# Merge with true labels


## Merge data

df_merged_sst2_gpt_base_fewShot = df_combined_base_sst2_gpt[['sentence_x', 'label', 'generated_label']].merge(
    df_fewShot_sst2_gpt[['sentence_x', 'generated_label']],
    on='sentence_x',
    suffixes=('_base', '_one_shot')
)
df_merged_sst2_gpt_base_fewShot


Unnamed: 0,sentence_x,label,generated_label_base,generated_label_one_shot
0,hide new secretions from the parental units,0,0,0
1,"contains no wit , only labored gags",0,0,0
2,that loves its characters and communicates som...,1,1,1
3,remains utterly satisfied to remain the same t...,0,1,0
4,on the worst revenge-of-the-nerds clichés the ...,0,0,0
...,...,...,...,...
995,you wish you were at home watching that movie ...,0,0,0
996,'s no point in extracting the bare bones of by...,0,0,0
997,underdeveloped,0,0,0
998,the jokes are flat,0,0,0


#### Bootstrap

In [271]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sst2_gpt_base_fewShot['label'].values
y_pred_base = df_merged_sst2_gpt_base_fewShot['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gpt_base_fewShot['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx])
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx])
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (FewShot - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ FewShot signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als FewShot (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (FewShot - Base): [0.0164, 0.0423]
✅ FewShot signifikant besser als Base (F1).


### Gemini

In [279]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SST2_FEW_SHOT_GEMINI.csv',sep=',')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_fewShot_sst2_gemini = pd.merge(sst2_subset, df, on='index')
# df_fewShot_sst2_gemini

# Merge with true labels

## Merge data

df_merged_sst2_gemini_base_fewShot = df_combined_base_sst2_gemini[['sentence_x', 'label', 'generated_label']].merge(
    df_fewShot_sst2_gemini[['sentence_x', 'generated_label']],
    on='sentence_x',
    suffixes=('_base', '_one_shot')
)
df_merged_sst2_gemini_base_fewShot


#### Bootstrap

In [290]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sst2_gemini_base_fewShot['label'].values
y_pred_base = df_merged_sst2_gemini_base_fewShot['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gemini_base_fewShot['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx])
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx])
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (FewShot - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ FewShot signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als FewShot (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (FewShot - Base): [0.0549, 0.0931]
✅ FewShot signifikant besser als Base (F1).


## CoT

### GPT

In [24]:

# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SST2_CoT_GPT.csv',sep=',')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_CoT_sst2_gpt = pd.merge(sst2_subset, df, on='index')
# df_fewShot_sst2_gpt

# Merge with true labels


## Merge data

df_merged_sst2_gpt_base_CoT = df_combined_base_sst2_gpt[['sentence_x', 'label', 'generated_label']].merge(
    df_CoT_sst2_gpt[['sentence_x', 'generated_label']],
    on='sentence_x',
    suffixes=('_base', '_one_shot')
)
df_merged_sst2_gpt_base_CoT


Unnamed: 0,sentence_x,label,generated_label_base,generated_label_one_shot
0,hide new secretions from the parental units,0,0,0
1,"contains no wit , only labored gags",0,0,0
2,that loves its characters and communicates som...,1,1,1
3,remains utterly satisfied to remain the same t...,0,1,1
4,on the worst revenge-of-the-nerds clichés the ...,0,0,0
...,...,...,...,...
938,you wish you were at home watching that movie ...,0,0,0
939,'s no point in extracting the bare bones of by...,0,0,0
940,underdeveloped,0,0,0
941,the jokes are flat,0,0,0


#### Bootstrap

In [32]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sst2_gpt_base_CoT['label'].values
y_pred_base = df_merged_sst2_gpt_base_CoT['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gpt_base_CoT['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx])
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx])
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (CoT - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ CoT signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als CoT (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (CoT - Base): [-0.0197, 0.0134]
➖ Kein signifikanter Unterschied beim F1.


### Gemini

In [66]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SST2_CoT_GEMINI_repaired.csv',sep=';',on_bad_lines='skip')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)
# df['index'] = df['index'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_CoT_sst2_gemini = pd.merge(sst2_subset, df, on='index')
# df_fewShot_sst2_gemini

# Merge with true labels

## Merge data

df_merged_sst2_gemini_base_CoT = df_combined_base_sst2_gemini[['sentence_x', 'label', 'generated_label']].merge(
    df_CoT_sst2_gemini[['sentence_x', 'generated_label']],
    on='sentence_x',
    suffixes=('_base', '_one_shot')
)
df_merged_sst2_gemini_base_CoT


Unnamed: 0,sentence_x,label,generated_label_base,generated_label_one_shot
0,hide new secretions from the parental units,0,0,0
1,"contains no wit , only labored gags",0,0,0
2,that loves its characters and communicates som...,1,1,1
3,remains utterly satisfied to remain the same t...,0,0,0
4,on the worst revenge-of-the-nerds clichés the ...,0,0,0
...,...,...,...,...
819,you wish you were at home watching that movie ...,0,0,0
820,'s no point in extracting the bare bones of by...,0,0,0
821,underdeveloped,0,0,0
822,the jokes are flat,0,0,0


#### Bootstrap

In [77]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sst2_gemini_base_CoT['label'].values
y_pred_base = df_merged_sst2_gemini_base_CoT['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gemini_base_CoT['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx])
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx])
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (CoT - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ CoT signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als CoT (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (CoT - Base): [0.0397, 0.0801]
✅ CoT signifikant besser als Base (F1).


## ZeroShot-CoT

### GPT

In [84]:

# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SST2_ZeroShot_CoT_GPT.csv',sep=',')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_ZeroShotCoT_sst2_gpt = pd.merge(sst2_subset, df, on='index')
# df_fewShot_sst2_gpt

# Merge with true labels


## Merge data

df_merged_sst2_gpt_base_ZeroShotCoT = df_combined_base_sst2_gpt[['sentence_x', 'label', 'generated_label']].merge(
    df_ZeroShotCoT_sst2_gpt[['sentence_x', 'generated_label']],
    on='sentence_x',
    suffixes=('_base', '_one_shot')
)
df_merged_sst2_gpt_base_ZeroShotCoT


Unnamed: 0,sentence_x,label,generated_label_base,generated_label_one_shot
0,hide new secretions from the parental units,0,0,0
1,"contains no wit , only labored gags",0,0,0
2,that loves its characters and communicates som...,1,1,1
3,remains utterly satisfied to remain the same t...,0,1,1
4,on the worst revenge-of-the-nerds clichés the ...,0,0,0
...,...,...,...,...
983,you wish you were at home watching that movie ...,0,0,0
984,'s no point in extracting the bare bones of by...,0,0,0
985,underdeveloped,0,0,0
986,the jokes are flat,0,0,0


#### Bootstrap

In [89]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sst2_gpt_base_ZeroShotCoT['label'].values
y_pred_base = df_merged_sst2_gpt_base_ZeroShotCoT['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gpt_base_ZeroShotCoT['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx])
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx])
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (ZeroShot-CoT - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ ZeroShot-CoT signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als ZeroShot-CoT (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (ZeroShot-CoT - Base): [-0.0055, 0.0233]
➖ Kein signifikanter Unterschied beim F1.


### Gemini

In [92]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SST2_ZeroShot_CoT_GEMINI.csv',sep=',')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)
# df['index'] = df['index'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_ZeroShotCoT_sst2_gemini = pd.merge(sst2_subset, df, on='index')
# df_fewShot_sst2_gemini

# Merge with true labels

## Merge data

df_merged_sst2_gemini_base_ZeroShotCoT = df_combined_base_sst2_gemini[['sentence_x', 'label', 'generated_label']].merge(
    df_ZeroShotCoT_sst2_gemini[['sentence_x', 'generated_label']],
    on='sentence_x',
    suffixes=('_base', '_one_shot')
)
df_merged_sst2_gemini_base_ZeroShotCoT


Unnamed: 0,sentence_x,label,generated_label_base,generated_label_one_shot
0,hide new secretions from the parental units,0,0,0
1,"contains no wit , only labored gags",0,0,0
2,that loves its characters and communicates som...,1,1,1
3,remains utterly satisfied to remain the same t...,0,0,1
4,on the worst revenge-of-the-nerds clichés the ...,0,0,0
...,...,...,...,...
936,you wish you were at home watching that movie ...,0,0,0
937,'s no point in extracting the bare bones of by...,0,0,0
938,underdeveloped,0,0,0
939,the jokes are flat,0,0,0


#### Bootstrap

In [99]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sst2_gemini_base_ZeroShotCoT['label'].values
y_pred_base = df_merged_sst2_gemini_base_ZeroShotCoT['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gemini_base_ZeroShotCoT['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx])
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx])
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (ZeroShot-CoT - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ ZeroShot-CoT signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als ZeroShot-CoT (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (ZeroShot-CoT - Base): [0.0002, 0.0462]
✅ ZeroShot-CoT signifikant besser als Base (F1).


## SelfConsistency

### GPT

In [106]:

# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SST2_SelfConsistency3_1Exemplar_GPT.csv',sep=',')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_SelfCons_sst2_gpt = pd.merge(sst2_subset, df, on='index')
# df_fewShot_sst2_gpt

# Merge with true labels


## Merge data

df_merged_sst2_gpt_base_SelfCons = df_combined_base_sst2_gpt[['sentence_x', 'label', 'generated_label']].merge(
    df_SelfCons_sst2_gpt[['sentence_x', 'generated_label']],
    on='sentence_x',
    suffixes=('_base', '_one_shot')
)
df_merged_sst2_gpt_base_SelfCons


Unnamed: 0,sentence_x,label,generated_label_base,generated_label_one_shot
0,hide new secretions from the parental units,0,0,0
1,"contains no wit , only labored gags",0,0,0
2,that loves its characters and communicates som...,1,1,1
3,remains utterly satisfied to remain the same t...,0,1,1
4,on the worst revenge-of-the-nerds clichés the ...,0,0,0
...,...,...,...,...
988,you wish you were at home watching that movie ...,0,0,0
989,'s no point in extracting the bare bones of by...,0,0,0
990,underdeveloped,0,0,0
991,the jokes are flat,0,0,0


#### Bootstrap

In [109]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sst2_gpt_base_SelfCons['label'].values
y_pred_base = df_merged_sst2_gpt_base_SelfCons['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gpt_base_SelfCons['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx])
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx])
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (SelfConsistency - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ SelfConsistency signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als SelfConsistency (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (SelfConsistency - Base): [-0.1053, -0.0597]
❌ Base signifikant besser als SelfConsistency (F1).


### Gemini

In [112]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SST2_SelfConsistency_GEMINI.csv',sep=',')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)
# df['index'] = df['index'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_SelfCons_sst2_gemini = pd.merge(sst2_subset, df, on='index')
# df_fewShot_sst2_gemini

# Merge with true labels

## Merge data

df_merged_sst2_gemini_base_SelfCons = df_combined_base_sst2_gemini[['sentence_x', 'label', 'generated_label']].merge(
    df_SelfCons_sst2_gemini[['sentence_x', 'generated_label']],
    on='sentence_x',
    suffixes=('_base', '_one_shot')
)
df_merged_sst2_gemini_base_SelfCons


Unnamed: 0,sentence_x,label,generated_label_base,generated_label_one_shot
0,hide new secretions from the parental units,0,0,0
1,"contains no wit , only labored gags",0,0,0
2,that loves its characters and communicates som...,1,1,1
3,remains utterly satisfied to remain the same t...,0,0,0
4,on the worst revenge-of-the-nerds clichés the ...,0,0,0
...,...,...,...,...
969,you wish you were at home watching that movie ...,0,0,0
970,'s no point in extracting the bare bones of by...,0,0,0
971,underdeveloped,0,0,0
972,the jokes are flat,0,0,0


#### Bootstrap

In [115]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sst2_gemini_base_SelfCons['label'].values
y_pred_base = df_merged_sst2_gemini_base_SelfCons['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gemini_base_SelfCons['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx])
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx])
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (SelfConsistency - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ SelfConsistency signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als SelfConsistency (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (SelfConsistency - Base): [0.0278, 0.0698]
✅ SelfConsistency signifikant besser als Base (F1).


# SB10K

### GPT - BASE

In [137]:

# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SB10k_BASELINE_GPT.csv',sep=',')
# sst2 gemini
# df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SB10k_BASELINE_GEMINI.csv',sep=',')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_base_sb10k_gpt = pd.merge(sb10k_combined, df, on='index')
df_base_sb10k_gpt


Unnamed: 0.1,index,Text,Sentiment,sentiment_coded,Unnamed: 0,sentence,generated_label
0,0,RT @TheKedosZone : So ein Hearthstone - Key vo...,positive,1,0,RT @TheKedosZone : So ein Hearthstone - Key vo...,1
1,1,"Tainted Talents ( Ateliertagebuch. ) "" Wir sin...",neutral,2,1,"Tainted Talents ( Ateliertagebuch. ) "" Wir sin...",2
2,2,Aber wenigstens kommt #Supernatural heute mal ...,neutral,2,2,Aber wenigstens kommt #Supernatural heute mal ...,0
3,3,DARLEHEN - Angebot für Schufa - freie Darlehen...,neutral,2,3,DARLEHEN - Angebot für Schufa - freie Darlehen...,2
4,4,ANRUF ERWÜNSCHT : Hardcore Teeny Vicky Carrera...,neutral,2,4,ANRUF ERWÜNSCHT : Hardcore Teeny Vicky Carrera...,2
...,...,...,...,...,...,...,...
995,995,@sinkingFX Die Liebe unter Verwandten . : ' 3,positive,1,995,@sinkingFX Die Liebe unter Verwandten . : ' 3,1
996,996,Cros Kindermusik braucht keiner .,negative,0,996,Cros Kindermusik braucht keiner .,0
997,997,RT @ZDFsport : Extrem viele Braunschweiger hie...,positive,1,997,RT @ZDFsport : Extrem viele Braunschweiger hie...,1
998,998,Moin Moin ... trotz Regen wünsche ich euch ein...,positive,1,998,Moin Moin ... trotz Regen wünsche ich euch ein...,1


### Gemini BASE

In [156]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SB10k_BASELINE_GEMINI.csv',sep=',')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_base_sb10k_gemini = pd.merge(sb10k_combined, df, on='index')
df_base_sb10k_gemini


Unnamed: 0.1,index,Text,Sentiment,sentiment_coded,Unnamed: 0,sentence,generated_label
0,0,RT @TheKedosZone : So ein Hearthstone - Key vo...,positive,1,0,RT @TheKedosZone : So ein Hearthstone - Key vo...,1
1,1,"Tainted Talents ( Ateliertagebuch. ) "" Wir sin...",neutral,2,1,"Tainted Talents ( Ateliertagebuch. ) "" Wir sin...",2
2,2,Aber wenigstens kommt #Supernatural heute mal ...,neutral,2,2,Aber wenigstens kommt #Supernatural heute mal ...,0
3,3,DARLEHEN - Angebot für Schufa - freie Darlehen...,neutral,2,3,DARLEHEN - Angebot für Schufa - freie Darlehen...,2
4,4,ANRUF ERWÜNSCHT : Hardcore Teeny Vicky Carrera...,neutral,2,4,ANRUF ERWÜNSCHT : Hardcore Teeny Vicky Carrera...,0
...,...,...,...,...,...,...,...
994,995,@sinkingFX Die Liebe unter Verwandten . : ' 3,positive,1,994,@sinkingFX Die Liebe unter Verwandten . : ' 3,2
995,996,Cros Kindermusik braucht keiner .,negative,0,995,Cros Kindermusik braucht keiner .,0
996,997,RT @ZDFsport : Extrem viele Braunschweiger hie...,positive,1,996,RT @ZDFsport : Extrem viele Braunschweiger hie...,1
997,998,Moin Moin ... trotz Regen wünsche ich euch ein...,positive,1,997,Moin Moin ... trotz Regen wünsche ich euch ein...,1


## One Shot SB10k

### GPT

In [143]:
# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SB10k_ONE_SHOT_GPT.csv',sep=',')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_oneshot_sb10k_gpt = pd.merge(sb10k_combined, df, on='index')
#df_oneshot_sb10k_gpt


# Merge with true labels
## Merge data

df_merged_sb10k_gpt_base_OneShot = df_base_sb10k_gpt[['sentence', 'sentiment_coded', 'generated_label']].merge(
    df_oneshot_sb10k_gpt[['sentence', 'generated_label']],
    on='sentence',
    suffixes=('_base', '_one_shot')
)
df_merged_sb10k_gpt_base_OneShot




Unnamed: 0,sentence,sentiment_coded,generated_label_base,generated_label_one_shot
0,RT @TheKedosZone : So ein Hearthstone - Key vo...,1,1,2
1,"Tainted Talents ( Ateliertagebuch. ) "" Wir sin...",2,2,2
2,Aber wenigstens kommt #Supernatural heute mal ...,2,0,0
3,DARLEHEN - Angebot für Schufa - freie Darlehen...,2,2,2
4,ANRUF ERWÜNSCHT : Hardcore Teeny Vicky Carrera...,2,2,2
...,...,...,...,...
995,@sinkingFX Die Liebe unter Verwandten . : ' 3,1,1,2
996,Cros Kindermusik braucht keiner .,0,0,0
997,RT @ZDFsport : Extrem viele Braunschweiger hie...,1,1,1
998,Moin Moin ... trotz Regen wünsche ich euch ein...,1,1,1


#### Bootstrap

In [152]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sb10k_gpt_base_OneShot['sentiment_coded'].values
y_pred_base = df_merged_sb10k_gpt_base_OneShot['generated_label_base'].values
y_pred_one_shot = df_merged_sb10k_gpt_base_OneShot['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')  ## bc of multi-class
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx], average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (SelfConsistency - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ SelfConsistency signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als SelfConsistency (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (SelfConsistency - Base): [0.0288, 0.0684]
✅ SelfConsistency signifikant besser als Base (F1).


### Gemini

In [158]:

# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SB10k_ONE_SHOT_GEMINI.csv',sep=',')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_oneshot_sb10k_gemini = pd.merge(sb10k_combined, df, on='index')
# df_oneshot_sb10k_gemini


# Merge with baseline
## Merge data

df_merged_sst2_gemini_base_OneShot = df_base_sb10k_gemini[['sentence', 'sentiment_coded', 'generated_label']].merge(
    df_oneshot_sb10k_gemini[['sentence', 'generated_label']],
    on='sentence',
    suffixes=('_base', '_one_shot')
)
df_merged_sst2_gemini_base_OneShot



Unnamed: 0,sentence,sentiment_coded,generated_label_base,generated_label_one_shot
0,RT @TheKedosZone : So ein Hearthstone - Key vo...,1,1,1
1,"Tainted Talents ( Ateliertagebuch. ) "" Wir sin...",2,2,2
2,Aber wenigstens kommt #Supernatural heute mal ...,2,0,0
3,DARLEHEN - Angebot für Schufa - freie Darlehen...,2,2,2
4,ANRUF ERWÜNSCHT : Hardcore Teeny Vicky Carrera...,2,0,0
...,...,...,...,...
980,@sinkingFX Die Liebe unter Verwandten . : ' 3,1,2,2
981,Cros Kindermusik braucht keiner .,0,0,0
982,RT @ZDFsport : Extrem viele Braunschweiger hie...,1,1,1
983,Moin Moin ... trotz Regen wünsche ich euch ein...,1,1,2


#### Bootstrap

In [165]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sst2_gemini_base_OneShot['sentiment_coded'].values
y_pred_base = df_merged_sst2_gemini_base_OneShot['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gemini_base_OneShot['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx],  average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ OneShot signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als OneShot (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [0.0540, 0.1004]
✅ OneShot signifikant besser als Base (F1).


## FewShot

### GPT

In [169]:
# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SB10k_FEW_SHOT_GPT.csv',sep=',')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_fewShot_sb10k_gpt = pd.merge(sb10k_combined, df, on='index')
#df_oneshot_sb10k_gpt


# Merge with true labels
## Merge data

df_merged_sb10k_gpt_base_FewShot = df_base_sb10k_gpt[['sentence', 'sentiment_coded', 'generated_label']].merge(
    df_fewShot_sb10k_gpt[['sentence', 'generated_label']],
    on='sentence',
    suffixes=('_base', '_one_shot')
)
df_merged_sb10k_gpt_base_FewShot




Unnamed: 0,sentence,sentiment_coded,generated_label_base,generated_label_one_shot
0,RT @TheKedosZone : So ein Hearthstone - Key vo...,1,1,2
1,"Tainted Talents ( Ateliertagebuch. ) "" Wir sin...",2,2,2
2,Aber wenigstens kommt #Supernatural heute mal ...,2,0,2
3,DARLEHEN - Angebot für Schufa - freie Darlehen...,2,2,2
4,ANRUF ERWÜNSCHT : Hardcore Teeny Vicky Carrera...,2,2,2
...,...,...,...,...
995,@sinkingFX Die Liebe unter Verwandten . : ' 3,1,1,2
996,Cros Kindermusik braucht keiner .,0,0,0
997,RT @ZDFsport : Extrem viele Braunschweiger hie...,1,1,1
998,Moin Moin ... trotz Regen wünsche ich euch ein...,1,1,1


#### Bootstrap

In [174]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sb10k_gpt_base_FewShot['sentiment_coded'].values
y_pred_base = df_merged_sb10k_gpt_base_FewShot['generated_label_base'].values
y_pred_one_shot = df_merged_sb10k_gpt_base_FewShot['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')  ## bc of multi-class
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx], average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (FewShot - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ FewShot signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als FewShot (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (FewShot - Base): [0.0673, 0.1195]
✅ FewShot signifikant besser als Base (F1).


### Gemini

In [180]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SB10k_FEW_SHOT_GEMINI.csv',sep=',')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_fewShot_sb10k_gemini = pd.merge(sb10k_combined, df, on='index')
# df_oneshot_sb10k_gemini


# Merge with baseline
## Merge data

df_merged_sb10k_gemini_base_FewShot = df_base_sb10k_gemini[['sentence', 'sentiment_coded', 'generated_label']].merge(
    df_fewShot_sb10k_gemini[['sentence', 'generated_label']],
    on='sentence',
    suffixes=('_base', '_one_shot')
)
df_merged_sb10k_gemini_base_FewShot



Unnamed: 0,sentence,sentiment_coded,generated_label_base,generated_label_one_shot
0,RT @TheKedosZone : So ein Hearthstone - Key vo...,1,1,2
1,"Tainted Talents ( Ateliertagebuch. ) "" Wir sin...",2,2,2
2,Aber wenigstens kommt #Supernatural heute mal ...,2,0,0
3,DARLEHEN - Angebot für Schufa - freie Darlehen...,2,2,2
4,ANRUF ERWÜNSCHT : Hardcore Teeny Vicky Carrera...,2,0,0
...,...,...,...,...
974,@sinkingFX Die Liebe unter Verwandten . : ' 3,1,2,2
975,Cros Kindermusik braucht keiner .,0,0,0
976,RT @ZDFsport : Extrem viele Braunschweiger hie...,1,1,1
977,Moin Moin ... trotz Regen wünsche ich euch ein...,1,1,1


#### Bootstrap

In [184]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sst2_gemini_base_OneShot['sentiment_coded'].values
y_pred_base = df_merged_sst2_gemini_base_OneShot['generated_label_base'].values
y_pred_one_shot = df_merged_sst2_gemini_base_OneShot['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx],  average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (FewShot - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ FewShot signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als FewShot (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (FewShot - Base): [0.0549, 0.1005]
✅ FewShot signifikant besser als Base (F1).


## CoT

### GPT

In [191]:
# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SB10k_CoT_GPT.csv',sep=',')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_CoT_sb10k_gpt = pd.merge(sb10k_combined, df, on='index')
#df_oneshot_sb10k_gpt


# Merge with true labels
## Merge data

df_merged_sb10k_gpt_base_CoT = df_base_sb10k_gpt[['sentence', 'sentiment_coded', 'generated_label']].merge(
    df_CoT_sb10k_gpt[['sentence', 'generated_label']],
    on='sentence',
    suffixes=('_base', '_one_shot')
)
df_merged_sb10k_gpt_base_CoT




Unnamed: 0,sentence,sentiment_coded,generated_label_base,generated_label_one_shot
0,RT @TheKedosZone : So ein Hearthstone - Key vo...,1,1,1
1,"Tainted Talents ( Ateliertagebuch. ) "" Wir sin...",2,2,2
2,Aber wenigstens kommt #Supernatural heute mal ...,2,0,0
3,DARLEHEN - Angebot für Schufa - freie Darlehen...,2,2,2
4,ANRUF ERWÜNSCHT : Hardcore Teeny Vicky Carrera...,2,2,2
...,...,...,...,...
994,@sinkingFX Die Liebe unter Verwandten . : ' 3,1,1,2
995,Cros Kindermusik braucht keiner .,0,0,0
996,RT @ZDFsport : Extrem viele Braunschweiger hie...,1,1,1
997,Moin Moin ... trotz Regen wünsche ich euch ein...,1,1,1


#### Bootstrap

In [193]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sb10k_gpt_base_CoT['sentiment_coded'].values
y_pred_base = df_merged_sb10k_gpt_base_CoT['generated_label_base'].values
y_pred_one_shot = df_merged_sb10k_gpt_base_CoT['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')  ## bc of multi-class
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx], average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (CoT - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ CoT signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als CoT (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (CoT - Base): [-0.0267, 0.0239]
➖ Kein signifikanter Unterschied beim F1.


### Gemini

In [198]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SB10k_CoT_GEMINI.csv',sep=',')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_CoT_sb10k_gemini = pd.merge(sb10k_combined, df, on='index')
# df_oneshot_sb10k_gemini


# Merge with baseline
## Merge data

df_merged_sb10k_gemini_base_CoT = df_base_sb10k_gemini[['sentence', 'sentiment_coded', 'generated_label']].merge(
    df_CoT_sb10k_gemini[['sentence', 'generated_label']],
    on='sentence',
    suffixes=('_base', '_one_shot')
)
df_merged_sb10k_gemini_base_CoT



Unnamed: 0,sentence,sentiment_coded,generated_label_base,generated_label_one_shot
0,RT @TheKedosZone : So ein Hearthstone - Key vo...,1,1,1
1,"Tainted Talents ( Ateliertagebuch. ) "" Wir sin...",2,2,2
2,Aber wenigstens kommt #Supernatural heute mal ...,2,0,0
3,DARLEHEN - Angebot für Schufa - freie Darlehen...,2,2,2
4,ANRUF ERWÜNSCHT : Hardcore Teeny Vicky Carrera...,2,0,0
...,...,...,...,...
959,@sinkingFX Die Liebe unter Verwandten . : ' 3,1,2,2
960,Cros Kindermusik braucht keiner .,0,0,0
961,RT @ZDFsport : Extrem viele Braunschweiger hie...,1,1,1
962,Moin Moin ... trotz Regen wünsche ich euch ein...,1,1,1


#### Bootstrap

In [200]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sb10k_gemini_base_CoT['sentiment_coded'].values
y_pred_base = df_merged_sb10k_gemini_base_CoT['generated_label_base'].values
y_pred_one_shot = df_merged_sb10k_gemini_base_CoT['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx],  average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (CoT - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ CoT signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als CoT (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (CoT - Base): [-0.0082, 0.0483]
➖ Kein signifikanter Unterschied beim F1.


## ZeroShot-CoT

### GPT

In [204]:
# sb10k gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SB10k_ZeroShot_CoT_GPT.csv',sep=',')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_ZSCoT_sb10k_gpt = pd.merge(sb10k_combined, df, on='index')
#df_oneshot_sb10k_gpt


# Merge with true labels
## Merge data

df_merged_sb10k_gpt_base_ZSCoT = df_base_sb10k_gpt[['sentence', 'sentiment_coded', 'generated_label']].merge(
    df_ZSCoT_sb10k_gpt[['sentence', 'generated_label']],
    on='sentence',
    suffixes=('_base', '_one_shot')
)
df_merged_sb10k_gpt_base_ZSCoT




Unnamed: 0,sentence,sentiment_coded,generated_label_base,generated_label_one_shot
0,RT @TheKedosZone : So ein Hearthstone - Key vo...,1,1,1
1,"Tainted Talents ( Ateliertagebuch. ) "" Wir sin...",2,2,2
2,Aber wenigstens kommt #Supernatural heute mal ...,2,0,0
3,DARLEHEN - Angebot für Schufa - freie Darlehen...,2,2,2
4,ANRUF ERWÜNSCHT : Hardcore Teeny Vicky Carrera...,2,2,1
...,...,...,...,...
995,@sinkingFX Die Liebe unter Verwandten . : ' 3,1,1,1
996,Cros Kindermusik braucht keiner .,0,0,0
997,RT @ZDFsport : Extrem viele Braunschweiger hie...,1,1,1
998,Moin Moin ... trotz Regen wünsche ich euch ein...,1,1,1


#### Bootstrap

In [207]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sb10k_gpt_base_ZSCoT['sentiment_coded'].values
y_pred_base = df_merged_sb10k_gpt_base_ZSCoT['generated_label_base'].values
y_pred_one_shot = df_merged_sb10k_gpt_base_ZSCoT['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')  ## bc of multi-class
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx], average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (ZeroShot-CoT - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ ZeroShot-CoT signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als ZeroShot-CoT (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (ZeroShot-CoT - Base): [-0.1895, -0.1211]
❌ Base signifikant besser als ZeroShot-CoT (F1).


### Gemini

In [212]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SB10k_ZeroShot_CoT_GEMINI.csv',sep=',')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_ZSCoT_sb10k_gemini = pd.merge(sb10k_combined, df, on='index')
# df_oneshot_sb10k_gemini


# Merge with baseline
## Merge data

df_merged_sb10k_gemini_base_ZSCoT = df_base_sb10k_gemini[['sentence', 'sentiment_coded', 'generated_label']].merge(
    df_ZSCoT_sb10k_gemini[['sentence', 'generated_label']],
    on='sentence',
    suffixes=('_base', '_one_shot')
)
df_merged_sb10k_gemini_base_ZSCoT



Unnamed: 0,sentence,sentiment_coded,generated_label_base,generated_label_one_shot
0,RT @TheKedosZone : So ein Hearthstone - Key vo...,1,1,1
1,"Tainted Talents ( Ateliertagebuch. ) "" Wir sin...",2,2,2
2,Aber wenigstens kommt #Supernatural heute mal ...,2,0,0
3,DARLEHEN - Angebot für Schufa - freie Darlehen...,2,2,2
4,ANRUF ERWÜNSCHT : Hardcore Teeny Vicky Carrera...,2,0,0
...,...,...,...,...
993,@sinkingFX Die Liebe unter Verwandten . : ' 3,1,2,2
994,Cros Kindermusik braucht keiner .,0,0,0
995,RT @ZDFsport : Extrem viele Braunschweiger hie...,1,1,1
996,Moin Moin ... trotz Regen wünsche ich euch ein...,1,1,1


#### Bootstrap

In [215]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sb10k_gemini_base_ZSCoT['sentiment_coded'].values
y_pred_base = df_merged_sb10k_gemini_base_ZSCoT['generated_label_base'].values
y_pred_one_shot = df_merged_sb10k_gemini_base_ZSCoT['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx],  average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (ZeroShot CoT - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ ZeroShot CoT signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als ZeroShot CoT (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (ZeroShot CoT - Base): [-0.0403, 0.0145]
➖ Kein signifikanter Unterschied beim F1.


## SelfConsistency

### GPT

In [221]:
# sb10k gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SB10k_SelfConsistency2_GPT.csv',sep=',')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_SC_sb10k_gpt = pd.merge(sb10k_combined, df, on='index')
#df_oneshot_sb10k_gpt


# Merge with true labels
## Merge data

df_merged_sb10k_gpt_base_SC = df_base_sb10k_gpt[['sentence', 'sentiment_coded', 'generated_label']].merge(
    df_SC_sb10k_gpt[['sentence', 'generated_label']],
    on='sentence',
    suffixes=('_base', '_one_shot')
)
df_merged_sb10k_gpt_base_SC




Unnamed: 0,sentence,sentiment_coded,generated_label_base,generated_label_one_shot
0,RT @TheKedosZone : So ein Hearthstone - Key vo...,1,1,1
1,"Tainted Talents ( Ateliertagebuch. ) "" Wir sin...",2,2,2
2,Aber wenigstens kommt #Supernatural heute mal ...,2,0,0
3,DARLEHEN - Angebot für Schufa - freie Darlehen...,2,2,2
4,ANRUF ERWÜNSCHT : Hardcore Teeny Vicky Carrera...,2,2,1
...,...,...,...,...
994,@sinkingFX Die Liebe unter Verwandten . : ' 3,1,1,1
995,Cros Kindermusik braucht keiner .,0,0,0
996,RT @ZDFsport : Extrem viele Braunschweiger hie...,1,1,1
997,Moin Moin ... trotz Regen wünsche ich euch ein...,1,1,1


#### Bootstrap

In [228]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sb10k_gpt_base_SC['sentiment_coded'].values
y_pred_base = df_merged_sb10k_gpt_base_SC['generated_label_base'].values
y_pred_one_shot = df_merged_sb10k_gpt_base_SC['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')  ## bc of multi-class
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx], average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (SelfConsistency - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ SelfConsistency signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als SelfConsistency (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (SelfConsistency - Base): [-0.0114, 0.0373]
➖ Kein signifikanter Unterschied beim F1.


### GEMINI

In [233]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/SC_SB10k_SelfConsistency_GEMINI.csv',sep=',')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_SC_sb10k_gemini = pd.merge(sb10k_combined, df, on='index')
# df_oneshot_sb10k_gemini


# Merge with baseline
## Merge data

df_merged_sb10k_gemini_base_SC = df_base_sb10k_gemini[['sentence', 'sentiment_coded', 'generated_label']].merge(
    df_SC_sb10k_gemini[['sentence', 'generated_label']],
    on='sentence',
    suffixes=('_base', '_one_shot')
)
df_merged_sb10k_gemini_base_SC



Unnamed: 0,sentence,sentiment_coded,generated_label_base,generated_label_one_shot
0,RT @TheKedosZone : So ein Hearthstone - Key vo...,1,1,1
1,"Tainted Talents ( Ateliertagebuch. ) "" Wir sin...",2,2,2
2,Aber wenigstens kommt #Supernatural heute mal ...,2,0,0
3,DARLEHEN - Angebot für Schufa - freie Darlehen...,2,2,2
4,ANRUF ERWÜNSCHT : Hardcore Teeny Vicky Carrera...,2,0,0
...,...,...,...,...
949,@xilovemichelle_ Kutmichelle =( ..,0,0,0
950,Cros Kindermusik braucht keiner .,0,0,0
951,RT @ZDFsport : Extrem viele Braunschweiger hie...,1,1,1
952,Moin Moin ... trotz Regen wünsche ich euch ein...,1,1,1


#### Bootstrap

In [238]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_sb10k_gemini_base_SC['sentiment_coded'].values
y_pred_base = df_merged_sb10k_gemini_base_SC['generated_label_base'].values
y_pred_one_shot = df_merged_sb10k_gemini_base_SC['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx],  average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (SelfConsistency - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ SelfConsistency signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als SelfConsistency (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (SelfConsistency - Base): [0.0201, 0.0736]
✅ SelfConsistency signifikant besser als Base (F1).


# ABSA 

In [42]:
absa_final_subset = absa_final[:1001]


## GPT BASE

In [50]:

# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/ABSA_SemEval2014_BASELINE_GPT.csv')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_base_absa_gpt = pd.merge(absa_final_subset, df, on='index')
df_base_absa_gpt

Unnamed: 0.1,index,sentence_x,aspect_x,sentiment_coded,polarity,Unnamed: 0,sentence_y,aspect_y,generated_label
0,0,I charge it at night and skip taking the cord ...,cord,2,neutral,0,I charge it at night and skip taking the cord ...,cord,1
1,1,I charge it at night and skip taking the cord ...,battery life,1,positive,1,I charge it at night and skip taking the cord ...,battery life,1
2,2,The tech guy then said the service center does...,service center,0,negative,2,The tech guy then said the service center does...,service center,0
3,3,The tech guy then said the service center does...,"""sales"" team",0,negative,3,The tech guy then said the service center does...,"""sales"" team",2
4,4,The tech guy then said the service center does...,tech guy,2,neutral,4,The tech guy then said the service center does...,tech guy,0
...,...,...,...,...,...,...,...,...,...
996,996,iPhotos is an excellent program for storing an...,program,1,positive,996,iPhotos is an excellent program for storing an...,program,1
997,997,Other than that its a great performing machine...,performing,1,positive,997,Other than that its a great performing machine...,performing,1
998,998,"Called tech support and got the usual Acer ""We...",software,0,negative,998,"Called tech support and got the usual Acer ""We...",software,0
999,999,"Called tech support and got the usual Acer ""We...",software,0,negative,999,"Called tech support and got the usual Acer ""We...",software,0


## GEMINI BASE

In [48]:

# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/ABSA_BASELINE_GEMINI.csv')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

df_base_absa_gemini = pd.merge(absa_final_subset, df, on='index')
df_base_absa_gemini




Unnamed: 0.1,index,sentence_x,aspect_x,sentiment_coded,polarity,Unnamed: 0,sentence_y,aspect_y,generated_label
0,0,I charge it at night and skip taking the cord ...,cord,2,neutral,0,I charge it at night and skip taking the cord ...,cord,2
1,1,I charge it at night and skip taking the cord ...,battery life,1,positive,1,I charge it at night and skip taking the cord ...,battery life,1
2,2,The tech guy then said the service center does...,service center,0,negative,2,The tech guy then said the service center does...,service center,0
3,3,The tech guy then said the service center does...,"""sales"" team",0,negative,3,The tech guy then said the service center does...,"""sales"" team",0
4,4,The tech guy then said the service center does...,tech guy,2,neutral,4,The tech guy then said the service center does...,tech guy,0
...,...,...,...,...,...,...,...,...,...
996,996,iPhotos is an excellent program for storing an...,program,1,positive,996,iPhotos is an excellent program for storing an...,program,1
997,997,Other than that its a great performing machine...,performing,1,positive,997,Other than that its a great performing machine...,performing,1
998,998,"Called tech support and got the usual Acer ""We...",software,0,negative,998,"Called tech support and got the usual Acer ""We...",software,0
999,999,"Called tech support and got the usual Acer ""We...",software,0,negative,999,"Called tech support and got the usual Acer ""We...",software,0


## OneShot

### GPT

In [289]:
# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/ABSA_SemEval2014_OneShot_GPT.csv')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_oneshot_absa_gpt = pd.merge(absa_final_subset, df, on='index')
# df_oneshot_absa_gpt




# Merge with true labels
## Merge data

df_merged_absa_gpt_base_OneShot = df_base_absa_gpt[['sentence_x', 'aspect_x', 'sentiment_coded', 'generated_label']].merge(
    df_oneshot_absa_gpt[['sentence_x', 'aspect_x', 'generated_label']],
    on=['sentence_x', 'aspect_x'],
    suffixes=('_base', '_one_shot')
)
df_merged_absa_gpt_base_OneShot





Unnamed: 0,sentence_x,aspect_x,sentiment_coded,generated_label_base,generated_label_one_shot
0,I charge it at night and skip taking the cord ...,cord,2,1,2
1,I charge it at night and skip taking the cord ...,battery life,1,1,1
2,The tech guy then said the service center does...,service center,0,0,0
3,The tech guy then said the service center does...,"""sales"" team",0,2,2
4,The tech guy then said the service center does...,tech guy,2,0,0
...,...,...,...,...,...
1040,"Called tech support and got the usual Acer ""We...",software,0,0,0
1041,"Called tech support and got the usual Acer ""We...",software,0,0,0
1042,"Called tech support and got the usual Acer ""We...",software,0,0,0
1043,"Called tech support and got the usual Acer ""We...",software,0,0,0


#### Bootstrap

In [284]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_absa_gpt_base_OneShot['sentiment_coded'].values
y_pred_base = df_merged_absa_gpt_base_OneShot['generated_label_base'].values
y_pred_one_shot = df_merged_absa_gpt_base_OneShot['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')  ## bc of multi-class
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx], average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ OneShot signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als OneShot (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [0.0003, 0.0267]
✅ OneShot signifikant besser als Base (F1).


## GEMINI

In [291]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/ABSA_OneShot_GEMINI.csv')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_oneShot_absa_gemini = pd.merge(absa_final_subset, df, on='index')
# df_oneshot_sb10k_gemini


# Merge with baseline
## Merge data

df_merged_absa_gemini_base_OneShot = df_base_absa_gemini[['sentence_x', 'aspect_x', 'sentiment_coded', 'generated_label']].merge(
    df_oneShot_absa_gemini[['sentence_x', 'aspect_x', 'generated_label']],
    on=['sentence_x', 'aspect_x'],
    suffixes=('_base', '_one_shot')
)
df_merged_absa_gemini_base_OneShot



Unnamed: 0,sentence_x,aspect_x,sentiment_coded,generated_label_base,generated_label_one_shot
0,I charge it at night and skip taking the cord ...,cord,2,2,2
1,I charge it at night and skip taking the cord ...,battery life,1,1,1
2,The tech guy then said the service center does...,service center,0,0,0
3,The tech guy then said the service center does...,"""sales"" team",0,0,0
4,The tech guy then said the service center does...,tech guy,2,0,0
...,...,...,...,...,...
1040,"Called tech support and got the usual Acer ""We...",software,0,0,0
1041,"Called tech support and got the usual Acer ""We...",software,0,0,0
1042,"Called tech support and got the usual Acer ""We...",software,0,0,0
1043,"Called tech support and got the usual Acer ""We...",software,0,0,0


#### Bootstrap

In [296]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_absa_gemini_base_OneShot['sentiment_coded'].values
y_pred_base = df_merged_absa_gemini_base_OneShot['generated_label_base'].values
y_pred_one_shot = df_merged_absa_gemini_base_OneShot['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx],  average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ OneShot signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als OneShot (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [-0.0143, 0.0169]
➖ Kein signifikanter Unterschied beim F1.


## FewShot

### GPT

In [304]:
# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/ABSA_SemEval2014_FewShot_GPT.csv')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_fewshot_absa_gpt = pd.merge(absa_final_subset, df, on='index')
# df_oneshot_absa_gpt




# Merge with true labels
## Merge data

df_merged_absa_gpt_base_FewShot = df_base_absa_gpt[['sentence_x', 'aspect_x', 'sentiment_coded', 'generated_label']].merge(
    df_fewshot_absa_gpt[['sentence_x', 'aspect_x', 'generated_label']],
    on=['sentence_x', 'aspect_x'],
    suffixes=('_base', '_one_shot')
)
df_merged_absa_gpt_base_FewShot





Unnamed: 0,sentence_x,aspect_x,sentiment_coded,generated_label_base,generated_label_one_shot
0,I charge it at night and skip taking the cord ...,cord,2,1,2
1,I charge it at night and skip taking the cord ...,battery life,1,1,1
2,The tech guy then said the service center does...,service center,0,0,0
3,The tech guy then said the service center does...,"""sales"" team",0,2,2
4,The tech guy then said the service center does...,tech guy,2,0,2
...,...,...,...,...,...
1040,"Called tech support and got the usual Acer ""We...",software,0,0,0
1041,"Called tech support and got the usual Acer ""We...",software,0,0,0
1042,"Called tech support and got the usual Acer ""We...",software,0,0,0
1043,"Called tech support and got the usual Acer ""We...",software,0,0,0


#### Bootrstrap

In [311]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_absa_gpt_base_FewShot['sentiment_coded'].values
y_pred_base = df_merged_absa_gpt_base_FewShot['generated_label_base'].values
y_pred_one_shot = df_merged_absa_gpt_base_FewShot['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')  ## bc of multi-class
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx], average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (FewShot - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ FewShot signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als FewShot (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (FewShot - Base): [0.0053, 0.0364]
✅ FewShot signifikant besser als Base (F1).


## GEMINI

In [316]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/ABSA_FewShot_GEMINI.csv')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_fewShot_absa_gemini = pd.merge(absa_final_subset, df, on='index')
# df_oneshot_sb10k_gemini


# Merge with baseline
## Merge data

df_merged_absa_gemini_base_FewShot = df_base_absa_gemini[['sentence_x', 'aspect_x', 'sentiment_coded', 'generated_label']].merge(
    df_fewShot_absa_gemini[['sentence_x', 'aspect_x', 'generated_label']],
    on=['sentence_x', 'aspect_x'],
    suffixes=('_base', '_one_shot')
)
df_merged_absa_gemini_base_FewShot



Unnamed: 0,sentence_x,aspect_x,sentiment_coded,generated_label_base,generated_label_one_shot
0,I charge it at night and skip taking the cord ...,cord,2,2,2
1,I charge it at night and skip taking the cord ...,battery life,1,1,1
2,The tech guy then said the service center does...,service center,0,0,0
3,The tech guy then said the service center does...,"""sales"" team",0,0,0
4,The tech guy then said the service center does...,tech guy,2,0,0
...,...,...,...,...,...
1040,"Called tech support and got the usual Acer ""We...",software,0,0,0
1041,"Called tech support and got the usual Acer ""We...",software,0,0,0
1042,"Called tech support and got the usual Acer ""We...",software,0,0,0
1043,"Called tech support and got the usual Acer ""We...",software,0,0,0


#### Bootstrap

In [319]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_absa_gemini_base_FewShot['sentiment_coded'].values
y_pred_base = df_merged_absa_gemini_base_FewShot['generated_label_base'].values
y_pred_one_shot = df_merged_absa_gemini_base_FewShot['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx],  average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (FewShot - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ FewShot signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als FewShot (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (FewShot - Base): [-0.0043, 0.0335]
➖ Kein signifikanter Unterschied beim F1.


## CoT

### GPT

In [326]:
# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/ABSA_SemEval2014_CoT_GPT.csv',sep=';')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)


# Beide DataFrames anhand der 'index'-Spalte mergen
df_cot_absa_gpt = pd.merge(absa_final_subset, df, on='index')
# df_oneshot_absa_gpt


# Merge with true labels
## Merge data

df_merged_absa_gpt_base_cot = df_base_absa_gpt[['sentence_x', 'aspect_x', 'sentiment_coded', 'generated_label']].merge(
    df_cot_absa_gpt[['sentence_x', 'aspect_x', 'generated_label']],
    on=['sentence_x', 'aspect_x'],
    suffixes=('_base', '_one_shot')
)
df_merged_absa_gpt_base_cot





Unnamed: 0,sentence_x,aspect_x,sentiment_coded,generated_label_base,generated_label_one_shot
0,I charge it at night and skip taking the cord ...,cord,2,1,1
1,I charge it at night and skip taking the cord ...,battery life,1,1,1
2,The tech guy then said the service center does...,service center,0,0,0
3,The tech guy then said the service center does...,tech guy,2,0,0
4,"it is of high quality, has a killer GUI, is ex...",quality,1,1,1
...,...,...,...,...,...
999,"Called tech support and got the usual Acer ""We...",software,0,0,0
1000,"Called tech support and got the usual Acer ""We...",software,0,0,0
1001,"Called tech support and got the usual Acer ""We...",software,0,0,0
1002,"Called tech support and got the usual Acer ""We...",software,0,0,0


#### Bootstrap

In [331]:

# Schritt 1: True Labels und Vorhersagen extrahieren
y_true = df_merged_absa_gpt_base_cot['sentiment_coded'].values
y_pred_base = df_merged_absa_gpt_base_cot['generated_label_base'].values
y_pred_one_shot = df_merged_absa_gpt_base_cot['generated_label_one_shot'].values

# Sicherstellen, dass alles integer ist
y_true = y_true.astype(int)
y_pred_base = y_pred_base.astype(int)
y_pred_one_shot = y_pred_one_shot.astype(int)

# Bootstrap-Parameter
n_iterations = 1000
n_size = len(y_true)
f1_diffs = []

# Schritt 2: Bootstrap-Schleife
for _ in range(n_iterations):
    idx = np.random.choice(range(n_size), size=n_size, replace=True)
    f1_base = f1_score(y_true[idx], y_pred_base[idx], average='weighted')  ## bc of multi-class
    f1_one_shot = f1_score(y_true[idx], y_pred_one_shot[idx], average='weighted')
    f1_diffs.append(f1_one_shot - f1_base)

# Schritt 3: Konfidenzintervall berechnen
ci_lower = np.percentile(f1_diffs, 2.5)
ci_upper = np.percentile(f1_diffs, 97.5)

print(f"95%-Konfidenzintervall der F1-Differenz (CoT - Base): [{ci_lower:.4f}, {ci_upper:.4f}]")

# Schritt 4: Interpretation
if ci_lower > 0:
    print("✅ CoT signifikant besser als Base (F1).")
elif ci_upper < 0:
    print("❌ Base signifikant besser als CoT (F1).")
else:
    print("➖ Kein signifikanter Unterschied beim F1.")

95%-Konfidenzintervall der F1-Differenz (CoT - Base): [-0.0220, 0.0111]
➖ Kein signifikanter Unterschied beim F1.


### Gemini

In [52]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/ABSA_CoT_GEMINI_repaired.csv',sep=';')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_cot_absa_gemini = pd.merge(absa_final_subset, df, on='index')
# df_oneshot_sb10k_gemini


# Merge with baseline
## Merge data

df_merged_absa_gemini_base_CoT = df_base_absa_gemini[['sentence_x', 'aspect_x', 'sentiment_coded', 'generated_label']].merge(
    df_cot_absa_gemini[['sentence_x', 'aspect_x', 'generated_label']],
    on=['sentence_x', 'aspect_x'],
    suffixes=('_base', '_one_shot')
)
df_merged_absa_gemini_base_CoT



Unnamed: 0,sentence_x,aspect_x,sentiment_coded,generated_label_base,generated_label_one_shot
0,I charge it at night and skip taking the cord ...,cord,2,2,1
1,I charge it at night and skip taking the cord ...,battery life,1,1,1
2,The tech guy then said the service center does...,service center,0,0,0
3,The tech guy then said the service center does...,"""sales"" team",0,0,2
4,The tech guy then said the service center does...,tech guy,2,0,0
...,...,...,...,...,...
1025,"Called tech support and got the usual Acer ""We...",software,0,0,0
1026,"Called tech support and got the usual Acer ""We...",software,0,0,0
1027,"Called tech support and got the usual Acer ""We...",software,0,0,0
1028,"Called tech support and got the usual Acer ""We...",software,0,0,0


#### Bootstrap

In [65]:
y_true = df_merged_absa_gemini_base_CoT['sentiment_coded'].values
y_pred_base = df_merged_absa_gemini_base_CoT['generated_label_base'].values
y_pred_one_shot = df_merged_absa_gemini_base_CoT['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)


95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [-0.0055, 0.0352]
➖ Kein signifikanter Unterschied beim F1.


## Zero Shot CoT

### GPT

In [76]:
# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/ABSA_SemEval2014_ZeroCoT_GPT.csv', sep=';')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_ZSCoT_absa_gpt = pd.merge(absa_final_subset, df, on='index')
# df_oneshot_absa_gpt



# Merge with true labels
## Merge data

df_merged_absa_gpt_base_ZSCoT = df_base_absa_gpt[['sentence_x', 'aspect_x', 'sentiment_coded', 'generated_label']].merge(
    df_ZSCoT_absa_gpt[['sentence_x', 'aspect_x', 'generated_label']],
    on=['sentence_x', 'aspect_x'],
    suffixes=('_base', '_one_shot')
)
df_merged_absa_gpt_base_ZSCoT





Unnamed: 0,sentence_x,aspect_x,sentiment_coded,generated_label_base,generated_label_one_shot
0,I charge it at night and skip taking the cord ...,cord,2,1,1
1,I charge it at night and skip taking the cord ...,battery life,1,1,1
2,The tech guy then said the service center does...,service center,0,0,0
3,The tech guy then said the service center does...,tech guy,2,0,0
4,"it is of high quality, has a killer GUI, is ex...",quality,1,1,1
...,...,...,...,...,...
968,"Called tech support and got the usual Acer ""We...",software,0,0,0
969,"Called tech support and got the usual Acer ""We...",software,0,0,0
970,"Called tech support and got the usual Acer ""We...",software,0,0,0
971,"Called tech support and got the usual Acer ""We...",software,0,0,0


In [78]:
#### Bootstrap 

In [80]:
y_true = df_merged_absa_gpt_base_ZSCoT['sentiment_coded'].values
y_pred_base = df_merged_absa_gpt_base_ZSCoT['generated_label_base'].values
y_pred_one_shot = df_merged_absa_gpt_base_ZSCoT['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [-0.0460, -0.0051]
❌ Base signifikant besser als OneShot (F1).


### Gemini

In [86]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/ABSA_ZeroCoT_GEMINI.csv', sep = ';')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_ZScot_absa_gemini = pd.merge(absa_final_subset, df, on='index')
# df_oneshot_sb10k_gemini


# Merge with baseline
## Merge data

df_merged_absa_gemini_base_ZSCoT = df_base_absa_gemini[['sentence_x', 'aspect_x', 'sentiment_coded', 'generated_label']].merge(
    df_ZScot_absa_gemini[['sentence_x', 'aspect_x', 'generated_label']],
    on=['sentence_x', 'aspect_x'],
    suffixes=('_base', '_one_shot')
)
df_merged_absa_gemini_base_ZSCoT



Unnamed: 0,sentence_x,aspect_x,sentiment_coded,generated_label_base,generated_label_one_shot
0,I charge it at night and skip taking the cord ...,cord,2,2,2
1,I charge it at night and skip taking the cord ...,battery life,1,1,1
2,The tech guy then said the service center does...,service center,0,0,0
3,The tech guy then said the service center does...,"""sales"" team",0,0,2
4,The tech guy then said the service center does...,tech guy,2,0,0
...,...,...,...,...,...
1030,"Called tech support and got the usual Acer ""We...",software,0,0,0
1031,"Called tech support and got the usual Acer ""We...",software,0,0,0
1032,"Called tech support and got the usual Acer ""We...",software,0,0,0
1033,"Called tech support and got the usual Acer ""We...",software,0,0,0


In [88]:
#### Bootstrap

In [90]:
y_true = df_merged_absa_gemini_base_ZSCoT['sentiment_coded'].values
y_pred_base = df_merged_absa_gemini_base_ZSCoT['generated_label_base'].values
y_pred_one_shot = df_merged_absa_gemini_base_ZSCoT['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [-0.0422, -0.0025]
❌ Base signifikant besser als OneShot (F1).


## Self-Consistency

### GPT

In [98]:
# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/ABSA_SemEval2014_SelfConsistency_GPT.csv')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_SC_absa_gpt = pd.merge(absa_final_subset, df, on='index')
# df_oneshot_absa_gpt



# Merge with true labels
## Merge data

df_merged_absa_gpt_base_SC = df_base_absa_gpt[['sentence_x', 'aspect_x', 'sentiment_coded', 'generated_label']].merge(
    df_SC_absa_gpt[['sentence_x', 'aspect_x', 'generated_label']],
    on=['sentence_x', 'aspect_x'],
    suffixes=('_base', '_one_shot')
)
df_merged_absa_gpt_base_SC





Unnamed: 0,sentence_x,aspect_x,sentiment_coded,generated_label_base,generated_label_one_shot
0,I charge it at night and skip taking the cord ...,cord,2,1,1
1,I charge it at night and skip taking the cord ...,battery life,1,1,1
2,The tech guy then said the service center does...,service center,0,0,0
3,The tech guy then said the service center does...,"""sales"" team",0,2,2
4,The tech guy then said the service center does...,tech guy,2,0,0
...,...,...,...,...,...
1040,"Called tech support and got the usual Acer ""We...",software,0,0,0
1041,"Called tech support and got the usual Acer ""We...",software,0,0,0
1042,"Called tech support and got the usual Acer ""We...",software,0,0,0
1043,"Called tech support and got the usual Acer ""We...",software,0,0,0


In [100]:
y_true = df_merged_absa_gpt_base_SC['sentiment_coded'].values
y_pred_base = df_merged_absa_gpt_base_SC['generated_label_base'].values
y_pred_one_shot = df_merged_absa_gpt_base_SC['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [-0.0254, 0.0066]
➖ Kein signifikanter Unterschied beim F1.


### Gemini

In [104]:

# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/ABSA_SelfConsistency_GEMINI.csv')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_SC_absa_gemini = pd.merge(absa_final_subset, df, on='index')
# df_oneshot_sb10k_gemini


# Merge with baseline
## Merge data

df_merged_absa_gemini_base_SC = df_base_absa_gemini[['sentence_x', 'aspect_x', 'sentiment_coded', 'generated_label']].merge(
    df_SC_absa_gemini[['sentence_x', 'aspect_x', 'generated_label']],
    on=['sentence_x', 'aspect_x'],
    suffixes=('_base', '_one_shot')
)
df_merged_absa_gemini_base_SC



Unnamed: 0,sentence_x,aspect_x,sentiment_coded,generated_label_base,generated_label_one_shot
0,I charge it at night and skip taking the cord ...,cord,2,2,1
1,I charge it at night and skip taking the cord ...,battery life,1,1,1
2,The tech guy then said the service center does...,service center,0,0,0
3,The tech guy then said the service center does...,"""sales"" team",0,0,2
4,The tech guy then said the service center does...,tech guy,2,0,0
...,...,...,...,...,...
1011,"Called tech support and got the usual Acer ""We...",software,0,0,0
1012,"Called tech support and got the usual Acer ""We...",software,0,0,0
1013,"Called tech support and got the usual Acer ""We...",software,0,0,0
1014,"Called tech support and got the usual Acer ""We...",software,0,0,0


In [106]:
y_true = df_merged_absa_gemini_base_SC['sentiment_coded'].values
y_pred_base = df_merged_absa_gemini_base_SC['generated_label_base'].values
y_pred_one_shot = df_merged_absa_gemini_base_SC['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [-0.0039, 0.0373]
➖ Kein signifikanter Unterschied beim F1.


# MAST

In [111]:
irony_subset = irony_final[:1001]
irony_subset

Unnamed: 0,index,Label,text
0,0,1,Sweet United Nations video. Just in time for C...
1,1,1,@mrdahl87 We are rumored to have talked to Erv...
2,2,1,Hey there! Nice to see you Minnesota/ND Winter...
3,3,0,3 episodes left I'm dying over here
4,4,1,I can't breathe! was chosen as the most notabl...
...,...,...,...
996,996,0,It was a greaaaat night! :white_smiling_face:️...
997,997,0,@StephHammy I wonder if they have that in an a...
998,998,0,Queens Of The Stone Age no Rock In Rio http://...
999,999,1,@jordyn_eaton wants me to get hit by a bus so ...


### GPT Base

In [121]:

# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/MAST_SemEval2018_BASELINE_GPT.csv',sep=';')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_base_irony_gpt = pd.merge(irony_subset, df, on='index')
df_base_irony_gpt




Unnamed: 0.1,index,Label,text,Unnamed: 0,sentence,generated_label
0,0,1,Sweet United Nations video. Just in time for C...,0,Sweet United Nations video. Just in time for C...,0
1,2,1,Hey there! Nice to see you Minnesota/ND Winter...,2,Hey there! Nice to see you Minnesota/ND Winter...,1
2,3,0,3 episodes left I'm dying over here,3,3 episodes left I'm dying over here,1
3,4,1,I can't breathe! was chosen as the most notabl...,4,I can't breathe! was chosen as the most notabl...,0
4,5,0,You're never too old for Footie Pajamas. http:...,5,You're never too old for Footie Pajamas. http:...,0
...,...,...,...,...,...,...
980,996,0,It was a greaaaat night! :white_smiling_face:️...,996,It was a greaaaat night! :white_smiling_face:️...,1
981,997,0,@StephHammy I wonder if they have that in an a...,997,@StephHammy I wonder if they have that in an a...,0
982,998,0,Queens Of The Stone Age no Rock In Rio http://...,998,Queens Of The Stone Age no Rock In Rio http://...,0
983,999,1,@jordyn_eaton wants me to get hit by a bus so ...,999,@jordyn_eaton wants me to get hit by a bus so ...,1


## Gemini Base

In [127]:

# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/MAST_BASELINE_GEMINI.csv',sep=',')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_base_irony_gemini = pd.merge(irony_subset, df, on='index')
df_base_irony_gemini




Unnamed: 0.1,index,Label,text,Unnamed: 0,sentence,generated_label
0,0,1,Sweet United Nations video. Just in time for C...,0,Sweet United Nations video. Just in time for C...,1
1,1,1,@mrdahl87 We are rumored to have talked to Erv...,1,@mrdahl87 We are rumored to have talked to Erv...,1
2,2,1,Hey there! Nice to see you Minnesota/ND Winter...,2,Hey there! Nice to see you Minnesota/ND Winter...,1
3,3,0,3 episodes left I'm dying over here,3,3 episodes left I'm dying over here,1
4,4,1,I can't breathe! was chosen as the most notabl...,4,I can't breathe! was chosen as the most notabl...,1
...,...,...,...,...,...,...
995,996,0,It was a greaaaat night! :white_smiling_face:️...,995,It was a greaaaat night! :white_smiling_face:️...,1
996,997,0,@StephHammy I wonder if they have that in an a...,996,@StephHammy I wonder if they have that in an a...,1
997,998,0,Queens Of The Stone Age no Rock In Rio http://...,997,Queens Of The Stone Age no Rock In Rio http://...,1
998,999,1,@jordyn_eaton wants me to get hit by a bus so ...,998,@jordyn_eaton wants me to get hit by a bus so ...,1


## One Shot

### GPT

In [143]:
# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/MAST_SemEval2018_OneShot_GPT4o.csv')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_oneShot_irony_gpt = pd.merge(irony_subset, df, on='index')


# Merge with true labels
## Merge data

df_merged_irony_gpt_base_oneshot = df_base_irony_gpt[['sentence', 'Label', 'generated_label']].merge(
    df_oneShot_irony_gpt[['sentence', 'generated_label']],
    on=['sentence'],
    suffixes=('_base', '_one_shot')
)
df_merged_irony_gpt_base_oneshot




Unnamed: 0,sentence,Label,generated_label_base,generated_label_one_shot
0,Sweet United Nations video. Just in time for C...,1,0,1
1,Hey there! Nice to see you Minnesota/ND Winter...,1,1,1
2,3 episodes left I'm dying over here,0,1,0
3,I can't breathe! was chosen as the most notabl...,1,0,0
4,You're never too old for Footie Pajamas. http:...,0,0,0
...,...,...,...,...
980,It was a greaaaat night! :white_smiling_face:️...,0,1,1
981,@StephHammy I wonder if they have that in an a...,0,0,1
982,Queens Of The Stone Age no Rock In Rio http://...,0,0,0
983,@jordyn_eaton wants me to get hit by a bus so ...,1,1,1


#### Bootstrap

In [146]:
y_true = df_merged_irony_gpt_base_oneshot['Label'].values
y_pred_base = df_merged_irony_gpt_base_oneshot['generated_label_base'].values
y_pred_one_shot = df_merged_irony_gpt_base_oneshot['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [-0.0331, 0.0082]
➖ Kein signifikanter Unterschied beim F1.


### Gemini

In [149]:
# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/MAST_OneShot_GEMINI.csv')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_oneShot_irony_gemini = pd.merge(irony_subset, df, on='index')


# Merge with true labels
## Merge data

df_merged_irony_gemini_base_oneshot = df_base_irony_gemini[['sentence', 'Label', 'generated_label']].merge(
    df_oneShot_irony_gemini[['sentence', 'generated_label']],
    on=['sentence'],
    suffixes=('_base', '_one_shot')
)
df_merged_irony_gemini_base_oneshot




Unnamed: 0,sentence,Label,generated_label_base,generated_label_one_shot
0,Sweet United Nations video. Just in time for C...,1,1,1
1,@mrdahl87 We are rumored to have talked to Erv...,1,1,1
2,Hey there! Nice to see you Minnesota/ND Winter...,1,1,1
3,3 episodes left I'm dying over here,0,1,1
4,I can't breathe! was chosen as the most notabl...,1,1,1
...,...,...,...,...
995,It was a greaaaat night! :white_smiling_face:️...,0,1,1
996,@StephHammy I wonder if they have that in an a...,0,1,0
997,Queens Of The Stone Age no Rock In Rio http://...,0,1,0
998,@jordyn_eaton wants me to get hit by a bus so ...,1,1,1


#### Bootstrap

In [152]:
y_true = df_merged_irony_gemini_base_oneshot['Label'].values
y_pred_base = df_merged_irony_gemini_base_oneshot['generated_label_base'].values
y_pred_one_shot = df_merged_irony_gemini_base_oneshot['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [0.1318, 0.1931]
✅ OneShot signifikant besser als Base (F1).


## FewShot

### GPT

In [157]:
# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/MAST_SemEval2018_FewShot_GPT4o.csv')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_fewShot_irony_gpt = pd.merge(irony_subset, df, on='index')


# Merge with true labels
## Merge data

df_merged_irony_gpt_base_fewshot = df_base_irony_gpt[['sentence', 'Label', 'generated_label']].merge(
    df_fewShot_irony_gpt[['sentence', 'generated_label']],
    on=['sentence'],
    suffixes=('_base', '_one_shot')
)
df_merged_irony_gpt_base_fewshot


Unnamed: 0,sentence,Label,generated_label_base,generated_label_one_shot
0,Sweet United Nations video. Just in time for C...,1,0,1
1,Hey there! Nice to see you Minnesota/ND Winter...,1,1,0
2,3 episodes left I'm dying over here,0,1,0
3,I can't breathe! was chosen as the most notabl...,1,0,0
4,You're never too old for Footie Pajamas. http:...,0,0,0
...,...,...,...,...
980,It was a greaaaat night! :white_smiling_face:️...,0,1,0
981,@StephHammy I wonder if they have that in an a...,0,0,1
982,Queens Of The Stone Age no Rock In Rio http://...,0,0,0
983,@jordyn_eaton wants me to get hit by a bus so ...,1,1,1


In [159]:
y_true = df_merged_irony_gpt_base_fewshot['Label'].values
y_pred_base = df_merged_irony_gpt_base_fewshot['generated_label_base'].values
y_pred_one_shot = df_merged_irony_gpt_base_fewshot['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [-0.0024, 0.0404]
➖ Kein signifikanter Unterschied beim F1.


### Gemini

In [163]:
# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/MAST_FewShot_GEMINI.csv')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_fewShot_irony_gemini = pd.merge(irony_subset, df, on='index')


# Merge with true labels
## Merge data

df_merged_irony_gemini_base_fewshot = df_base_irony_gemini[['sentence', 'Label', 'generated_label']].merge(
    df_fewShot_irony_gemini[['sentence', 'generated_label']],
    on=['sentence'],
    suffixes=('_base', '_one_shot')
)
df_merged_irony_gemini_base_fewshot




Unnamed: 0,sentence,Label,generated_label_base,generated_label_one_shot
0,Sweet United Nations video. Just in time for C...,1,1,1
1,@mrdahl87 We are rumored to have talked to Erv...,1,1,1
2,Hey there! Nice to see you Minnesota/ND Winter...,1,1,1
3,3 episodes left I'm dying over here,0,1,1
4,I can't breathe! was chosen as the most notabl...,1,1,1
...,...,...,...,...
995,It was a greaaaat night! :white_smiling_face:️...,0,1,1
996,@StephHammy I wonder if they have that in an a...,0,1,0
997,Queens Of The Stone Age no Rock In Rio http://...,0,1,0
998,@jordyn_eaton wants me to get hit by a bus so ...,1,1,1


In [165]:
y_true = df_merged_irony_gemini_base_fewshot['Label'].values
y_pred_base = df_merged_irony_gemini_base_fewshot['generated_label_base'].values
y_pred_one_shot = df_merged_irony_gemini_base_fewshot['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [0.1248, 0.1877]
✅ OneShot signifikant besser als Base (F1).


## CoT

### GPT

In [176]:
# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/MAST_COT_GPT.csv',sep=';')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_cot_irony_gpt = pd.merge(irony_subset, df, on='index')


# Merge with true labels
## Merge data

df_merged_irony_gpt_base_CoT = df_base_irony_gpt[['sentence', 'Label', 'generated_label']].merge(
    df_cot_irony_gpt[['sentence', 'generated_label']],
    on=['sentence'],
    suffixes=('_base', '_one_shot')
)
df_merged_irony_gpt_base_CoT


Unnamed: 0,sentence,Label,generated_label_base,generated_label_one_shot
0,Sweet United Nations video. Just in time for C...,1,0,1
1,Hey there! Nice to see you Minnesota/ND Winter...,1,1,1
2,3 episodes left I'm dying over here,0,1,1
3,I can't breathe! was chosen as the most notabl...,1,0,0
4,You're never too old for Footie Pajamas. http:...,0,0,1
...,...,...,...,...
965,It was a greaaaat night! :white_smiling_face:️...,0,1,1
966,@StephHammy I wonder if they have that in an a...,0,0,0
967,Queens Of The Stone Age no Rock In Rio http://...,0,0,0
968,@jordyn_eaton wants me to get hit by a bus so ...,1,1,1


In [178]:
y_true = df_merged_irony_gpt_base_CoT['Label'].values
y_pred_base = df_merged_irony_gpt_base_CoT['generated_label_base'].values
y_pred_one_shot = df_merged_irony_gpt_base_CoT['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [-0.1091, -0.0531]
❌ Base signifikant besser als OneShot (F1).


### Gemini

In [188]:
# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/MAST_CoT_GEMINI_repaired.csv',sep=';')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_fCoT_irony_gemini = pd.merge(irony_subset, df, on='index')


# Merge with true labels
## Merge data

df_merged_CoT_gemini_base_fewshot = df_base_irony_gemini[['sentence', 'Label', 'generated_label']].merge(
    df_fCoT_irony_gemini[['sentence', 'generated_label']],
    on=['sentence'],
    suffixes=('_base', '_one_shot')
)
df_merged_CoT_gemini_base_fewshot




Unnamed: 0,sentence,Label,generated_label_base,generated_label_one_shot
0,Sweet United Nations video. Just in time for C...,1,1,1
1,Hey there! Nice to see you Minnesota/ND Winter...,1,1,1
2,3 episodes left I'm dying over here,0,1,1
3,You're never too old for Footie Pajamas. http:...,0,1,1
4,Nothing makes me happier then getting on the h...,1,1,1
...,...,...,...,...
960,It was a greaaaat night! :white_smiling_face:️...,0,1,1
961,@StephHammy I wonder if they have that in an a...,0,1,0
962,Queens Of The Stone Age no Rock In Rio http://...,0,1,0
963,@jordyn_eaton wants me to get hit by a bus so ...,1,1,1


#### Bootstrap

In [191]:
y_true = df_merged_CoT_gemini_base_fewshot['Label'].values
y_pred_base = df_merged_CoT_gemini_base_fewshot['generated_label_base'].values
y_pred_one_shot = df_merged_CoT_gemini_base_fewshot['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [0.1578, 0.2282]
✅ OneShot signifikant besser als Base (F1).


## ZeroShot CoT

### GPT

In [196]:
# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/MAST_ZeroShot_CoT_GPT.csv')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_ZScot_irony_gpt = pd.merge(irony_subset, df, on='index')


# Merge with true labels
## Merge data

df_merged_irony_gpt_base_ZSCoT = df_base_irony_gpt[['sentence', 'Label', 'generated_label']].merge(
    df_ZScot_irony_gpt[['sentence', 'generated_label']],
    on=['sentence'],
    suffixes=('_base', '_one_shot')
)
df_merged_irony_gpt_base_ZSCoT


Unnamed: 0,sentence,Label,generated_label_base,generated_label_one_shot
0,Sweet United Nations video. Just in time for C...,1,0,1
1,Hey there! Nice to see you Minnesota/ND Winter...,1,1,1
2,3 episodes left I'm dying over here,0,1,1
3,I can't breathe! was chosen as the most notabl...,1,0,0
4,You're never too old for Footie Pajamas. http:...,0,0,1
...,...,...,...,...
980,It was a greaaaat night! :white_smiling_face:️...,0,1,1
981,@StephHammy I wonder if they have that in an a...,0,0,1
982,Queens Of The Stone Age no Rock In Rio http://...,0,0,0
983,@jordyn_eaton wants me to get hit by a bus so ...,1,1,1


#### Bootstrap

In [199]:
y_true = df_merged_irony_gpt_base_ZSCoT['Label'].values
y_pred_base = df_merged_irony_gpt_base_ZSCoT['generated_label_base'].values
y_pred_one_shot = df_merged_irony_gpt_base_ZSCoT['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [-0.1504, -0.0868]
❌ Base signifikant besser als OneShot (F1).


## Gemini 

In [207]:


# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/MAST_ZeroShot_CoT_GEMINI_repaired.csv',sep=';')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_ZSCoT_irony_gemini = pd.merge(irony_subset, df, on='index')


# Merge with true labels
## Merge data

df_merged_ZSCoT_gemini_base_fewshot = df_base_irony_gemini[['sentence', 'Label', 'generated_label']].merge(
    df_ZSCoT_irony_gemini[['sentence', 'generated_label']],
    on=['sentence'],
    suffixes=('_base', '_one_shot')
)
df_merged_ZSCoT_gemini_base_fewshot






Unnamed: 0,sentence,Label,generated_label_base,generated_label_one_shot
0,Sweet United Nations video. Just in time for C...,1,1,1
1,Hey there! Nice to see you Minnesota/ND Winter...,1,1,1
2,3 episodes left I'm dying over here,0,1,1
3,I can't breathe! was chosen as the most notabl...,1,1,1
4,You're never too old for Footie Pajamas. http:...,0,1,1
...,...,...,...,...
984,It was a greaaaat night! :white_smiling_face:️...,0,1,1
985,@StephHammy I wonder if they have that in an a...,0,1,1
986,Queens Of The Stone Age no Rock In Rio http://...,0,1,1
987,@jordyn_eaton wants me to get hit by a bus so ...,1,1,1


In [209]:
#### Bootstrap

In [211]:
y_true = df_merged_ZSCoT_gemini_base_fewshot['Label'].values
y_pred_base = df_merged_ZSCoT_gemini_base_fewshot['generated_label_base'].values
y_pred_one_shot = df_merged_ZSCoT_gemini_base_fewshot['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [0.0155, 0.0599]
✅ OneShot signifikant besser als Base (F1).


## SelfCon

### GPT

In [216]:
# sst2 gpt
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/MAST_SelfConsistency_GPT.csv',sep=';')


df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_SC_irony_gpt = pd.merge(irony_subset, df, on='index')


# Merge with true labels
## Merge data

df_merged_irony_gpt_base_SC = df_base_irony_gpt[['sentence', 'Label', 'generated_label']].merge(
    df_SC_irony_gpt[['sentence', 'generated_label']],
    on=['sentence'],
    suffixes=('_base', '_one_shot')
)
df_merged_irony_gpt_base_SC


Unnamed: 0,sentence,Label,generated_label_base,generated_label_one_shot
0,Sweet United Nations video. Just in time for C...,1,0,1
1,Hey there! Nice to see you Minnesota/ND Winter...,1,1,1
2,3 episodes left I'm dying over here,0,1,1
3,I can't breathe! was chosen as the most notabl...,1,0,0
4,You're never too old for Footie Pajamas. http:...,0,0,1
...,...,...,...,...
802,What an eventful weekend,1,0,0
803,@MLS @ussoccer Awesome destination site. High ...,1,1,1
804,"Kyle it won't let me @ you? But yeah, we are g...",0,0,1
805,"@Parlett316 that's where you are wrong, I can ...",0,0,0


In [218]:
#### Bootstrap

In [220]:
y_true = df_merged_irony_gpt_base_SC['Label'].values
y_pred_base = df_merged_irony_gpt_base_SC['generated_label_base'].values
y_pred_one_shot = df_merged_irony_gpt_base_SC['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [-0.2014, -0.1265]
❌ Base signifikant besser als OneShot (F1).


## Gemini

In [233]:


# sst2 gemini
df = pd.read_csv('/Users/marvinschmitt/Library/CloudStorage/OneDrive-Persönlich/M.Sc. Data Science/17 Masterarbeit/Repo/Prod/CSVs/MAST_SelfConsistency_GEMINI.csv')

df = df[df['generated_label'].isin(['0', '1',0,1,'2',2])]

df['generated_label'] = df['generated_label'].astype(int)

# Beide DataFrames anhand der 'index'-Spalte mergen
df_SC_irony_gemini = pd.merge(irony_subset, df, on='index')


# Merge with true labels
## Merge data

df_merged_gemini_base_SC = df_base_irony_gemini[['sentence', 'Label', 'generated_label']].merge(
    df_SC_irony_gemini[['sentence', 'generated_label']],
    on=['sentence'],
    suffixes=('_base', '_one_shot')
)
df_merged_gemini_base_SC






Unnamed: 0,sentence,Label,generated_label_base,generated_label_one_shot
0,Sweet United Nations video. Just in time for C...,1,1,1
1,@mrdahl87 We are rumored to have talked to Erv...,1,1,1
2,Hey there! Nice to see you Minnesota/ND Winter...,1,1,1
3,3 episodes left I'm dying over here,0,1,1
4,I can't breathe! was chosen as the most notabl...,1,1,1
...,...,...,...,...
991,It was a greaaaat night! :white_smiling_face:️...,0,1,1
992,@StephHammy I wonder if they have that in an a...,0,1,1
993,Queens Of The Stone Age no Rock In Rio http://...,0,1,1
994,@jordyn_eaton wants me to get hit by a bus so ...,1,1,1


In [237]:
y_true = df_merged_gemini_base_SC['Label'].values
y_pred_base = df_merged_gemini_base_SC['generated_label_base'].values
y_pred_one_shot = df_merged_gemini_base_SC['generated_label_one_shot'].values


calculateBootstrapMulti(y_true, y_pred_base, y_pred_one_shot)

95%-Konfidenzintervall der F1-Differenz (OneShot - Base): [0.0317, 0.0799]
✅ OneShot signifikant besser als Base (F1).
