In [19]:
import sys
sys.path.append('..')
from run_fft import FFTProcessor
import numpy as np
import pandas as pd
import os

In [20]:
class SpectrumData():
    def __init__(self, filename):
        self.filename = filename
        self.spectrum_df = self.read_df()
    
    def read_df(self):
        df = pd.read_csv(self.filename)
        return df
    
    def get_dict(self):
        result = {}
        unique_sids = self.spectrum_df['sid'].unique()
        for sid in unique_sids:
            sid_df = self.spectrum_df[self.spectrum_df['sid'] == sid]
            result[sid] = {
                'freq': sid_df['freq'].values,
                'power': sid_df['power'].values
            }
        return result

In [21]:
def classify_pair(x_human: dict, x_model: dict, k_freq: int = 10, eps = 0.0, higher = 'model'):
    """
    0 for human, 1 for model
    """
    assert x_human.keys() == x_model.keys()
    correct = 0
    for sid in x_human.keys():
        pow_human = x_human[sid]['power']
        pow_model = x_model[sid]['power']
        # If higher_spectrum == 'model'
        # Hypothesis: pow_samp > pow_orig for k_freq freqs, i.e., Human > Model
        if higher == 'model':
            if np.sum(pow_model[:k_freq]) - np.sum(pow_human[:k_freq]) > eps:
                correct += 1
        else:
            if np.sum(pow_model[:k_freq]) - np.sum(pow_human[:k_freq]) < eps:
                correct += 1
    return correct / len(x_human)

def select_k(human: dict, model: dict, higher: str):
    best_k, best_acc = None, 0.0
    for k in range(1, 500):
        acc = classify_pair(human, model, k_freq=k, higher=higher)
        if acc > best_acc:
            best_acc = acc
            best_k = k
    return best_k, best_acc

In [22]:
datasets = ['pubmed', 'writing', 'xsum']
models = ['gpt-4', 'gpt-3.5', 'gpt-3']
labels = ['original', 'sampled']
generated_models = ['bigram', 'gpt2xl', 'mistral']
type = 'dwt'

In [23]:
def classify_pair_with_path(original, sampled, higher='model'):
    spec_orig = SpectrumData(original)
    x_human = spec_orig.get_dict()
    spec_samp = SpectrumData(sampled)
    x_model = spec_samp.get_dict()
    best_k, best_accuracy = select_k(x_human, x_model, higher=higher)
    return best_k, best_accuracy

for dataset in datasets:
    print(f"{dataset}:")
    for model in models:
        print(f"    {model}:")
        for generated_model in generated_models:
            print(f"        {generated_model}:")
            original = f"../data/{dataset}/{dataset}_{model}.original.{generated_model}.nllzs.wavelet_{type}.txt"
            sampled = f"../data/{dataset}/{dataset}_{model}.sampled.{generated_model}.nllzs.wavelet_{type}.txt"
            best_k, best_accuracy = classify_pair_with_path(original, sampled)
            print(f"            {best_k}: {best_accuracy:.4f}")

pubmed:
    gpt-4:
        bigram:
            9: 0.6133
        gpt2xl:
            9: 0.9467
        mistral:
            10: 0.9467
    gpt-3.5:
        bigram:
            6: 0.7467
        gpt2xl:
            8: 0.6133
        mistral:
            1: 0.9667
    gpt-3:
        bigram:
            15: 0.6067
        gpt2xl:
            158: 0.7333
        mistral:
            42: 0.7400
writing:
    gpt-4:
        bigram:
            215: 0.6133
        gpt2xl:
            258: 0.5800
        mistral:
            6: 0.9533
    gpt-3.5:
        bigram:
            220: 0.7267
        gpt2xl:
            255: 0.5867
        mistral:
            1: 0.9867
    gpt-3:
        bigram:
            220: 0.6867
        gpt2xl:
            244: 0.6533
        mistral:
            12: 0.8867
xsum:
    gpt-4:
        bigram:
            217: 0.7533
        gpt2xl:
            222: 0.4800
        mistral:
            8: 0.7733
    gpt-3.5:
        bigram:
            230: 0.7867
        gpt2xl:


In [24]:
genre = 'pubmed'
est_name = 'mistral'

spec_orig = SpectrumData(f'../data/gpt-4/pubmed_gpt-4.original.{est_name}.nllzs.fftnorm.txt')
x_human = spec_orig.get_dict()
# print(x_orig[0])

spec_samp = SpectrumData(f'../data/gpt-4/pubmed_gpt-4.sampled.{est_name}.nllzs.fftnorm.txt')
x_model = spec_samp.get_dict()

acc = classify_pair(x_human, x_model, k_freq=3, eps=0.0)
print(acc)

best_k, best_acc = select_k(x_human, x_model, higher='model')
print(f'{genre}, {est_name}, best_k={best_k}, best_acc={best_acc}')

# pubmed, mistral, k=10, 0.867
# pubmed, mistral, k=3, 0.90
# pubmed, mistral, k=5, 0.887

FileNotFoundError: [Errno 2] No such file or directory: '../data/gpt-4/pubmed_gpt-4.original.mistral.nllzs.fftnorm.txt'

In [10]:
# Eval loop for GPT-4
print('GPT-4:')

for genre in ['pubmed', 'writing', 'xsum']:
    for est_name in ['mistral', 'llama', 'gpt2xl', 'gpt2lg', 'gpt2md', 'gpt2']:
        orig_filename = f'../data/gpt-4/{genre}_gpt-4.original.{est_name}.nllzs.fftnorm.txt'
        samp_filename = f'../data/gpt-4/{genre}_gpt-4.sampled.{est_name}.nllzs.fftnorm.txt'
        if not os.path.exists(orig_filename) or not os.path.exists(samp_filename):
            continue
        spec_orig = SpectrumData(orig_filename)
        x_human = spec_orig.get_dict()
        spec_samp = SpectrumData(samp_filename)
        x_model = spec_samp.get_dict()

        best_k_1, best_acc_1 = select_k(x_human, x_model, higher='human')
        best_k_2, best_acc_2 = select_k(x_human, x_model, higher='model')
        if best_acc_1 > best_acc_2:
            best_k = best_k_1
            best_acc = best_acc_1
            higher = 'human'
        else:
            best_k = best_k_2
            best_acc = best_acc_2
            higher = 'model'
        print(f'{genre}, {est_name}, best_k={best_k}, best_acc={best_acc:.4f}, higher={higher}')

GPT-4:
pubmed, mistral, best_k=3, best_acc=0.9000, higher=model
pubmed, gpt2xl, best_k=3, best_acc=0.9133, higher=model
writing, mistral, best_k=4, best_acc=0.7667, higher=model
writing, gpt2xl, best_k=23, best_acc=0.8467, higher=human
xsum, mistral, best_k=48, best_acc=0.6533, higher=human
xsum, gpt2xl, best_k=29, best_acc=0.8733, higher=human


In [84]:
# Eval loop for GPT-3.5
print('GPT-3.5:')

for genre in ['pubmed', 'writing', 'xsum']:
    for est_name in ['mistral', 'llama', 'gpt2xl', 'gpt2lg', 'gpt2md', 'gpt2']:
        orig_filename = f'../data/gpt-3.5/{genre}_gpt-3.5-turbo.original.{est_name}.nllzs.fftnorm.txt'
        samp_filename = f'../data/gpt-3.5/{genre}_gpt-3.5-turbo.sampled.{est_name}.nllzs.fftnorm.txt'
        if not os.path.exists(orig_filename) or not os.path.exists(samp_filename):
            continue
        spec_orig = SpectrumData(orig_filename)
        x_human = spec_orig.get_dict()
        spec_samp = SpectrumData(samp_filename)
        x_model = spec_samp.get_dict()

        best_k_1, best_acc_1 = select_k(x_human, x_model, higher='human')
        best_k_2, best_acc_2 = select_k(x_human, x_model, higher='model')
        if best_acc_1 > best_acc_2:
            best_k = best_k_1
            best_acc = best_acc_1
            higher = 'human'
        else:
            best_k = best_k_2
            best_acc = best_acc_2
            higher = 'model'
        print(f'{genre}, {est_name}, best_k={best_k}, best_acc={best_acc:.4f}, higher={higher}')

GPT-3.5:
pubmed, mistral, best_k=2, best_acc=0.9467, higher=model
pubmed, gpt2xl, best_k=10, best_acc=0.6200, higher=model
writing, mistral, best_k=3, best_acc=0.9200, higher=model
writing, gpt2xl, best_k=30, best_acc=0.8533, higher=human
xsum, mistral, best_k=4, best_acc=0.9067, higher=model
xsum, gpt2xl, best_k=24, best_acc=0.9200, higher=human


In [87]:
# Eval loop for Davinci
print('GPT-3:')

for genre in ['pubmed', 'writing', 'xsum']:
    for est_name in ['mistral', 'llama', 'gpt2xl', 'gpt2lg', 'gpt2md', 'gpt2']:
        orig_filename = f'../data/davinci/{genre}_davinci.original.{est_name}.nllzs.fftnorm.txt'
        samp_filename = f'../data/davinci/{genre}_davinci.sampled.{est_name}.nllzs.fftnorm.txt'
        if not os.path.exists(orig_filename) or not os.path.exists(samp_filename):
            continue
        spec_orig = SpectrumData(orig_filename)
        x_human = spec_orig.get_dict()
        spec_samp = SpectrumData(samp_filename)
        x_model = spec_samp.get_dict()

        best_k_1, best_acc_1 = select_k(x_human, x_model, higher='human')
        best_k_2, best_acc_2 = select_k(x_human, x_model, higher='model')
        if best_acc_1 > best_acc_2:
            best_k = best_k_1
            best_acc = best_acc_1
            higher = 'human'
        else:
            best_k = best_k_2
            best_acc = best_acc_2
            higher = 'model'
        print(f'{genre}, {est_name}, best_k={best_k}, best_acc={best_acc:.4f}, higher={higher}')

GPT-3:
pubmed, mistral, best_k=5, best_acc=0.6867, higher=model
pubmed, gpt2xl, best_k=10, best_acc=0.6600, higher=model
writing, mistral, best_k=10, best_acc=0.7200, higher=model
writing, gpt2xl, best_k=40, best_acc=0.6000, higher=model
xsum, mistral, best_k=2, best_acc=0.5867, higher=model
xsum, gpt2xl, best_k=3, best_acc=0.6067, higher=model


### Evaluate classification on bigram as estimator

In [12]:
print('bigram GPT-4:')
for genre in ['pubmed', 'writing', 'xsum']:
    for est_name in ['bigram']:
        orig_filename = f'../data/gpt-4/bigram/fftnorm/{genre}_gpt-4.original.{est_name}.fftnorm.txt'
        samp_filename = f'../data/gpt-4/bigram/fftnorm/{genre}_gpt-4.sampled.{est_name}.fftnorm.txt'
        if not os.path.exists(orig_filename) or not os.path.exists(samp_filename):
            continue
        spec_orig = SpectrumData(orig_filename)
        x_human = spec_orig.get_dict()
        spec_samp = SpectrumData(samp_filename)
        x_model = spec_samp.get_dict()

        best_k_1, best_acc_1 = select_k(x_human, x_model, higher='human')
        best_k_2, best_acc_2 = select_k(x_human, x_model, higher='model')
        if best_acc_1 > best_acc_2:
            best_k = best_k_1
            best_acc = best_acc_1
            higher = 'human'
        else:
            best_k = best_k_2
            best_acc = best_acc_2
            higher = 'model'
        print(f'{genre}, {est_name}, best_k={best_k}, best_acc={best_acc:.4f}, higher={higher}')

bigram GPT-4:
pubmed, bigram, best_k=12, best_acc=0.6533, higher=human
writing, bigram, best_k=28, best_acc=0.8800, higher=human
xsum, bigram, best_k=34, best_acc=0.7667, higher=human


In [13]:
print('bigram GPT-3.5:')
for genre in ['pubmed', 'writing', 'xsum']:
    for est_name in ['bigram']:
        orig_filename = f'../data/gpt-3.5/bigram/fftnorm/{genre}_gpt-3.5-turbo.original.{est_name}.fftnorm.txt'
        samp_filename = f'../data/gpt-3.5/bigram/fftnorm/{genre}_gpt-3.5-turbo.sampled.{est_name}.fftnorm.txt'
        if not os.path.exists(orig_filename) or not os.path.exists(samp_filename):
            continue
        spec_orig = SpectrumData(orig_filename)
        x_human = spec_orig.get_dict()
        spec_samp = SpectrumData(samp_filename)
        x_model = spec_samp.get_dict()

        best_k_1, best_acc_1 = select_k(x_human, x_model, higher='human')
        best_k_2, best_acc_2 = select_k(x_human, x_model, higher='model')
        if best_acc_1 > best_acc_2:
            best_k = best_k_1
            best_acc = best_acc_1
            higher = 'human'
        else:
            best_k = best_k_2
            best_acc = best_acc_2
            higher = 'model'
        print(f'{genre}, {est_name}, best_k={best_k}, best_acc={best_acc:.4f}, higher={higher}')

bigram GPT-3.5:
pubmed, bigram, best_k=3, best_acc=0.6267, higher=model
writing, bigram, best_k=30, best_acc=0.9067, higher=human
xsum, bigram, best_k=44, best_acc=0.7800, higher=human


In [14]:
print('bigram GPT-3:')
for genre in ['pubmed', 'writing', 'xsum']:
    for est_name in ['bigram']:
        orig_filename = f'../data/davinci/bigram/fftnorm/{genre}_davinci.original.{est_name}.fftnorm.txt'
        samp_filename = f'../data/davinci/bigram/fftnorm/{genre}_davinci.sampled.{est_name}.fftnorm.txt'
        if not os.path.exists(orig_filename) or not os.path.exists(samp_filename):
            continue
        spec_orig = SpectrumData(orig_filename)
        x_human = spec_orig.get_dict()
        spec_samp = SpectrumData(samp_filename)
        x_model = spec_samp.get_dict()

        best_k_1, best_acc_1 = select_k(x_human, x_model, higher='human')
        best_k_2, best_acc_2 = select_k(x_human, x_model, higher='model')
        if best_acc_1 > best_acc_2:
            best_k = best_k_1
            best_acc = best_acc_1
            higher = 'human'
        else:
            best_k = best_k_2
            best_acc = best_acc_2
            higher = 'model'
        print(f'{genre}, {est_name}, best_k={best_k}, best_acc={best_acc:.4f}, higher={higher}')

bigram GPT-3:
pubmed, bigram, best_k=8, best_acc=0.6733, higher=model
writing, bigram, best_k=8, best_acc=0.5733, higher=human
xsum, bigram, best_k=26, best_acc=0.6400, higher=model


### Classification on chop = 50, 100, 150 data

In [15]:
print('Chop length:')

for genre in ['writing', 'xsum']:
    for chop_k in [50, 100, 150]:
        est_name = 'gpt2xl'
        orig_filename = f'../data/short/{genre}_gpt-4.original.{est_name}.chop{chop_k}.nllzs.fftnorm.txt'
        samp_filename = f'../data/short/{genre}_gpt-4.sampled.{est_name}.chop{chop_k}.nllzs.fftnorm.txt'
        if not os.path.exists(orig_filename) or not os.path.exists(samp_filename):
            continue
        spec_orig = SpectrumData(orig_filename)
        x_human = spec_orig.get_dict()
        spec_samp = SpectrumData(samp_filename)
        x_model = spec_samp.get_dict()

        best_k_1, best_acc_1 = select_k(x_human, x_model, higher='human')
        best_k_2, best_acc_2 = select_k(x_human, x_model, higher='model')
        if best_acc_1 > best_acc_2:
            best_k = best_k_1
            best_acc = best_acc_1
            higher = 'human'
        else:
            best_k = best_k_2
            best_acc = best_acc_2
            higher = 'model'
        print(f'{genre}, chop={chop_k}, best_k={best_k}, best_acc={best_acc:.4f}, higher={higher}')

Chop length:
writing, chop=50, best_k=8, best_acc=0.6800, higher=human
writing, chop=100, best_k=13, best_acc=0.8200, higher=human
writing, chop=150, best_k=19, best_acc=0.8933, higher=human
xsum, chop=50, best_k=5, best_acc=0.6533, higher=human
xsum, chop=100, best_k=16, best_acc=0.7533, higher=human
xsum, chop=150, best_k=21, best_acc=0.7867, higher=human


### Exploring classifier with two thresholds, `k_low`, `k_high`

In [None]:
def classify_pair(x_human: dict, x_model: dict, k_freq: int = 10, eps = 0.0, higher = 'model'):
    """
    0 for human, 1 for model
    """
    assert x_human.keys() == x_model.keys()
    correct = 0
    for sid in x_human.keys():
        pow_human = x_human[sid]['power']
        pow_model = x_model[sid]['power']
        # If higher_spectrum == 'model'
        # Hypothesis: pow_samp > pow_orig for k_freq freqs, i.e., Human > Model
        if higher == 'model':
            if np.sum(pow_model[:k_freq]) - np.sum(pow_human[:k_freq]) > eps:
                correct += 1
        else:
            if np.sum(pow_model[:k_freq]) - np.sum(pow_human[:k_freq]) < eps:
                correct += 1
    return correct / len(x_human)

def select_k(human: dict, model: dict, higher: str):
    best_k, best_acc = None, 0.0
    for k in range(1, 51):
        acc = classify_pair(human, model, k_freq=k, higher=higher)
        if acc > best_acc:
            best_acc = acc
            best_k = k
    return best_k, best_acc