# Purpose

Tidy the OP model results for further analysis.

OP model details (10 runs): 
cleanup: 20, hidden: 100, learning rate: .001, pnoise: 0

Results were pushed to the Google BigQuery (GBQ) table `slow_op_10.train`.



In [None]:
import pandas as pd
import numpy as np

## Pull data from BQ

In [None]:
import os

from google.cloud import bigquery

# Point the Google client libraries at the service-account key file.
# NOTE(review): this is an absolute, machine-specific path; it must exist on
# the machine running the notebook.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/home/jupyter/tf/secret/majestic-camp-303620-e8cb3a12037b.json"

# Module-level BigQuery client; load_raw_data() below runs its query through it.
client = bigquery.Client(location="US", project="majestic-camp-303620")

def load_raw_data():
    """Fetch the aggregated training results from BigQuery as a DataFrame.

    Runs the query below through the module-level ``client`` and returns the
    result of ``to_dataframe()``. Only rows with ``unit_time=4.0`` are used;
    ``wf``, ``acc`` and ``sse`` are averaged within each
    (epoch, sample, word) group of ``slow_op_10.train``.
    """
    sql = """
    SELECT 
        epoch,
        sample,
        word,
        AVG(wf) AS wf ,  
        AVG(acc) AS acc, 
        AVG(sse) AS sse, 
    FROM 
        slow_op_10.train
    WHERE 
        unit_time=4.0
    GROUP BY
        epoch,
        sample,
        word;
    """
    return client.query(sql).to_dataframe()

# df = load_raw_data()

## Tidying

In [None]:
# Load the saved results (presumably the cached output of load_raw_data() --
# TODO confirm) and give `wf` a better name so it cannot be confused with the
# WSJ frequency merged in below.
df = pd.read_csv("op10_ave_results.csv").rename(columns={'wf': 'wf_dynamic'})

# Calculate word length (number of characters in `word`)
df['wlen'] = df.word.str.len()

# Get OP measure (unconditional surprisal).
# rename() is chained instead of using inplace=True on a column-sliced frame.
op = (
    pd.read_csv('noam/supplementary_material.csv')[['word', 'uncond.surprisal']]
    .rename(columns={'uncond.surprisal': 'op'})
)
# Left join keeps every row of df; words without an OP value get NaN.
df = df.merge(op, how='left', on='word')

# Merge with training set to get WSJ frequency and IMG
df_train = (
    pd.read_csv("../../dataset/df_train.csv")[['word', 'wf', 'img']]
    .rename(columns={'wf': 'wf_wsj'})
)
df = df.merge(df_train, how='left', on='word')

# df['zipf_wsj'] = np.log10((df.wf_wsj/1000) + 1)  # wrong scale
# log10(x + 1) so that a frequency of 0 maps to 0 instead of -inf.
df['log_wf_wsj'] = np.log10(df.wf_wsj + 1)
df['log_wf_dynamic'] = np.log10(df.wf_dynamic + 1)

- When converting the WSJ frequency to the Zipf scale, the resulting range is 0-3.4, which is off from the usual Zipf range of 0-7; perhaps the raw WSJ frequency is not on a words-per-million (wpm) scale.
- To get the Zipf scale, I used the [wordfreq](https://github.com/LuminosoInsight/wordfreq/) library, which is based on [exquisite-corpus](https://github.com/LuminosoInsight/exquisite-corpus), an aggregation of corpora from Wikipedia, SUBTLEX, News, Books, Web, Twitter, Reddit, and MISC content.

## Get Zipf

In [None]:
import wordfreq

def get_zipf(x):
    """Look up the English Zipf-scale frequency of ``x`` via wordfreq.

    ``x`` is converted with ``str()`` before the lookup, so non-string values
    are looked up by their string form (e.g. a float NaN becomes ``'nan'``).
    ``minimum=0`` is forwarded to ``wordfreq.zipf_frequency``.
    """
    token = str(x)
    return wordfreq.zipf_frequency(token, lang='en', minimum=0)

def get_wf(x):
    """Look up the English word frequency of ``x`` via wordfreq.

    ``x`` is converted with ``str()`` before the lookup, so non-string values
    are looked up by their string form (e.g. a float NaN becomes ``'nan'``).
    ``minimum=0`` is forwarded to ``wordfreq.word_frequency``.
    """
    token = str(x)
    return wordfreq.word_frequency(token, lang='en', minimum=0)

# Add the wordfreq-based measures for every row's word:
# the Zipf-scale value and the plain word frequency.
df['zipf_exq'] = df['word'].map(get_zipf)
df['wf_exq'] = df['word'].map(get_wf)



In [None]:
# Keep only the analysis columns, in their final order, then save to CSV.
# to_csv is called with its defaults, so the DataFrame index is written too.
df = df.loc[:, [
    'epoch', 'sample', 'word', 'wlen',
    'wf_wsj', 'wf_dynamic', 'log_wf_wsj', 'log_wf_dynamic',
    'wf_exq', 'zipf_exq',
    'op', 'img', 'acc', 'sse',
]]
df.to_csv("parsed_df_210514.csv")