# First Partial Interpretation (1920s - 70s)

Nothing here is final, and we don't yet have the 1980s or 90s at all.

But we can start to see the basic shape of some results.


In [15]:
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import math
import numpy as np

In [14]:
# Assemble the per-document table that the rest of the notebook analyzes.

# One pair of files per decade (1920s through 1970s): a 'precocity_cosine_*' file
# for the embedding measures and a 'precocity_kld_*' file for the topic-model measures.
embedding_dfs = []
topicmodel_dfs = []

for decade in (20, 30, 40, 50, 60, 70):
    embedding_dfs.append(pd.read_csv(f'precocity_cosine_19{decade}s_docs.tsv', sep='\t'))
    topicmodel_dfs.append(pd.read_csv(f'precocity_kld_19{decade}s_docs.tsv', sep='\t'))

# Both tables are keyed on the same five columns so rows can be matched one-to-one.
multi_index_columns = ['docid', 'fraction_compared', 'filtered', 'time_radius', 'chunks_used']

topicmodels = pd.concat(topicmodel_dfs, axis=0).set_index(multi_index_columns)

# The embedding measures get an 'embed_' prefix so they don't collide with the
# identically named topic-model columns after the join.
embeddings = (
    pd.concat(embedding_dfs, axis=0)
    .set_index(multi_index_columns)
    .rename(columns={'novelty': 'embed_novelty',
                     'transience': 'embed_transience',
                     'precocity': 'embed_precocity'})
)

print('Initial stage: ', embeddings.shape, topicmodels.shape)

# Only the three renamed measures are carried over from 'embeddings'.
columns_to_add = ['embed_novelty', 'embed_transience', 'embed_precocity']
data = topicmodels.join(embeddings[columns_to_add], how='inner')

print('Intermediate stage: ', data.shape)

# Turn every index level except 'docid' back into an ordinary column,
# leaving 'docid' as the index for the metadata join below.
data = data.reset_index(level=multi_index_columns[1:])

# Metadata rows without a paperId cannot be matched to a document, so drop them
# before indexing on paperId.
meta = pd.read_csv('../metadata/litstudies/LitMetadataWithS2.tsv', sep='\t')
meta = meta[meta['paperId'].notna()].set_index('paperId')

data = data.join(meta['new_cite_count'], how='inner')
data['logcitations'] = np.log(data['new_cite_count'] + 1)  # +1 keeps zero-citation rows finite

print('Final stage should be the same:', data.shape)
data.head()


Initial stage:  (303984, 5) (303984, 5)
Intermediate stage:  (303984, 8)
Final stage should be the same: (303984, 14)


Unnamed: 0,fraction_compared,filtered,time_radius,chunks_used,date,num_chunks,precocity,novelty,transience,embed_novelty,embed_transience,embed_precocity,new_cite_count,logcitations
000041afdc91612fa3c16a31e6381b1dfcf5b69b,1.0,True,10,1.0,1959,11,-0.093936,5.733958,5.827894,0.210611,0.210563,4.8e-05,1,0.693147
000041afdc91612fa3c16a31e6381b1dfcf5b69b,1.0,True,10,0.25,1959,11,-0.077704,5.808551,5.886255,0.209294,0.208335,0.000958,1,0.693147
000041afdc91612fa3c16a31e6381b1dfcf5b69b,1.0,True,20,1.0,1959,11,-0.147078,5.724905,5.871983,0.211601,0.210379,0.001222,1,0.693147
000041afdc91612fa3c16a31e6381b1dfcf5b69b,1.0,True,20,0.25,1959,11,-0.117383,5.595826,5.713209,0.210309,0.207506,0.002803,1,0.693147
000041afdc91612fa3c16a31e6381b1dfcf5b69b,1.0,False,10,1.0,1959,11,-0.093936,5.733958,5.827894,0.210611,0.210563,4.8e-05,1,0.693147


In [26]:
# Pearson r between log citation counts and each precocity measure, restricted to
# rows with filtered == True and time_radius == 20. One rounded value per
# (fraction_compared, chunks_used) pair is appended, with the fraction varying slowest:
# (1.0, 1.0), (1.0, 0.25), (0.05, 1.0), (0.05, 0.25).
correlation_topics = []
correlation_embeds = []

fractions_available = [1.0, 0.05]
chunks_available = [1.0, 0.25]

# These two conditions are the same for every combination, so build them once.
fixed_conditions = data['filtered'].eq(True) & data['time_radius'].eq(20)

for frac in fractions_available:
    for chunknum in chunks_available:
        in_cell = (
            fixed_conditions
            & data['fraction_compared'].eq(frac)
            & data['chunks_used'].eq(chunknum)
        )
        df = data[in_cell]
        # pearsonr returns (statistic, p-value); only the statistic is kept.
        r_topics = pearsonr(df['logcitations'], df['precocity'])[0]
        r_embeds = pearsonr(df['logcitations'], df['embed_precocity'])[0]
        correlation_topics.append(round(r_topics, 3))
        correlation_embeds.append(round(r_embeds, 3))


In [27]:
# guide code borrowed from GPT-4!

# Arrange the correlations as one 2x2 table per method. In each list the first two
# entries fill the 'Compare to all' column and the last two fill 'Most similar 5%';
# within a column the rows are 'Average all' then 'Average top 25%'.
corr_row_labels = ['Average all', 'Average top 25%']

df_t = pd.DataFrame(
    {
        'Compare to all': correlation_topics[:2],
        'Most similar 5%': correlation_topics[2:],
    },
    index=corr_row_labels,
)

df_e = pd.DataFrame(
    {
        'Compare to all': correlation_embeds[:2],
        'Most similar 5%': correlation_embeds[2:],
    },
    index=corr_row_labels,
)

# Put the two tables side by side under a two-level column header (method, comparison set).
combined_df = pd.concat(
    [df_t, df_e],
    axis=1,
    keys=['Topic model', 'GTE Embeds'],
)

combined_df

Unnamed: 0_level_0,Topic model,Topic model,GTE Embeds,GTE Embeds
Unnamed: 0_level_1,Compare to all,Most similar 5%,Compare to all,Most similar 5%
Average all,0.176,0.107,0.139,0.105
Average top 25%,0.199,0.109,0.151,0.116


In [2]:
# Template example: one 2x2 table per state of Var3 (rows 'A'/'B', columns 'C'/'D'),
# shown side by side under a two-level column header.
# NOTE(review): this cell rebinds `df_e` and `combined_df`; any earlier values held
# under those names in the kernel are overwritten when it runs.

example_rows = ['A', 'B']

# Hard-coded accuracy figures, listed column by column:
# the two 'C' values first, then the two 'D' values.
accuracy_stats_e = [0.95, 0.89, 0.96, 0.88]  # Var3 is 'E'
accuracy_stats_f = [0.92, 0.87, 0.91, 0.85]  # Var3 is 'F'

df_e = pd.DataFrame(
    {'C': accuracy_stats_e[:2], 'D': accuracy_stats_e[2:]},
    index=example_rows,
)

df_f = pd.DataFrame(
    {'C': accuracy_stats_f[:2], 'D': accuracy_stats_f[2:]},
    index=example_rows,
)

# Outer column level names the Var3 state; inner level keeps 'C' / 'D'.
combined_df = pd.concat(
    [df_e, df_f],
    axis=1,
    keys=['Var3=E', 'Var3=F'],
)

combined_df


Unnamed: 0_level_0,Var3=E,Var3=E,Var3=F,Var3=F
Unnamed: 0_level_1,C,D,C,D
A,0.95,0.96,0.92,0.91
B,0.89,0.88,0.87,0.85
