In [None]:
from typing import Dict, Tuple
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joypy
import sys

from develop.utils.paths import DATA_ALT, NOTEBOOKS_ANALYSIS

pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', None)

In [3]:
cols_interest = ['year', 'ref_year' 'index', 'count', 'count_t-1','entropy', 'cosine_similarity', 'cross_entropy', 'kl_divergence']

fixed = "df_fixed_1920-2020.csv"
chain = "df_chain_1920-2020.csv"

df_fixed = pd.read_csv(os.path.join(DATA_ALT, "04_create_metrics", fixed), sep="|").round(4)
df_chain = pd.read_csv(os.path.join(DATA_ALT, "04_create_metrics", chain), sep="|").round(4)

df_fixed.rename(columns={'index':'word'}, inplace=True)
df_chain.rename(columns={'index':'word'}, inplace=True)

In [4]:
df_fixed['freq_pct_inc'] = ((df_fixed['count']-df_fixed['count_t-1']) / df_fixed['count_t-1']).round(4)
df_chain['freq_pct_inc'] = ((df_chain['count']-df_chain['count_t-1']) / df_chain['count_t-1']).round(4)

df_fixed['freq_pct_inc_abs'] = abs((df_fixed['count']-df_fixed['count_t-1']) / df_fixed['count_t-1']).round(4)
df_chain['freq_pct_inc_abs'] = abs((df_chain['count']-df_chain['count_t-1']) / df_chain['count_t-1']).round(4)

## Correlation Analysis
### Fixed Based - Pearson Correlation

In [5]:
df_fixed[['cosine_similarity', 'cross_entropy', 'entropy', 'kl_divergence', 'freq_pct_inc','freq_pct_inc_abs']].corr().round(4)

Unnamed: 0,cosine_similarity,cross_entropy,entropy,kl_divergence,freq_pct_inc,freq_pct_inc_abs
cosine_similarity,1.0,-0.0605,-0.4742,0.3289,-0.0439,-0.0451
cross_entropy,-0.0605,1.0,-0.002,0.633,0.0156,0.0197
entropy,-0.4742,-0.002,1.0,-0.7754,-0.1248,-0.1223
kl_divergence,0.3289,0.633,-0.7754,1.0,0.1065,0.1071
freq_pct_inc,-0.0439,0.0156,-0.1248,0.1065,1.0,0.9999
freq_pct_inc_abs,-0.0451,0.0197,-0.1223,0.1071,0.9999,1.0


### Fixed Based - Spearman Correlation

In [6]:
df_fixed[['cosine_similarity', 'cross_entropy', 'entropy', 'kl_divergence', 'freq_pct_inc', 'freq_pct_inc_abs']].corr(method='spearman').round(4)

Unnamed: 0,cosine_similarity,cross_entropy,entropy,kl_divergence,freq_pct_inc,freq_pct_inc_abs
cosine_similarity,1.0,-0.1733,-0.3851,0.3099,-0.0445,-0.135
cross_entropy,-0.1733,1.0,-0.0186,0.4236,-0.2633,-0.0743
entropy,-0.3851,-0.0186,1.0,-0.8309,-0.2814,-0.1981
kl_divergence,0.3099,0.4236,-0.8309,1.0,0.0843,0.1408
freq_pct_inc,-0.0445,-0.2633,-0.2814,0.0843,1.0,0.7022
freq_pct_inc_abs,-0.135,-0.0743,-0.1981,0.1408,0.7022,1.0


### Chain Based - Pearson Correlation

In [7]:
df_chain[['cosine_similarity', 'cross_entropy', 'entropy', 'kl_divergence', 'freq_pct_inc','freq_pct_inc_abs']].corr().round(4)


Unnamed: 0,cosine_similarity,cross_entropy,entropy,kl_divergence,freq_pct_inc,freq_pct_inc_abs
cosine_similarity,1.0,-0.3766,-0.5202,0.3874,-0.0361,-0.048
cross_entropy,-0.3766,1.0,0.7201,-0.079,0.0188,0.0378
entropy,-0.5202,0.7201,1.0,-0.7486,-0.0206,-0.0068
kl_divergence,0.3874,-0.079,-0.7486,1.0,0.0476,0.0459
freq_pct_inc,-0.0361,0.0188,-0.0206,0.0476,1.0,0.9951
freq_pct_inc_abs,-0.048,0.0378,-0.0068,0.0459,0.9951,1.0


### Chain Based - Spearman Correlation

In [8]:
df_chain[['cosine_similarity', 'cross_entropy', 'entropy', 'kl_divergence', 'freq_pct_inc','freq_pct_inc_abs']].corr(method='spearman').round(4)

Unnamed: 0,cosine_similarity,cross_entropy,entropy,kl_divergence,freq_pct_inc,freq_pct_inc_abs
cosine_similarity,1.0,-0.3154,-0.4882,0.4055,-0.0014,-0.2198
cross_entropy,-0.3154,1.0,0.451,0.1785,-0.1781,0.137
entropy,-0.4882,0.451,1.0,-0.6696,-0.122,0.1254
kl_divergence,0.4055,0.1785,-0.6696,1.0,-0.0054,-0.0957
freq_pct_inc,-0.0014,-0.1781,-0.122,-0.0054,1.0,0.3116
freq_pct_inc_abs,-0.2198,0.137,0.1254,-0.0957,0.3116,1.0


In [9]:
top_kl_div = df_chain.sort_values(['year', 'kl_divergence'], ascending=False).groupby('year').head(10)
top_kl_div[['cosine_similarity', 'cross_entropy', 'entropy', 'kl_divergence', 'freq_pct_inc', 'freq_pct_inc_abs']].corr().round(4)

Unnamed: 0,cosine_similarity,cross_entropy,entropy,kl_divergence,freq_pct_inc,freq_pct_inc_abs
cosine_similarity,1.0,-0.5882,-0.45,-0.505,-0.1894,-0.1907
cross_entropy,-0.5882,1.0,0.4566,0.958,-0.0134,-0.0122
entropy,-0.45,0.4566,1.0,0.1823,-0.0869,-0.0855
kl_divergence,-0.505,0.958,0.1823,1.0,0.0132,0.0141
freq_pct_inc,-0.1894,-0.0134,-0.0869,0.0132,1.0,1.0
freq_pct_inc_abs,-0.1907,-0.0122,-0.0855,0.0141,1.0,1.0


## z Score
Computes the change in value of distance metrics (KL and Cosine) as a standardized distribution

In [10]:
def z_jump_score(x):
    diffs = x.diff().fillna(0)
    z_scores = (diffs - diffs.mean()) / diffs.std()
    return z_scores.round(4)

### Chain DataFrame

In [11]:
df_chain['z_score_kl'] = df_chain.groupby('word')['kl_divergence'].transform(z_jump_score)
df_chain['z_score_cosine'] = df_chain.groupby('word')['cosine_similarity'].transform(z_jump_score)

In [12]:
df_chain['z_score_kl_max'] = df_chain.groupby('word')['z_score_kl'].transform('max')
df_chain['z_score_kl_min'] = df_chain.groupby('word')['z_score_kl'].transform('min')
df_chain['z_score_cosine_max'] = df_chain.groupby('word')['z_score_cosine'].transform('max')
df_chain['z_score_cosine_min'] = df_chain.groupby('word')['z_score_cosine'].transform('min')

### Fixed DataFrame

In [13]:
df_fixed['z_score_kl'] = df_fixed.groupby('word')['kl_divergence'].transform(z_jump_score)
df_fixed['z_score_cosine'] = df_fixed.groupby('word')['cosine_similarity'].transform(z_jump_score)

In [25]:
df_fixed['z_score_kl_max'] = df_fixed.groupby('word')['z_score_kl'].transform('max')
df_fixed['z_score_kl_min'] = df_fixed.groupby('word')['z_score_kl'].transform('min')
df_fixed['z_score_cosine_max'] = df_fixed.groupby('word')['z_score_cosine'].transform('max')
df_fixed['z_score_cosine_min'] = df_fixed.groupby('word')['z_score_cosine'].transform('min')

### Cherry-pick 🍒

In [205]:
df_chain[df_chain.word=="trump"][['word', 'count', 'count_t-1', 'year', 'ref_year', 'euclidean_distance',
                                   'cosine_similarity','kl_divergence', 'freq_pct_inc', 'freq_pct_inc_abs', 'z_score_kl',
                                   'z_score_kl_max', 'z_score_kl_min', 'z_score_cosine', 'z_score_cosine_max', 'z_score_cosine_min']].T

Unnamed: 0,17751,76521,197668,240527,308817,366073,447011,513175,587413,664661,743514,795999,913090,958306,1008540,1091136,1170853,1245592,1314169,1386926,1456261,1515341,1576754,1637720,1711198,1774782,1847859,1920314,1996056,2074673,2158413,2234305,2306220,2384538,2469235,2540402,2602349,2680085,2754759,2828564,2902252,2969943,3062284,3134590,3213714,3290032,3363311,3439924,3517679,3604620,3694258,3785261,3875750,3963195,4047919,4133375,4199929,4226464,4295065,4350061,4457200,4564754,4673508,4783728,4893209,4979704,5065994,5147723,5246163,5361672,5470770,5576995,5691604,5799703,5894469,5992608,6105301,6219763,6340623,6445015,6564180,6676321,6786748,6914678,7043508,7117992,7194202,7271718,7339929,7407257,7473996,7536467,7593856,7653726,7709940,7761556,7800737,7848133
word,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump,trump
count,9,8,4,8,4,8,5,9,10,8,4,18,1,4,25,20,19,15,17,12,10,17,25,33,12,24,24,37,34,22,11,12,19,15,7,10,64,38,32,36,32,55,55,64,50,50,41,22,59,49,58,48,48,61,40,18,4,46,11,96,242,301,346,304,266,360,317,826,680,362,244,199,534,274,296,234,477,443,140,180,122,466,495,411,238,204,299,198,277,142,89,78,2280,11175,13595,7325,10125,9583
count_t-1,21.0,9.0,,4.0,8.0,4.0,8.0,5.0,9.0,10.0,8.0,4.0,18.0,1.0,4.0,25.0,20.0,19.0,15.0,17.0,12.0,10.0,17.0,25.0,33.0,12.0,24.0,24.0,37.0,34.0,22.0,11.0,12.0,19.0,15.0,7.0,10.0,64.0,38.0,32.0,36.0,32.0,55.0,55.0,64.0,50.0,50.0,41.0,22.0,59.0,49.0,58.0,48.0,48.0,61.0,40.0,18.0,4.0,46.0,11.0,96.0,242.0,301.0,346.0,304.0,266.0,360.0,317.0,826.0,680.0,362.0,244.0,199.0,534.0,274.0,296.0,234.0,477.0,443.0,140.0,180.0,122.0,466.0,495.0,411.0,238.0,204.0,299.0,198.0,277.0,142.0,89.0,78.0,2280.0,11175.0,13595.0,7325.0,10125.0
year,1921,1922,1925,1926,1927,1928,1929,1930,1931,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941,1942,1943,1944,1945,1946,1947,1948,1949,1950,1951,1952,1953,1954,1955,1956,1957,1958,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
ref_year,1920,1921,1924,1925,1926,1927,1928,1929,1930,1931,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941,1942,1943,1944,1945,1946,1947,1948,1949,1950,1951,1952,1953,1954,1955,1956,1957,1958,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
euclidean_distance,3.6164,2.3118,,2.0189,1.841,1.8627,1.8011,1.7421,2.0072,1.8059,1.5403,2.1888,2.2185,1.1889,2.7928,2.9752,3.2019,2.5981,2.303,2.6441,2.5368,2.638,3.2802,4.0893,4.1364,3.863,3.6565,4.0717,3.9707,3.5549,3.5823,2.5148,3.2004,3.4387,2.1366,2.4847,6.166,5.4706,4.4462,4.1872,4.5083,4.9981,5.768,5.7061,5.0849,5.7939,5.0952,5.1935,5.5364,6.4146,5.8866,4.8303,5.4374,6.573,6.6721,4.0613,2.8734,5.3621,5.5211,7.13,11.1224,11.6607,12.6544,11.3184,11.7084,11.6486,12.5926,16.7694,16.0079,16.1379,11.9692,10.0238,12.0642,12.6839,12.2728,9.9169,15.1298,14.7783,14.1031,9.6934,10.5398,15.4102,15.0055,13.8137,12.3225,9.3696,12.3064,11.5401,11.5495,11.7395,9.406,7.3757,21.7849,12.9724,13.287,7.1517,7.6956,7.7852
cosine_similarity,0.1779,0.13,,0.0719,0.2196,0.1507,0.179,0.2306,0.2245,0.2826,0.2464,0.2241,0.061,0.2136,0.1554,0.4302,0.2056,0.1132,0.34,0.2511,0.2484,0.6688,0.6656,0.5465,0.3111,0.2274,0.5218,0.4589,0.4557,0.4739,0.1918,0.0847,0.3081,0.1673,0.3052,-0.1433,0.1509,0.526,0.5191,0.6515,0.5763,0.6165,0.657,0.6894,0.747,0.7071,0.77,0.6759,0.6817,0.6385,0.6478,0.6948,0.5925,0.5207,0.4991,0.6521,0.3286,0.3691,0.2369,0.4014,0.6689,0.7252,0.7126,0.7684,0.7375,0.7724,0.7356,0.6355,0.7262,0.6801,0.7252,0.7521,0.7765,0.7493,0.713,0.8073,0.5605,0.6106,0.5089,0.6022,0.4799,0.4998,0.6148,0.626,0.6341,0.6754,0.5761,0.6367,0.5755,0.525,0.2001,0.2075,0.4112,0.8349,0.7324,0.9073,0.8997,0.8975
kl_divergence,0.6788,0.5209,,0.5733,0.6615,0.4022,0.3689,0.3562,0.3485,0.4081,0.3621,0.4471,0.4141,0.321,0.5582,0.3807,0.5926,0.5464,0.43,0.5309,0.4817,0.5079,0.6807,0.8214,0.5161,0.5371,0.4966,0.6322,0.5686,0.4752,0.5666,0.3684,0.4536,0.5692,0.5093,0.4427,0.8156,0.5915,0.6243,0.6763,0.6936,0.6835,0.9952,0.9119,0.6351,0.8656,0.9637,0.8179,0.5861,0.7778,0.8389,0.7091,0.6137,1.0333,0.9092,0.647,0.6859,1.3157,0.9746,1.3394,1.9023,2.2685,2.3779,1.9605,2.0605,2.1805,2.1715,4.6122,4.0834,3.8142,1.8733,1.6749,2.3656,9.7729,2.3381,1.4106,2.6568,2.612,2.7194,1.2978,1.5237,3.0636,2.9456,1.9179,2.4973,1.4925,2.5408,2.5177,2.2634,2.7081,1.6008,1.2337,8.8163,4.1646,3.5543,1.5125,1.8054,1.8799
freq_pct_inc,-0.5714,-0.1111,,1.0,-0.5,1.0,-0.375,0.8,0.1111,-0.2,-0.5,3.5,-0.9444,3.0,5.25,-0.2,-0.05,-0.2105,0.1333,-0.2941,-0.1667,0.7,0.4706,0.32,-0.6364,1.0,0.0,0.5417,-0.0811,-0.3529,-0.5,0.0909,0.5833,-0.2105,-0.5333,0.4286,5.4,-0.4062,-0.1579,0.125,-0.1111,0.7188,0.0,0.1636,-0.2188,0.0,-0.18,-0.4634,1.6818,-0.1695,0.1837,-0.1724,0.0,0.2708,-0.3443,-0.55,-0.7778,10.5,-0.7609,7.7273,1.5208,0.2438,0.1495,-0.1214,-0.125,0.3534,-0.1194,1.6057,-0.1768,-0.4676,-0.326,-0.1844,1.6834,-0.4869,0.0803,-0.2095,1.0385,-0.0713,-0.684,0.2857,-0.3222,2.8197,0.0622,-0.1697,-0.4209,-0.1429,0.4657,-0.3378,0.399,-0.4874,-0.3732,-0.1236,28.2308,3.9013,0.2166,-0.4612,0.3823,-0.0535
freq_pct_inc_abs,0.5714,0.1111,,1.0,0.5,1.0,0.375,0.8,0.1111,0.2,0.5,3.5,0.9444,3.0,5.25,0.2,0.05,0.2105,0.1333,0.2941,0.1667,0.7,0.4706,0.32,0.6364,1.0,0.0,0.5417,0.0811,0.3529,0.5,0.0909,0.5833,0.2105,0.5333,0.4286,5.4,0.4062,0.1579,0.125,0.1111,0.7188,0.0,0.1636,0.2188,0.0,0.18,0.4634,1.6818,0.1695,0.1837,0.1724,0.0,0.2708,0.3443,0.55,0.7778,10.5,0.7609,7.7273,1.5208,0.2438,0.1495,0.1214,0.125,0.3534,0.1194,1.6057,0.1768,0.4676,0.326,0.1844,1.6834,0.4869,0.0803,0.2095,1.0385,0.0713,0.684,0.2857,0.3222,2.8197,0.0622,0.1697,0.4209,0.1429,0.4657,0.3378,0.399,0.4874,0.3732,0.1236,28.2308,3.9013,0.2166,0.4612,0.3823,0.0535


In [16]:
df_fixed[df_fixed.word=="clinton"][['word', 'count', 'count_t-1', 'year', 'ref_year', 'euclidean_distance',
                                   'cosine_similarity','kl_divergence', 'freq_pct_inc', 'freq_pct_inc_abs', 'z_score_kl',
                                   'z_score_kl_max', 'z_score_cosine', 'z_score_cosine_max']]

Unnamed: 0,word,count,count_t-1,year,ref_year,euclidean_distance,cosine_similarity,kl_divergence,freq_pct_inc,freq_pct_inc_abs,z_score_kl,z_score_kl_max,z_score_cosine,z_score_cosine_max
3438,clinton,144,79.0,1921,1920,7.0033,0.7531,1.1806,0.8228,0.8228,-0.0273,,0.0650,
63320,clinton,118,79.0,1922,1920,6.1409,0.7518,1.2155,0.4937,0.4937,0.0021,,0.0520,
117340,clinton,60,79.0,1923,1920,6.6182,0.5253,1.7838,-0.2405,0.2405,0.4509,,-2.1977,
163796,clinton,8,79.0,1924,1920,7.3022,0.4027,2.5376,-0.8987,0.8987,0.6070,,-1.1598,
178306,clinton,110,79.0,1925,1920,6.9424,0.6297,1.5076,0.3924,0.3924,-0.8939,,2.3327,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7653739,clinton,4622,79.0,2016,1920,24.4684,0.2579,11.5540,57.5063,57.5063,0.9403,,0.3587,
7710713,clinton,396,79.0,2017,1920,17.9819,0.1379,6.5865,4.0127,4.0127,-4.2069,,-1.1338,
7762406,clinton,206,79.0,2018,1920,15.2845,0.1059,5.8540,1.6076,1.6076,-0.6436,,-0.2547,
7801987,clinton,221,79.0,2019,1920,16.1019,0.1201,5.4268,1.7975,1.7975,-0.3867,,0.2068,


## Correlation Analysis
### KL Divergence - Chain Based

In [17]:
head = 1000
top_kl_div_CB = df_chain.sort_values(['year', 'z_score_kl'], ascending=False).groupby('year').head(head)
top_kl_div_CB[['freq_pct_inc', 'freq_pct_inc_abs', 'z_score_kl']].corr().round(4)

Unnamed: 0,freq_pct_inc,freq_pct_inc_abs,z_score_kl
freq_pct_inc,1.0,0.9995,0.0752
freq_pct_inc_abs,0.9995,1.0,0.0793
z_score_kl,0.0752,0.0793,1.0


In [18]:
top_kl_div_CB

Unnamed: 0,word,count,count_t-1,year,ref_year,euclidean_distance,cosine_similarity,cross_entropy,kl_divergence,entropy,entropy_t-1,mean,median,std,iqr,range,word_present_both,freq_pct_inc,freq_pct_inc_abs,z_score_kl,z_score_cosine,z_score_kl_max,z_score_kl_min,z_score_cosine_max,z_score_cosine_min
7848541,met,513,1533.0,2020,2019,12.9543,0.7949,81.5789,19.6476,61.9312,60.2009,0.4881,0.4819,0.1900,0.2490,0.9465,True,-0.6654,0.6654,9.2101,-0.9651,9.2101,-1.0553,3.2705,-3.4957
7849662,dating,155,390.0,2020,2019,12.1119,0.7520,74.9130,8.6469,66.2661,62.9954,0.4967,0.4841,0.1315,0.1683,0.8223,True,-0.6026,0.6026,9.1573,-0.8978,9.1573,-1.5690,3.6380,-3.4153
7848544,reopening,511,22.0,2020,2019,16.0609,0.1190,69.6697,5.3815,64.2882,68.6628,0.5072,0.5018,0.1628,0.2044,0.8497,True,22.2273,22.2273,8.8399,1.0351,8.8399,-1.4163,2.4870,-3.1339
7860337,measles,8,169.0,2020,2019,13.1266,0.5051,80.6315,11.8569,68.7746,65.7877,0.4993,0.5007,0.0718,0.0673,0.6153,True,-0.9527,0.9527,8.8029,-0.2248,8.8029,-1.5417,3.2130,-2.2037
7848149,2020,3058,1698.0,2020,2019,18.5633,0.4604,75.9783,12.3783,63.6000,63.6341,0.5067,0.5315,0.1685,0.2292,1.0000,True,0.8009,0.8009,8.3321,-0.8207,8.3321,-1.0684,2.6508,-2.4302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41038,17senator,2,4.0,1921,1920,0.7353,0.3101,69.4160,0.4915,68.9245,68.7657,0.4888,0.4978,0.0654,0.0611,0.6016,True,-0.5000,0.5000,0.4082,-0.4082,0.4082,-2.0412,2.0412,-0.4082
41352,workon,2,,1921,1920,,,,,68.9123,,0.4882,0.4941,0.0657,0.0612,0.5994,False,,,0.4082,0.4082,0.4082,-2.0412,0.4082,-2.0412
41612,plonsky,2,,1921,1920,,,,,68.9026,,0.4884,0.4967,0.0661,0.0628,0.6042,False,,,0.4082,-0.4082,0.4082,-2.0412,2.0412,-0.4082
41933,5edward,2,2.0,1921,1920,0.6886,0.4467,69.3863,0.4685,68.9178,68.7621,0.4890,0.4976,0.0657,0.0602,0.6012,True,0.0000,0.0000,0.4082,0.4082,0.4082,-2.0412,0.4082,-2.0412


### Cosine Similarity - Chain Based

In [19]:
top_cosine_CB = df_chain.sort_values(['year', 'z_score_cosine'], ascending=False).groupby('year').head(head)
top_cosine_CB[['freq_pct_inc', 'freq_pct_inc_abs', 'z_score_cosine']].corr().round(4)

Unnamed: 0,freq_pct_inc,freq_pct_inc_abs,z_score_cosine
freq_pct_inc,1.0,0.9661,-0.0673
freq_pct_inc_abs,0.9661,1.0,-0.0856
z_score_cosine,-0.0673,-0.0856,1.0


In [20]:
top_cosine_CB

Unnamed: 0,word,count,count_t-1,year,ref_year,euclidean_distance,cosine_similarity,cross_entropy,kl_divergence,entropy,entropy_t-1,mean,median,std,iqr,range,word_present_both,freq_pct_inc,freq_pct_inc_abs,z_score_kl,z_score_cosine,z_score_kl_max,z_score_kl_min,z_score_cosine_max,z_score_cosine_min
7860769,airway,8,19.0,2020,2019,3.4535,0.6788,69.4182,0.7529,68.6653,68.4582,0.4976,0.5021,0.0754,0.0845,0.6405,True,-0.5789,0.5789,-0.4090,6.3989,4.6256,-3.0934,6.3989,-4.4012
7858379,squash,11,17.0,2020,2019,2.2569,0.7167,69.2367,0.5902,68.6465,68.6375,0.5010,0.5068,0.0761,0.0894,0.6691,True,-0.3529,0.3529,-0.1143,5.6875,3.4581,-6.1048,5.6875,-4.3684
7872605,larchmont,2,2.0,2020,2019,0.5516,0.7962,69.3438,0.5368,68.8071,68.8039,0.5009,0.4991,0.0706,0.0732,0.6366,True,0.0000,0.0000,-0.0923,5.2124,2.8154,-5.3023,5.2124,-2.6628
7855565,tessa,19,17.0,2020,2019,3.7436,0.6555,69.1935,0.8872,68.3063,68.6866,0.5092,0.5129,0.0859,0.1062,0.6772,True,0.1176,0.1176,0.8435,5.2069,4.3545,-3.2536,5.2069,-2.7875
7855479,nova,19,14.0,2020,2019,2.5704,0.6759,69.2950,0.6730,68.6220,68.6950,0.5029,0.5055,0.0769,0.0986,0.6776,True,0.3571,0.3571,0.1192,4.9808,5.0581,-3.2196,4.9808,-2.6785
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53076,ofthree,1,1.0,1921,1920,0.6269,0.1562,69.3796,0.4520,68.9276,68.7560,0.4883,0.4971,0.0652,0.0624,0.5962,True,0.0000,0.0000,-0.3780,0.3780,2.2678,-0.3780,0.3780,-2.2678
53287,apponyl,1,7.0,1921,1920,1.5156,0.2091,69.4027,0.4758,68.9269,68.7320,0.4886,0.4956,0.0653,0.0607,0.6010,True,-0.8571,0.8571,-0.3780,0.3780,2.2678,-0.3780,0.3780,-2.2678
53674,dulland,1,4.0,1921,1920,1.1728,0.2075,69.4070,0.5023,68.9046,68.7403,0.4882,0.4965,0.0660,0.0610,0.6003,True,-0.7500,0.7500,-0.3780,0.3780,2.2678,-0.3780,0.3780,-2.2678
53831,foundrymens,1,,1921,1920,,,,,68.9276,,0.4890,0.4985,0.0653,0.0604,0.6034,False,,,0.3780,0.3780,0.3780,-2.2678,0.3780,-2.2678


### KL Divergence - Fixed Chain

In [21]:
top_kl_div_FB = df_fixed.sort_values(['year', 'z_score_kl'], ascending=False).groupby('year').head(head)
top_kl_div_FB[['freq_pct_inc', 'freq_pct_inc_abs', 'z_score_kl']].corr().round(4)

Unnamed: 0,freq_pct_inc,freq_pct_inc_abs,z_score_kl
freq_pct_inc,1.0,1.0,0.1127
freq_pct_inc_abs,1.0,1.0,0.1124
z_score_kl,0.1127,0.1124,1.0


### Cosine Similarity - Fixed Chain

In [26]:
top_cosine_FB = df_fixed.sort_values(['year', 'z_score_cosine'], ascending=False).groupby('year').head(head)
top_cosine_FB[['freq_pct_inc', 'freq_pct_inc_abs', 'z_score_cosine']].corr().round(4)

Unnamed: 0,freq_pct_inc,freq_pct_inc_abs,z_score_cosine
freq_pct_inc,1.0,0.9997,-0.008
freq_pct_inc_abs,0.9997,1.0,-0.0077
z_score_cosine,-0.008,-0.0077,1.0


In [27]:
top_cosine_FB

Unnamed: 0,word,count,count_t-1,year,ref_year,euclidean_distance,cosine_similarity,cross_entropy,kl_divergence,entropy,entropy_t-1,mean,median,std,iqr,range,word_present_both,freq_pct_inc,freq_pct_inc_abs,z_score_kl,z_score_cosine,z_score_kl_max,z_score_cosine_max,z_score_kl_min,z_score_cosine_min
7854023,output,26,298.0,2020,1920,13.9755,0.5119,72.8401,4.2715,68.5686,65.3852,0.4999,0.5051,0.0787,0.0871,0.6537,True,-0.9128,0.9128,-1.9955,5.8864,2.8109,5.8864,-2.7737,-4.8640
7868021,tray,4,3.0,2020,1920,1.2526,0.4037,69.8444,1.0954,68.7490,68.7951,0.5015,0.5026,0.0726,0.0785,0.6461,True,0.3333,0.3333,0.4612,5.0239,2.6794,5.0239,-4.7326,-3.6025
7857347,cooperative,13,326.0,2020,1920,13.2071,0.3971,72.3049,3.6137,68.6912,66.2497,0.4966,0.4938,0.0746,0.0666,0.6458,True,-0.9601,0.9601,0.0406,4.5311,3.0055,4.5311,-3.0970,-2.7183
7849190,date,226,613.0,2020,1920,13.6338,0.4529,71.3343,4.3570,66.9772,65.5650,0.4986,0.5050,0.1186,0.1543,0.7240,True,-0.6313,0.6313,-4.9024,4.4526,3.2778,4.4526,-4.9024,-2.5451
7865327,handicap,5,1078.0,2020,1920,21.2424,0.2535,78.0861,9.3351,68.7510,62.4589,0.4982,0.4979,0.0725,0.0791,0.6663,True,-0.9954,0.9954,0.1022,4.3901,3.2659,4.3901,-3.9916,-4.1681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46510,chamberof,2,3.0,1921,1920,1.2606,0.3965,69.3816,0.4642,68.9174,68.7254,0.4890,0.4981,0.0657,0.0547,0.5974,True,-0.3333,0.3333,-0.6925,0.2616,1.1190,1.1670,-0.9805,-1.2384
42446,gridders,2,1.0,1921,1920,1.0163,0.1860,69.3731,0.4619,68.9111,68.7695,0.4886,0.4940,0.0659,0.0581,0.6066,True,1.0000,1.0000,-0.5603,0.2615,1.2711,1.7805,-1.9756,-1.6920
19137,brusiloff,8,12.0,1921,1920,2.4911,0.4497,69.4000,0.5112,68.8888,68.6617,0.4891,0.4980,0.0668,0.0677,0.6050,True,-0.3333,0.3333,-0.9743,0.2614,1.1913,1.2605,-0.9743,-1.5263
27251,30general,4,12.0,1921,1920,1.9975,0.2898,69.3921,0.4773,68.9148,68.7196,0.4889,0.4978,0.0658,0.0575,0.6103,True,-0.6667,0.6667,-0.5442,0.2609,0.9714,0.7566,-1.1271,-1.4684


## Save to Excel

In [28]:
dfs_to_save = [top_cosine_FB, top_kl_div_FB, top_cosine_CB, top_kl_div_CB]
dfs_to_save = [df.round(4) for df in dfs_to_save]
sheet_names = [
    "Cosine_Fixed_Base",
    "KLDiv_Fixed_Base",
    "Cosine_Chain_Base",
    "KLDiv_Chain_Base"
]

with pd.ExcelWriter("embedding_shift_metrics_1k.xlsx", engine="xlsxwriter") as writer:
    for df, sheet in zip(dfs_to_save, sheet_names):
        df.to_excel(writer, sheet_name=sheet, index=False)

In [None]:
df_fixed.to_csv((os.path.join(NOTEBOOKS_ANALYSIS, "fixed_base_with_z_score.csv")))
df_chain.to_csv((os.path.join(NOTEBOOKS_ANALYSIS, "chain_base_with_z_score.csv")))