In [1]:
# imports
import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os
import json
import seaborn as sns
from rich import print
import time

load_dotenv()

RAW_DATA_DIR = Path(os.getenv('RAW_DATA_DIR'))
PROCESSED_DATA_DIR = Path(os.getenv('PROCESSED_DATA_DIR'))

In [10]:
total_size_mb = 0
total_rows = 0

for views_file in RAW_DATA_DIR.glob('*views*.csv'):
    rows = pl.read_csv(views_file).height
    size_mb = os.stat(views_file).st_size / 1024 ** 2
    total_size_mb += size_mb
    total_rows += rows
    print(views_file.name, f'{size_mb:,.0f}', f'{rows:,}')
print(total_rows, total_size_mb)


In [12]:
views_df = pl.read_parquet(PROCESSED_DATA_DIR / 'pageviews_all.parquet')
views_df.height * views_df.width

207549090

In [14]:
from scipy import stats
import numpy as np
def regression_per_row(X: np.ndarray):
    # normalize X
    X = (X - X.mean(axis=1, keepdims=True)) / X.std(axis=1, keepdims=True)
    
    # X range (time)
    m = X.shape[1] - 1
    X_range = np.arange(X.shape[1])
    X_range_mean = m / 2
    
    # slopes, intercepts, residuals
    mean = X.mean(axis=1, keepdims=True)
    slopes = ((X - mean).dot(X_range - X_range_mean) / (m * np.var(X_range))).reshape(-1, 1)
    intercepts = mean - slopes * X_range_mean
    residuals = X - (slopes * X_range + intercepts)
    
    # rss, tss, r2
    rss = np.sum(np.square(residuals), axis=1, keepdims=True)
    tss = np.sum(np.square(X - mean), axis=1, keepdims=True)
    r2s = 1 - rss / tss
    
    # pvalues
    se = X.std(axis=1, keepdims=True) / np.sqrt(np.sum(np.square(X_range - X_range_mean)))
    statistics = slopes / se
    pvs = 2 * (1 - stats.t.cdf(np.abs(statistics), m))
    
    return slopes, intercepts, r2s, pvs
    
def is_rising(slopes: np.ndarray, pvs: np.ndarray):
    return np.logical_and(slopes > 0, pvs > 0.05)

def is_fading(slopes: np.ndarray, pvs: np.ndarray):
    return np.logical_and(slopes < 0, pvs > 0.05)

In [16]:
regression_per_row(np.random.randint(1, 10, (1000, 100)))

(array([[ 9.28599153e-04],
        [ 5.97782670e-04],
        [ 3.77025283e-03],
        [-1.12008451e-03],
        [-2.17027418e-04],
        [-7.17646755e-04],
        [ 5.50699921e-03],
        [ 1.28177846e-03],
        [ 1.25064551e-03],
        [-1.79903760e-03],
        [-3.30606667e-03],
        [-1.61399026e-03],
        [ 3.12132147e-03],
        [ 2.42814831e-03],
        [-3.52399538e-03],
        [-3.41078994e-03],
        [-3.25195528e-03],
        [-4.00698134e-03],
        [ 1.66661767e-03],
        [ 1.19245956e-03],
        [-6.84098611e-03],
        [-2.07338669e-03],
        [-1.67854581e-03],
        [-2.72191734e-04],
        [-1.84625078e-03],
        [ 3.26017843e-04],
        [ 2.04692362e-03],
        [-2.95810010e-03],
        [-5.79621697e-03],
        [-1.58673624e-03],
        [ 3.29111663e-03],
        [ 2.58259375e-03],
        [ 2.89965688e-03],
        [ 2.62721372e-03],
        [-6.14481471e-03],
        [-5.30241379e-03],
        [-1.71869367e-03],
 

----

seasonal

In [18]:
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
from scipy import stats

def detect_seasonality(data):
    max_period = data.shape[1] // 2  # Maximum possible period
    
    # Create an array to store the p-values
    p_values = np.empty((data.shape[0], max_period))
    
    # Iterate over possible periods and compute p-values for each row
    for period in range(1, max_period + 1):
        result = seasonal_decompose(data, period=period, extrapolate_trend='freq')
        _, p_values[:, period - 1] = stats.normaltest(result.seasonal)
    
    # Check if any row has a significant p-value
    significant_rows = np.any(p_values < 0.05, axis=1)
    
    return significant_rows

# Example usage
data = np.random.rand(10, 100)  # Replace this with your 2D time series data

seasonal_rows = detect_seasonality(data)
print("Indices of rows with seasonality:", np.where(seasonal_rows)[0])


  k, _ = kurtosistest(a, axis)


ValueError: could not broadcast input array from shape (100,) into shape (10,)

In [19]:
from scipy.stats import kruskal

kruskal([1, 2, 1, 2, 1])

ValueError: Need at least two groups in stats.kruskal()

-----

Notable People

In [10]:
ext = pl.read_csv(RAW_DATA_DIR / 'cross-verified-database.csv',
                  ignore_errors=True)

In [11]:
ext

wikidata_code,birth,death,updated_death_date,approx_birth,approx_death,birth_min,birth_max,death_min,death_max,gender,level1_main_occ,name,un_subregion,birth_estimation,death_estimation,bigperiod_birth_graph_b,bigperiod_death_graph_b,curid,level2_main_occ,freq_main_occ,freq_second_occ,level2_second_occ,level3_main_occ,bigperiod_birth,bigperiod_death,wiki_readers_2015_2018,non_missing_score,total_count_words_b,number_wiki_editions,total_noccur_links_b,sum_visib_ln_5criteria,ranking_visib_5criteria,all_geography_groups,string_citizenship_raw_d,citizenship_1_b,citizenship_2_b,list_areas_of_rattach,area1_of_rattachment,area2_of_rattachment,list_wikipedia_editions,un_region,group_wikipedia_editions,bplo1,dplo1,bpla1,dpla1,pantheon_1,level3_all_occ
str,i64,i64,i64,str,str,i64,i64,i64,i64,str,str,str,str,f64,f64,str,str,i64,str,f64,f64,str,str,str,str,i64,i64,i64,i64,i64,f64,f64,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,i64,str
"""Q1000002""",1932,1990,,,,1932,1932,1990,1990,"""Male""","""Culture""","""Claus_Hammel""","""Western Europe…",1932.0,1990.0,"""5.Contemporary…","""5.Contemporary…",2949539,"""Culture-core""",0.8,0.2,"""Culture-periph…","""playwright""","""5.Contemporary…","""5.Contemporary…",1669,3,1777,1,11,18.083672,1.058542e6,"""Germany""","""'Germany'""","""Germany""",,"""D:_'Germany'_m…","""Germany""","""Missing""","""dewiki""","""Europe""","""grB""",11.833333,12.42,53.416668,54.38139,0,"""D:_playwright_…"
"""Q1000005""",1860,1927,,,,1860,1860,1927,1927,"""Male""","""Culture""","""Karel_Matěj_Ča…","""Western Europe…",1860.0,1927.0,"""4.Mid Modern P…","""5.Contemporary…",4217319,"""Culture-core""",0.538462,0.307692,"""Culture-periph…","""writer""","""4.Mid Modern P…","""5.Contemporary…",25008,3,6491,9,15,23.98061,131428.0,"""Czech_Republic…","""'Czech_Republi…","""Czech_Republic…",,"""D:_'Czech_Repu…","""Old_(before_ye…","""Missing""","""dewiki|cswiki|…","""Europe""","""grA""",12.929798,14.421389,49.440605,50.087502,0,"""D:_writer_jour…"
"""Q1000006""",1971,,,,,1971,1971,,,"""Male""","""Culture""","""Florian_Eichin…","""Western Europe…",1971.0,2053.8447,"""5.Contemporary…","""5.Contemporary…",5050967,"""Culture-core""",1.0,,"""Missing""","""film""","""5.Contemporary…","""Missing""",27285,3,1573,1,10,20.666656,775768.0,"""Germany""","""'Germany'""","""Germany""",,"""D:_'Germany'_m…","""Germany""","""Missing""","""dewiki""","""Europe""","""grB""",9.191944,,48.897499,,0,"""D:_film_screen…"
"""Q1000015""",1983,,,,,1983,1983,,,"""Male""","""Culture""","""Florian_Jahr""","""Western Europe…",1983.0,2067.1899,"""5.Contemporary…","""5.Contemporary…",2588583,"""Culture-core""",1.0,,"""Missing""","""actor""","""5.Contemporary…","""Missing""",37331,3,1931,1,10,21.18504,691735.0,"""Germany""","""'Germany'""","""Germany""",,"""D:_'Germany'_m…","""Germany""","""Missing""","""dewiki""","""Europe""","""grB""",13.383333,,52.516666,,0,"""D:_actor_P:_sc…"
"""Q1000023""",1912,1977,,,,1912,1912,1977,1977,"""Female""","""Leadership""","""Wiltraut_Rupp-…","""Western Europe…",1912.0,1977.0,"""5.Contemporary…","""5.Contemporary…",922120,"""Administration…",0.833333,0.166667,"""Politics""","""judge""","""5.Contemporary…","""5.Contemporary…",2955,3,1578,1,6,17.99621,1.103282e6,"""Germany""","""'Germany'""","""Germany""",,"""D:_'Germany'_m…","""Old_(before_ye…","""Missing""","""dewiki""","""Europe""","""grB""",13.35,8.4,52.4333,49.016666,0,"""D:_judge_juris…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Q999994""",1988,,,,,1988,1988,,,"""Male""","""Sports/Games""","""Mitja_Mežnar""","""Southern Europ…",1988.0,2065.3259,"""5.Contemporary…","""5.Contemporary…",20581328,"""Sports/Games""",0.9,,"""Missing""","""ski""","""5.Contemporary…","""Missing""",8896,3,5668,8,4,21.543238,350041.5,"""Slovenia""","""'Slovenia'""","""Slovenia""",,"""D:_'Slovenia'_…","""Slovenia""","""Missing""","""fiwiki|dewiki|…","""Europe""","""grA""",14.35561,,46.238869,,0,"""D:_ski_P:_ ski…"
"""Q999995""",1987,,,"""circa""",,1987,1987,,,"""Male""","""Sports/Games""","""Martin_Cikl""","""Western Europe…",1987.0,2065.1482,"""5.Contemporary…","""5.Contemporary…",16791925,"""Sports/Games""",0.923077,,"""Missing""","""ski""","""5.Contemporary…","""Missing""",8807,3,6096,9,4,21.71133,339942.5,"""Czech_Republic…","""'Czech_Republi…","""Czech_Republic…",,"""D:_'Czech_Repu…","""Czech_Republic…","""Missing""","""fiwiki|dewiki|…","""Europe""","""grA""",14.618354,,50.911613,,0,"""D:_ski_P:_ ski…"
"""Q999997""",1984,,,,,1984,1984,,,"""Male""","""Sports/Games""","""Vincent_Descom…","""Western Europe…",1984.0,2061.8308,"""5.Contemporary…","""5.Contemporary…",21384087,"""Sports/Games""",0.909091,,"""Missing""","""ski""","""5.Contemporary…","""Missing""",51786,3,14372,7,6,24.453411,154890.0,"""France""","""'France'""","""France""",,"""D:_'France'_ma…","""France""","""Missing""","""fiwiki|dewiki|…","""Europe""","""grA""",6.868889,,45.922222,,0,"""D:_ski_skier_P…"
"""Q999998""",1952,,,,,1952,1952,,,"""Male""","""Culture""","""José_Massaroli…","""South America""",1952.0,2032.8303,"""5.Contemporary…","""5.Contemporary…",11512070,"""Culture-core""",0.875,,"""Missing""","""artist""","""5.Contemporary…","""Missing""",3772,3,5570,3,6,20.193954,453953.0,"""Argentina""","""'Argentina'""","""Argentina""",,"""D:_'Argentina'…","""Argentina""","""Missing""","""fiwiki|enwiki|…","""America""","""grA""",-60.0,,-33.483334,,0,"""D:_artist_P:_ …"


- `name`: full name of the individual; 
- `group_wikipedia_editions`: partition category of the individual (from the A (*English*) edition of Wikipedia to F (\texttt{Wikidata} only) as described in Table 1.); 
- birth: birth date of the individual (either reported or estimated); 
- death: death date of the individual (either reported or estimated); 
- level1_main_occ & level2_main_occ & level2_second_occ & level3_all_occ & level3_main_occ & freq_main_ occ & freq_second_occ: set of seven variables for the main domain of influence of each individual in three

- `wikidata_code` - identifier
- `birth` - birth year of the individual
- `death` - death year of the individual
- `updated_death_date`
- `approx_birth`
- `approx_death`
- `birth_min`
- `birth_max`
- `death_min`
- `death_max`
- `gender`
- `level1_main_occ`
- `name`
- `un_subregion`
- `birth_estimation`
- `death_estimation`
- `bigperiod_birth_graph_b`
- `bigperiod_death_graph_b`
- `curid`
- `level2_main_occ`
- `freq_main_occ`
- `freq_second_occ`
- `level2_second_occ`
- `level3_main_occ`
- `bigperiod_birth`
- `bigperiod_death`
- `wiki_readers_2015_2018`
- `non_missing_score`
- `total_count_words_b`
- `number_wiki_editions`
- `total_noccur_links_b`
- `sum_visib_ln_5criteria`
- `ranking_visib_5criteria`
- `all_geography_groups`
- `string_citizenship_raw_d`
- `citizenship_1_b`
- `citizenship_2_b`
- `list_areas_of_rattach`
- `area1_of_rattachment`
- `area2_of_rattachment`
- `list_wikipedia_editions`
- `un_region`
- `group_wikipedia_editions`
- `bplo1`
- `dplo1`
- `bpla1`
- `dpla1`
- `pantheon_1`
- `level3_all_occ`

In [18]:
for col in ext:
    display(col.value_counts(sort=True))

wikidata_code,count
str,u32
"""Q1000002""",1
"""Q1000005""",1
"""Q1000006""",1
"""Q1000015""",1
"""Q1000023""",1
…,…
"""Q999994""",1
"""Q999995""",1
"""Q999997""",1
"""Q999998""",1


birth,count
i64,u32
,195919
1950,20227
1988,20215
1985,20134
1986,19998
…,…
-599,1
-274,1
-1440,1
-1648,1


death,count
i64,u32
,1244507
2015,12406
2013,12352
2014,12308
2016,12039
…,…
-452,1
-540,1
-1415,1
-593,1


updated_death_date,count
i64,u32
,2275077
2021.0,10321
2022.0,4194
2020.0,2225


approx_birth,count
str,u32
,2201780
"""century""",52613
"""circa""",37390
"""millenium""",34


approx_death,count
str,u32
,2259756
"""century""",16353
"""circa""",15666
"""millenium""",42


birth_min,count
i64,u32
,142965
1901,35437
1950,20230
1988,20215
1985,20134
…,…
-599,1
-274,1
-1440,1
-1648,1


birth_max,count
i64,u32
,142965
2000,30250
1950,20228
1988,20215
1985,20134
…,…
-599,1
-274,1
-1440,1
-1648,1


death_min,count
i64,u32
,1228132
2015,12406
2013,12352
2014,12308
2016,12039
…,…
-452,1
-540,1
-1415,1
-593,1


death_max,count
i64,u32
,1228132
2000,13205
2015,12406
2013,12352
2014,12308
…,…
-452,1
-540,1
-1415,1
-593,1


gender,count
str,u32
"""Male""",1901904
"""Female""",387906
,1398
"""Other""",609


level1_main_occ,count
str,u32
"""Culture""",702330
"""Sports/Games""",633450
"""Leadership""",619146
"""Discovery/Scie…",273229
"""Other""",48245
"""Missing""",15417


name,count
str,u32
,333
"""Harald_Hansen""",4
"""Rudolf_Abel""",3
"""Luís_de_Almeid…",3
"""Marco_Bianchi""",3
…,…
"""Mitja_Mežnar""",1
"""Martin_Cikl""",1
"""Vincent_Descom…",1
"""José_Massaroli…",1


un_subregion,count
str,u32
"""Western Europe…",779670
"""Northern Ameri…",474983
"""Southern Europ…",223089
"""Northern Europ…",138728
"""South America""",108613
…,…
"""Southern Afric…",13170
"""East Africa""",12841
"""Central Africa…",5634
"""Central Asia""",4017


birth_estimation,count
f64,u32
,112528
1951.0,43628
1950.0,20248
1988.0,20215
1985.0,20136
…,…
1183.2982,1
1287.3851,1
915.74115,1
1009.1835,1


death_estimation,count
f64,u32
,112528
2015.0,12406
2013.0,12352
2014.0,12308
2016.0,12039
…,…
1999.1826,1
2062.5371,1
1972.4149,1
-960.46045,1


bigperiod_birth_graph_b,count
str,u32
"""5.Contemporary…",1518405
"""4.Mid Modern P…",487305
,112528
"""3.Early Modern…",103722
"""2.Post-Classic…",55297
"""1.Ancient Hist…",14560


bigperiod_death_graph_b,count
str,u32
"""5.Contemporary…",1862123
"""4.Mid Modern P…",182459
,112528
"""3.Early Modern…",76031
"""2.Post-Classic…",45037
"""1.Ancient Hist…",13639


curid,count
i64,u32
2487564,4
5907876,4
2468499,4
1285971,4
4079979,4
…,…
20581328,1
16791925,1
21384087,1
11512070,1


level2_main_occ,count
str,u32
"""Sports/Games""",634945
"""Culture-core""",604250
"""Politics""",314558
"""Academia""",249298
"""Culture-periph…",97702
…,…
"""Explorer/Inven…",22814
"""Worker/Busines…",22796
"""Missing""",15417
"""Other""",14579


freq_main_occ,count
f64,u32
1.0,1223469
0.8,157649
0.6,127020
0.666667,105425
0.75,89919
…,…
0.9074074,1
0.58,1
0.975,1
0.55814,1


freq_second_occ,count
f64,u32
,1443910
0.2,230220
0.333333,137112
0.25,107689
0.4,85181
…,…
0.22449,1
0.2888889,1
0.34,1
0.372093,1


level2_second_occ,count
str,u32
"""Missing""",1443900
"""Culture-periph…",112624
"""Academia""",100439
"""Politics""",93102
"""Culture-core""",84396
…,…
"""Other""",39355
"""Religious""",33965
"""Nobility""",32096
"""Explorer/Inven…",30430


level3_main_occ,count
str,u32
"""politician""",270513
"""football""",250667
"""actor""",121790
"""writer""",74716
"""painter""",60189
…,…
"""parlamento""",1
"""capitão_do_por…",1
"""dermatologista…",1
"""carpinteir""",1


bigperiod_birth,count
str,u32
"""5.Contemporary…",1486919
"""4.Mid Modern P…",480273
"""Missing""",195919
"""3.Early Modern…",92494
"""2.Post-Classic…",31220
"""1.Ancient Hist…",4992


bigperiod_death,count
str,u32
"""Missing""",1244507
"""5.Contemporary…",747803
"""4.Mid Modern P…",179485
"""3.Early Modern…",72769
"""2.Post-Classic…",39756
"""1.Ancient Hist…",7497


wiki_readers_2015_2018,count
i64,u32
0,963
647,667
504,666
541,658
549,655
…,…
591064,1
278101,1
631988,1
147819,1


non_missing_score,count
i64,u32
3,2135295
2,153740
1,2781
0,1


total_count_words_b,count
i64,u32
1428,1375
1451,1361
1446,1357
1467,1342
1328,1341
…,…
46015,1
31585,1
227295,1
51178,1


number_wiki_editions,count
i64,u32
1,1210387
2,414158
3,199763
4,114299
5,74060
…,…
161,1
210,1
202,1
154,1


total_noccur_links_b,count
i64,u32
0,648242
1,381823
2,247897
3,185523
4,143226
…,…
116,1
124,1
101,1
108,1


sum_visib_ln_5criteria,count
f64,u32
15.279765,17
14.941894,16
14.388793,16
14.557631,16
14.783787,15
…,…
21.852451,1
21.543238,1
24.453411,1
20.193954,1


ranking_visib_5criteria,count
f64,u32
568043.5,8
1756537.5,6
2053427.5,6
401144.5,6
202331.5,6
…,…
402982.0,1
359457.0,1
154890.0,1
453953.0,1


all_geography_groups,count
str,u32
,446840
"""US""",338596
"""Germany""",199764
"""France""",138012
"""United_Kingdom…",109219
…,…
"""Poland,Vatican…",1
"""Canada,Germany…",1
"""Peru,Austria""",1
"""China,Old_regi…",1


string_citizenship_raw_d,count
str,u32
,446586
"""'US'""",338566
"""'Germany'""",199725
"""'France'""",137643
"""'United_Kingdo…",82728
…,…
"""'Peru'_'Austri…",1
"""'South_Africa'…",1
"""'China'_'Song_…",1
"""'Germany'_'Kin…",1


citizenship_1_b,count
str,u32
"""US""",405179
"""Germany""",261320
"""United_Kingdom…",214110
"""France""",150209
"""Italy""",108896
…,…
"""Dominican_Repu…",4
"""Macau""",4
"""Nagorno-Karaba…",3
"""Vatican_City""",2


citizenship_2_b,count
str,u32
,2199148
"""US""",11388
"""United_Kingdom…",9747
"""Russia""",7460
"""Germany""",5672
…,…
"""SÃ£o_TomÃ©_and…",1
"""Scotland""",1
"""Wales""",1
"""Overseas_Terri…",1


list_areas_of_rattach,count
str,u32
"""D:_'US'_matchB…",256489
"""D:_'Germany'_m…",185101
"""D:_'France'_ma…",129031
"""D:_'Italy'_mat…",69930
"""D:_'United_Kin…",65461
…,…
"""D:_'Germany'_'…",1
"""D:_'Principali…",1
"""D:_'Norway'_mi…",1
"""D:_'Canada'_mi…",1


area1_of_rattachment,count
str,u32
"""US""",402542
"""United_Kingdom…",213969
"""France""",148632
"""Old_(before_ye…",135395
"""Germany""",125925
…,…
"""Trinidad_and_t…",1
"""Overseas_Terri…",1
"""Old_(before_ye…",1
"""Old_(before_ye…",1


area2_of_rattachment,count
str,u32
"""Missing""",2199148
"""US""",11177
"""United_Kingdom…",9747
"""France""",4877
"""Old_(before_ye…",3546
…,…
"""Old_(before_ye…",1
"""Old_(before_ye…",1
"""Palau""",1
"""Old_(before_ye…",1


list_wikipedia_editions,count
str,u32
"""enwiki""",643321
"""dewiki""",258600
"""frwiki""",105256
"""itwiki""",62854
"""eswiki""",60941
…,…
"""fiwiki|dewiki|…",1
"""fiwiki|dewiki|…",1
"""fiwiki|dewiki|…",1
"""fiwiki|dewiki|…",1


un_region,count
str,u32
"""Europe""",1248310
"""America""",626491
"""Asia""",215085
"""Oceania""",81151
"""Africa""",67213
,53567


group_wikipedia_editions,count
str,u32
"""grA""",1547174
"""grB""",744643


bplo1,count
f64,u32
,587627
2.351389,17554
13.383333,15825
-74.0,15234
-0.1275,14228
…,…
13.0852,1
8.04306,1
-95.329445,1
-82.167778,1


dplo1,count
f64,u32
,1614372
2.351389,21586
12.482778,12439
13.383333,11784
-0.1275,10120
…,…
9.05611,1
86.7715,1
-0.327222,1
7.6975,1


bpla1,count
f64,u32
,587627
48.856945,17385
52.516666,16240
40.700001,15232
51.507221,14229
…,…
48.8769,1
51.8839,1
44.605278,1
29.253611,1


dpla1,count
f64,u32
,1614372
48.856945,21490
41.893055,12454
52.516666,11717
51.507221,10130
…,…
52.339699,1
48.938599,1
54.671299,1
47.858891,1


pantheon_1,count
i64,u32
0,2280027
1,11790


level3_all_occ,count
str,u32
"""D:_football_P:…",32329
"""D:_P:""",15417
"""D:_politician_…",13686
"""D:_football_P:…",12250
"""D:_cricket_P:_…",11295
…,…
"""D:_ski_P:_ ski…",1
"""D:_ski_P:_ ski…",1
"""D:_ski_skier_P…",1
"""D:_artist_P:_ …",1
