<div align='center'><strong>Wikipedia Pageviews Project</strong>
<br />
<i>Netanel Madmoni</i>
</div>

----------------

# Introduction

In [1]:
#imports
import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path
from tqdm import tqdm
from functools import reduce
from itertools import product
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os
import json
import seaborn as sns
from rich import print

load_dotenv()

RAW_DATA_DIR = Path(os.getenv('RAW_DATA_DIR'))
PROCESSED_DATA_DIR = Path(os.getenv('PROCESSED_DATA_DIR'))

# Raw Data Acquisition

1. Get List of people on Wikipedia
```sql
SELECT DISTINCT en_wiki -- page title name in english wikipedia	
FROM  `rising-theater-416315.wikipedia_pageviews.wikidata`,
      UNNEST(instance_of) AS instance_of_struct

    
WHERE instance_of_struct.numeric_id = 5 -- instance_of = 5 => person
```

2. Get pagevoew data for those people

      ```sql
      SELECT title, DATETIME_TRUNC(datehour, MONTH) AS month, SUM(views) AS monthly_views

                  
      FROM  `rising-theater-416315.wikipedia_pageviews.pageviews_2023` a
            JOIN `rising-theater-416315.data_for_project.distinct_people` b
            ON a.title = b.en_wiki

      
      WHERE datehour IS NOT NULL
      AND wiki = "en"

      GROUP BY title, DATETIME_TRUNC(datehour, MONTH)
      ```

3. Get wikidata for those people
      ```sql
      SELECT *	
      FROM  `rising-theater-416315.wikipedia_pageviews.wikidata`,
      UNNEST(instance_of) AS instance_of_struct

    
      WHERE instance_of_struct.numeric_id = 5
      ```

# Data transformation

## Views Data

Are organized in csv files, one per year.

In [6]:
# Read raw data
dfs = []
for file in RAW_DATA_DIR.glob(r'monthly_views_*.csv'):
    print(f'{file.name} - {os.stat(file).st_size / 1024 ** 2:.2f} MB)')
    dfs.append(pl.read_csv(file))

print(f'Total files: {len(dfs)}')

monthly_views_2015.csv - 411.06 MB)
monthly_views_2016.csv - 658.91 MB)
monthly_views_2019.csv - 803.42 MB)
monthly_views_2020.csv - 851.54 MB)
monthly_views_2017.csv - 708.39 MB)
monthly_views_2022.csv - 949.62 MB)
monthly_views_2023.csv - 980.47 MB)
monthly_views_2021.csv - 904.32 MB)
monthly_views_2018.csv - 755.98 MB)
Total files: 0


In [None]:
# Pivot
transformed_dfs = []
for df in tqdm(dfs):
    transformed_df = (df.with_columns(pl.col('month')
                                    .map_elements(lambda s: f'{s[:4]}-{s[5:7]}')
                                    .alias('year_month'))
                      .pivot(index='title', columns='year_month', values='monthly_views')
    )
    transformed_dfs.append(transformed_df)

In [None]:
# Join
all_names = pl.concat([df.select('title') for df in dfs]).unique()
print(f'{len(all_names):,}')
views_df = all_names
for i, df in tqdm(enumerate(transformed_dfs)):
    views_df = df.join(views_df, 'title', 'outer_coalesce')
views_df = views_df.select(['title', *sorted([c for c in views_df.columns if c != 'title'])])
views_df.write_parquet(f'{PROCESSED_DATA_DIR}/pageviews_all.parquet')

views_df

## Entity Data

Are organized in json files.

In [9]:
info_files = (RAW_DATA_DIR / 'wikidata').glob('*')
len(list(info_files))

400

For example...

In [86]:
info_files = (RAW_DATA_DIR / 'wikidata').glob('*')
with open(next(info_files)) as f:
    for line in f:
        d = json.loads(line)
        sites = [sitelink['site'] for sitelink in d['sitelinks']]
        if 'enwiki' in sites:
            print(d)
            break
    

In [2]:
try:
    info_df = pl.read_parquet(PROCESSED_DATA_DIR / 'wikidata_all.parquet')
except FileNotFoundError:
    info_list = []
    info_files = (RAW_DATA_DIR / 'wikidata').glob('*')
    for file in info_files:
        with open(file) as f:
            for line in f:
                d = json.loads(line)
                sites = [sitelink['site'] for sitelink in d['sitelinks']]
                if 'enwiki' in sites:
                    info_list.append(d)
    info_df = pl.DataFrame(info_list)
    
info_df.head()

id,numeric_id,en_label,en_wiki,en_description,type,sitelinks,descriptions,labels,aliases,instance_of,gender,date_of_birth,date_of_death,worked_at,country_of_citizenship,country,educated_at,occupation,instrument,genre,industry,subclass_of,coordinate_location,iso_3166_alpha3,member_of,from_fictional_universe
str,str,str,str,str,str,list[struct[3]],list[struct[2]],list[struct[2]],list[struct[2]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[2]],list[null],list[struct[1]],list[struct[1]]
"""Q17122148""","""17122148""","""Albrecht Josep…","""Albrecht_Josep…","""German screenw…","""item""","[{""dewiki"",""Albrecht Joseph"",""Albrecht_Joseph""}, {""enwiki"",""Albrecht Joseph"",""Albrecht_Joseph""}, … {""fawiki"",""آلبرشت یوزف"",""آلبرشت_یوزف""}]","[{""de"",""deutsch-amerikanischer Theater- und Filmschaffender""}, {""fa"",""فیلمنامه‌نویس، تدوینگر، و نویسنده آلمانی""}, … {""sv"",""tysk författare och filmklippare""}]","[{""de"",""Albrecht Joseph""}, {""fr"",""Albrecht Joseph""}, … {""ru"",""Элбрехт Джозеф""}]","[{""de"",""Al Joseph""}, {""en"",""Al Joseph""}, … {""sk"",""Al Joseph""}]","[{""5""}]","[{""6581097""}]","[{""+1901-11-20T00:00:00Z""}]","[{""+1901-11-20T00:00:00Z""}]",[],"[{""183""}]",[],[],"[{""28389""}, {""7042855""}, {""36180""}]",[],[],[],[],[],[],[],[]
"""Q100707809""","""100707809""","""Robert Cahaly""","""Robert_Cahaly""","""American polls…","""item""","[{""enwiki"",""Robert Cahaly"",""Robert_Cahaly""}, {""jawiki"",""ロバート・カヘリー"",""ロバート・カヘリー""}]","[{""en"",""American pollster and political consultant""}, {""zh"",""美国民调专家""}]","[{""en"",""Robert Cahaly""}, {""de"",""Robert Cahaly""}, … {""pt-br"",""Robert Cahaly""}]",[],"[{""5""}]","[{""6581097""}]",[],[],"[{""100704905""}]","[{""30""}]",[],"[{""1024426""}]","[{""16919156""}, {""8125919""}]",[],[],[],[],[],[],[],[]
"""Q8988407""","""8988407""","""Li Fei""","""Lei_Fei""","""actor, busines…","""item""","[{""zhwiki"",""李菲"",""李菲""}, {""enwiki"",""Lei Fei"",""Lei_Fei""}]","[{""nl"",""ondernemer""}, {""en"",""actor, businesswoman, and wushu taolu athlete from Macau""}]","[{""zh"",""李菲""}, {""en"",""Li Fei""}, … {""sq"",""Jewel Lee""}]","[{""en"",""Jewel Lee""}]","[{""5""}]","[{""6581072""}]",[],[],[],[],[],[],"[{""43845""}]",[],[],[],[],[],[],[],[]
"""Q3588237""","""3588237""","""Yang Ti-liang""","""Yang_Ti-liang""","""Hong Kong judg…","""item""","[{""enwiki"",""Yang Ti-liang"",""Yang_Ti-liang""}, {""zhwiki"",""楊鐵樑"",""楊鐵樑""}, … {""kowiki"",""양톄량"",""양톄량""}]","[{""en"",""Hong Kong judge (1929–2023)""}, {""nl"",""rechter uit Brunei""}, … {""ko"",""홍콩의 판사 (1929–2023)""}]","[{""en"",""Yang Ti-liang""}, {""en-ca"",""Yang Ti-liang""}, … {""pap"",""Yang Ti-liang""}]","[{""en"",""Sir Ti-liang Yang""}, {""ko"",""양티량""}, {""ko"",""양철량""}]","[{""5""}]","[{""6581097""}]","[{""+1929-06-30T00:00:00Z""}]","[{""+1929-06-30T00:00:00Z""}]",[],"[{""148""}]",[],"[{""1639978""}, {""193196""}]","[{""16533""}, {""329455""}, {""82955""}]",[],[],[],[],[],[],"[{""157412""}]",[]
"""Q521785""","""521785""","""Mauro Sérgio V…","""Maurinho_(foot…","""Brazilian foot…","""item""","[{""itwiki"",""Mauro Sérgio Viriato Mendes"",""Mauro_Sérgio_Viriato_Mendes""}, {""plwiki"",""Mauro Sérgio Viriato Mendes"",""Mauro_Sérgio_Viriato_Mendes""}, … {""trwiki"",""Maurinho (1978 doğumlu futbolcu)"",""Maurinho_(1978_doğumlu_futbolcu)""}]","[{""it"",""calciatore brasiliano""}, {""fr"",""joueur de football brésilien""}, … {""mos"",""Bal tãongra""}]","[{""it"",""Mauro Sérgio Viriato Mendes""}, {""en"",""Mauro Sérgio Viriato Mendes""}, … {""tr"",""Maurinho (1978 doğumlu futbolcu)""}]","[{""es"",""Mauro Sergio Viriato Mendes""}, {""es"",""Maurinho""}, … {""fr"",""Maurinho""}]","[{""5""}]","[{""6581097""}]","[{""+1978-10-11T00:00:00Z""}]","[{""+1978-10-11T00:00:00Z""}]",[],"[{""155""}]",[],[],"[{""937857""}]",[],[],[],[],[],[],[],[]


Clean info dataframe

In [3]:
print(dict(zip(info_df.columns, info_df.dtypes)))

**Questions for cleaning the data**
1. Can `en_wiki` be a primary key instead of the id columns, in order to match this table to the pageviews one?
1. Are there any columns that contain nothing but empty lists or null values? If so - drop them.
1. Are there any list/struct columns that always contain a single value? If so - flatten them
1. How many unique values does the column `type` contain? If 1 - redundant.
1. Can an entity only have one gender? If so - convert from list of structs to a simple string (gender code). Also, can an entity have no gender? If so - why?

In [51]:
print(info_df['en_wiki'].n_unique(), info_df.height)

In [53]:
info_df.filter(info_df['en_wiki'].is_duplicated()).sort('en_wiki')

id,numeric_id,en_label,en_wiki,en_description,type,sitelinks,descriptions,labels,aliases,instance_of,gender,date_of_birth,date_of_death,worked_at,country_of_citizenship,country,educated_at,occupation,instrument,genre,industry,subclass_of,coordinate_location,iso_3166_alpha3,member_of,from_fictional_universe
str,str,str,str,str,str,list[struct[3]],list[struct[2]],list[struct[2]],list[struct[2]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[2]],list[null],list[struct[1]],list[struct[1]]
"""Q125017311""","""125017311""","""Ashutosh""","""Ashutosh_(spir…",,"""item""","[{""enwiki"",""Ashutosh (spiritual leader)"",""Ashutosh_(spiritual_leader)""}]",[],"[{""en"",""Ashutosh""}]",[],"[{""5""}]","[{""6581097""}]","[{""+1946-01-01T00:00:00Z""}]","[{""+1946-01-01T00:00:00Z""}]",[],[],[],[],[],[],[],[],[],[],[],[],[]
"""Q17488903""","""17488903""","""Ashutosh""","""Ashutosh_(spir…","""Indian spiritu…","""item""","[{""enwiki"",""Ashutosh (spiritual leader)"",""Ashutosh_(spiritual_leader)""}, {""hiwiki"",""आशुतोष (आध्यात्मिक नेता)"",""आशुतोष_(आध्यात्मिक_नेता)""}]","[{""en"",""Indian spiritual leader""}, {""hi"",""भारतीय आध्यात्मिक गुरु""}]","[{""en"",""Ashutosh""}, {""fr"",""Ashutosh""}, … {""hi"",""आशुतोष (आध्यात्मिक नेता)""}]","[{""en"",""Ashutosh Maharaj""}, {""en"",""Mahesh Kumar Jha""}]","[{""5""}]","[{""6581097""}]","[{""+1946-01-01T00:00:00Z""}]","[{""+1946-01-01T00:00:00Z""}]",[],"[{""668""}, {""129286""}, {""1775277""}]",[],[],"[{""64711780""}, {""484260""}]",[],[],[],[],[],[],[],[]
"""Q99237902""","""99237902""","""Ferdinand van …","""Ferdinand_van_…","""Italian noblem…","""item""","[{""enwiki"",""Ferdinand van den Eynde, 1st Marquess of Castelnuovo"",""Ferdinand_van_den_Eynde,_1st_Marquess_of_Castelnuovo""}, {""nlwiki"",""Ferdinand van den Eynde"",""Ferdinand_van_den_Eynde""}, … {""frwiki"",""Ferdinand van den Eynde (1er marquis de Castelnuovo)"",""Ferdinand_van_den_Eynde_(1er_marquis_de_Castelnuovo)""}]","[{""en"",""Italian nobleman and magnate of Flemish origin (d.1674)""}, {""fr"",""noble italien""}, {""it"",""nobile, mercante, collezionista d'arte e mecenate italiano""}]","[{""en"",""Ferdinand van den Eynde, 1st Marquess of Castelnuovo""}, {""nl"",""Ferdinand van den Eynde, 1st Marquess of Castelnuovo""}, … {""fr"",""Ferdinand van den Eynde""}]","[{""en"",""Ferdinand van den Eynde""}, {""en"",""Ferdinando Vandeneynden""}, {""nl"",""Ferdinand van den Eynde""}]","[{""5""}, {""5""}]","[{""6581097""}]",[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
"""Q99237902""","""99237902""","""Ferdinand van …","""Ferdinand_van_…","""Italian noblem…","""item""","[{""enwiki"",""Ferdinand van den Eynde, 1st Marquess of Castelnuovo"",""Ferdinand_van_den_Eynde,_1st_Marquess_of_Castelnuovo""}, {""nlwiki"",""Ferdinand van den Eynde"",""Ferdinand_van_den_Eynde""}, … {""frwiki"",""Ferdinand van den Eynde (1er marquis de Castelnuovo)"",""Ferdinand_van_den_Eynde_(1er_marquis_de_Castelnuovo)""}]","[{""en"",""Italian nobleman and magnate of Flemish origin (d.1674)""}, {""fr"",""noble italien""}, {""it"",""nobile, mercante, collezionista d'arte e mecenate italiano""}]","[{""en"",""Ferdinand van den Eynde, 1st Marquess of Castelnuovo""}, {""nl"",""Ferdinand van den Eynde, 1st Marquess of Castelnuovo""}, … {""fr"",""Ferdinand van den Eynde""}]","[{""en"",""Ferdinand van den Eynde""}, {""en"",""Ferdinando Vandeneynden""}, {""nl"",""Ferdinand van den Eynde""}]","[{""5""}, {""5""}]","[{""6581097""}]",[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
"""Q5638516""","""5638516""","""Hagai Shaham""","""Hagai_Shaham""","""Israeli violin…","""item""","[{""enwiki"",""Hagai Shaham"",""Hagai_Shaham""}, {""jawiki"",""ハガイ・シャハム"",""ハガイ・シャハム""}, … {""arzwiki"",""حجاى شاحام"",""حجاى_شاحام""}]","[{""cs"",""izraelský houslista""}, {""en"",""Israeli violinist""}, … {""fi"",""israelilainen viulisti""}]","[{""en"",""Hagai Shaham""}, {""he"",""חגי שחם""}, … {""arz"",""حجاى شاحام""}]","[{""en"",""Prof. Hagai Shaham""}]","[{""5""}, {""5""}]","[{""6581097""}]","[{""+1966-07-08T00:00:00Z""}]","[{""+1966-07-08T00:00:00Z""}]","[{""4614""}, {""12404543""}]","[{""801""}]",[],[],"[{""16145150""}, {""1259917""}, {""1622272""}]","[{""8355""}]","[{""9730""}]",[],[],[],[],[],[]
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Q107124843""","""107124843""","""Rene Villa""","""René_Villa""","""Costa Rican si…","""item""","[{""enwiki"",""René Villa"",""René_Villa""}]","[{""en"",""Costa Rican singer""}, {""es"",""cantante, compositor y modelo costarricense""}, … {""ga"",""Amhránaí Costa Rican""}]","[{""en"",""Rene Villa""}, {""es"",""René Villa""}, … {""bn"",""রেনে ভিলা""}]","[{""en"",""José René Villafuerte Flores""}, {""en"",""Rene Flores""}, … {""ga"",""José René Villafuerte Flores""}]","[{""5""}, {""5""}]","[{""6581097""}]","[{""+1997-01-26T00:00:00Z""}]","[{""+1997-01-26T00:00:00Z""}]",[],"[{""800""}, {""27""}]",[],[],"[{""488205""}, {""4610556""}, … {""177220""}]","[{""17172850""}]",[],[],[],[],[],[],[]
"""Q7791373""","""7791373""","""Thomas Jones""","""Thomas_Jones_(…","""Welsh civil se…","""item""","[{""cywiki"",""Thomas Jones (1870-1955)"",""Thomas_Jones_(1870-1955)""}, {""enwiki"",""Thomas Jones (civil servant)"",""Thomas_Jones_(civil_servant)""}, … {""enwikiquote"",""Thomas Jones (civil servant)"",""Thomas_Jones_(civil_servant)""}]","[{""en"",""Welsh civil servant and educationalist (1870–1955)""}, {""de"",""britischer Pädagoge, geboren 1870""}, … {""ryu"",""インチリーぬ政治家""}]","[{""cy"",""Thomas Jones""}, {""en"",""Thomas Jones""}, … {""ga"",""Thomas Jones""}]","[{""cy"",""Thomas Jones (T. J.)""}, {""en"",""Tom Jones""}, … {""nl"",""Dr. Thomas Jones""}]","[{""5""}, {""5""}]","[{""6581097""}]","[{""+1870-09-27T00:00:00Z""}]","[{""+1870-09-27T00:00:00Z""}]",[],"[{""145""}, {""25""}, {""174193""}]",[],"[{""6537042""}]","[{""1231865""}, {""212238""}, {""82955""}]",[],[],[],[],[],[],[],[]
"""Q7791373""","""7791373""","""Thomas Jones""","""Thomas_Jones_(…","""Welsh civil se…","""item""","[{""cywiki"",""Thomas Jones (1870-1955)"",""Thomas_Jones_(1870-1955)""}, {""enwiki"",""Thomas Jones (civil servant)"",""Thomas_Jones_(civil_servant)""}, … {""enwikiquote"",""Thomas Jones (civil servant)"",""Thomas_Jones_(civil_servant)""}]","[{""en"",""Welsh civil servant and educationalist (1870–1955)""}, {""de"",""britischer Pädagoge, geboren 1870""}, … {""ryu"",""インチリーぬ政治家""}]","[{""cy"",""Thomas Jones""}, {""en"",""Thomas Jones""}, … {""ga"",""Thomas Jones""}]","[{""cy"",""Thomas Jones (T. J.)""}, {""en"",""Tom Jones""}, … {""nl"",""Dr. Thomas Jones""}]","[{""5""}, {""5""}]","[{""6581097""}]","[{""+1870-09-27T00:00:00Z""}]","[{""+1870-09-27T00:00:00Z""}]",[],"[{""145""}, {""25""}, {""174193""}]",[],"[{""6537042""}]","[{""1231865""}, {""212238""}, {""82955""}]",[],[],[],[],[],[],[],[]
"""Q106232513""","""106232513""","""Oscar Castella…","""Óscar_Castella…","""Guatemalan foo…","""item""","[{""plwiki"",""Oscar Castellanos"",""Oscar_Castellanos""}, {""enwiki"",""Óscar Castellanos"",""Óscar_Castellanos""}, {""eswiki"",""Óscar Antonio Castellanos"",""Óscar_Antonio_Castellanos""}]","[{""pl"",""gwatemalski piłkarz""}, {""en"",""Guatemalan footballer""}, … {""ar"",""لاعب كرة قدم غواتيمالي""}]","[{""pl"",""Oscar Castellanos""}, {""nl"",""Oscar Castellanos""}, … {""pt-br"",""Óscar Castellanos""}]","[{""en"",""Óscar Castellanos""}, {""nl"",""Óscar Castellanos""}]","[{""5""}, {""5""}]","[{""6581097""}]","[{""+2000-01-01T00:00:00Z""}]","[{""+2000-01-01T00:00:00Z""}]",[],[],[],[],"[{""937857""}]",[],[],[],[],[],[],[],[]


In [24]:
# For each list column, view distribution of list lengths 
list_columns = [c for c in info_df.columns if info_df[c].dtype == pl.List]
for column in list_columns:
    print(column, info_df[column].map_elements(len).rename(f'length of {column}').value_counts())

In [6]:
info_df['type'].n_unique()

1

In [16]:
info_df['gender'].map_elements(len).value_counts()

gender,count
i64,u32
3,15
0,302
1,1992615
2,524
4,3


**Observations:**
There are a few types of columns:
1. Redundant columns: columns that don't add information, either because they contain the same value throughout, have many missing values or do not contribute to our specific needs. The columns are:
    - `type` (same value for all rows)
    - `numeric_id` (contained in `id`)
    - `descriptions` (unneeded information)
    - `labels` (unneeded information)
    - `coordinate_location`(>99% empty)
    - `country` (according to the Wikidata site, this property should not be used for humans)
    - `date_of_death` (has wrong values)
These columns will be dropped.
2. Columns with a single value per row: columns that (should) have zero or one values for each row. These columns are:
    - `id`
    - `en_label`
    - `en_wiki`
    - `en_description`
    - `date of birth`
These columns will be... 
3. Columns with a few *'interesting'* values per row: columns that contain zero, one or more values for each row, where every value is *interesting* for our purpose. These columns are:
    - `instance_of`
    - `gender` (apparently an entity can have multiple genders)
4. Columns that have multiple values per row: columns that have multiple values per row, where only an aggregation of these values is *interesting* for our purposes. These columns are:
    - `sitelinks`
    - `aliases`
------

    - `instrument`
    - `worked_at`
    - `country_of_citizenship`
    - `educated_at`
    - `occupation`
    - `genre`
    - `industry`
    - `subclass_of`
    - `member_of`
    - `from_fictional_universe`


- `sitelinks` contains the name of this page in all relevant wikimedia sites
    - Convert this to binary columns: `exists_in_{sitename}`
    - Also create an aggregated column: `num_sitelinks`
- `descriptions`, `labels`
    - I will drop these columns
- `aliases`
    - I will convert this to numeric columns: `num_aliases_in_{sitename}`
- `instance_of`
    -  will convert this to binary columns: `is_instance_of_{parent}`
- `gender` - not always a single one.
    - I will convert this to binary columns: `has_gender_{gender}`
- `date of birth`, `date of death`:
    - Convert to timestamps
- All one-dimensional struct columns:
    - flatten
- `iso_3166_alpha3`
    - drop


In [43]:
def unnest_list_of_structs(l: list[pl.Struct]):
    '''
    Turn a list of structs into a list of the first value of each struct.
    For example:
    `[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}] --> [1, 3]`
    '''
    first_key = l[0].fields[0].name
    return []

info_df['sitelinks'].list.eval(pl.element().struct[0])

sitelinks
list[str]
"[""dewiki"", ""enwiki"", … ""fawiki""]"
"[""enwiki"", ""jawiki""]"
"[""zhwiki"", ""enwiki""]"
"[""enwiki"", ""zhwiki"", … ""kowiki""]"
"[""itwiki"", ""plwiki"", … ""trwiki""]"
…
"[""enwiki"", ""nowiki"", … ""nlwiki""]"
"[""frwiki"", ""kowiki"", … ""anpwiki""]"
"[""frwiki"", ""fawiki"", … ""ruwiki""]"
"[""enwiki"", ""azbwiki""]"


In [46]:
def clean_info_df(df: pl.DataFrame):
    columns_to_drop = ['type', 'descriptions', 'labels', 'iso_3166_alpha3', 'coordinate_location']
    struct_columns = [c for c in df.columns if df[c].dtype == pl.List and c not in columns_to_drop]
    columns_to_encode = [c for c in df.columns if df[c].dtype == pl.List and c not in
                         [*columns_to_drop, 'date_of_birth', 'date_of_death']]
    
    df = (df
          .drop(columns_to_drop) # drop columns
          .with_columns([pl.col(col).list.eval(pl.element().struct[0]) for col in struct_columns]) # unnest structs
    )
    
    unique_sitelinks = df['sitelinks'].explode().unique()
    unique_parents = df['instance_of'].explode().unique()
    unique_
    
    sitelinks_columns = [pl.col('sitelinks').list.contains(site).alias(f'is_in_{site}') for site in unique_sitelinks]
    parents_columns = [pl.col('instance_of').list.contains(parent).alias(f'is_instance_of_{parent}') for parent in unique_parents]
    
    
    df = (df
          .with_columns([
           *sitelinks_columns,
           pl.col('aliases').list.len().alias('num_aliases'),
           *parents_columns,
           
          ]))
    
    return df

clean_info_df(info_df.head(5)).explode('sitelinks').pivot(index='id', )

id,numeric_id,en_label,en_wiki,en_description,sitelinks,aliases,instance_of,gender,date_of_birth,date_of_death,worked_at,country_of_citizenship,country,educated_at,occupation,instrument,genre,industry,subclass_of,coordinate_location,member_of,from_fictional_universe
str,str,str,str,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[f64],list[str],list[str]
"""Q17122148""","""17122148""","""Albrecht Josep…","""Albrecht_Josep…","""German screenw…","""dewiki""","[""de"", ""en"", … ""sk""]","[""5""]","[""6581097""]","[""+1901-11-20T00:00:00Z""]","[""+1901-11-20T00:00:00Z""]",[],"[""183""]",[],[],"[""28389"", ""7042855"", ""36180""]",[],[],[],[],[],[],[]
"""Q17122148""","""17122148""","""Albrecht Josep…","""Albrecht_Josep…","""German screenw…","""enwiki""","[""de"", ""en"", … ""sk""]","[""5""]","[""6581097""]","[""+1901-11-20T00:00:00Z""]","[""+1901-11-20T00:00:00Z""]",[],"[""183""]",[],[],"[""28389"", ""7042855"", ""36180""]",[],[],[],[],[],[],[]
"""Q17122148""","""17122148""","""Albrecht Josep…","""Albrecht_Josep…","""German screenw…","""frwiki""","[""de"", ""en"", … ""sk""]","[""5""]","[""6581097""]","[""+1901-11-20T00:00:00Z""]","[""+1901-11-20T00:00:00Z""]",[],"[""183""]",[],[],"[""28389"", ""7042855"", ""36180""]",[],[],[],[],[],[],[]
"""Q17122148""","""17122148""","""Albrecht Josep…","""Albrecht_Josep…","""German screenw…","""arwiki""","[""de"", ""en"", … ""sk""]","[""5""]","[""6581097""]","[""+1901-11-20T00:00:00Z""]","[""+1901-11-20T00:00:00Z""]",[],"[""183""]",[],[],"[""28389"", ""7042855"", ""36180""]",[],[],[],[],[],[],[]
"""Q17122148""","""17122148""","""Albrecht Josep…","""Albrecht_Josep…","""German screenw…","""arzwiki""","[""de"", ""en"", … ""sk""]","[""5""]","[""6581097""]","[""+1901-11-20T00:00:00Z""]","[""+1901-11-20T00:00:00Z""]",[],"[""183""]",[],[],"[""28389"", ""7042855"", ""36180""]",[],[],[],[],[],[],[]
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Q521785""","""521785""","""Mauro Sérgio V…","""Maurinho_(foot…","""Brazilian foot…","""enwiki""","[""es"", ""es"", … ""fr""]","[""5""]","[""6581097""]","[""+1978-10-11T00:00:00Z""]","[""+1978-10-11T00:00:00Z""]",[],"[""155""]",[],[],"[""937857""]",[],[],[],[],[],[],[]
"""Q521785""","""521785""","""Mauro Sérgio V…","""Maurinho_(foot…","""Brazilian foot…","""arzwiki""","[""es"", ""es"", … ""fr""]","[""5""]","[""6581097""]","[""+1978-10-11T00:00:00Z""]","[""+1978-10-11T00:00:00Z""]",[],"[""155""]",[],[],"[""937857""]",[],[],[],[],[],[],[]
"""Q521785""","""521785""","""Mauro Sérgio V…","""Maurinho_(foot…","""Brazilian foot…","""ptwiki""","[""es"", ""es"", … ""fr""]","[""5""]","[""6581097""]","[""+1978-10-11T00:00:00Z""]","[""+1978-10-11T00:00:00Z""]",[],"[""155""]",[],[],"[""937857""]",[],[],[],[],[],[],[]
"""Q521785""","""521785""","""Mauro Sérgio V…","""Maurinho_(foot…","""Brazilian foot…","""arwiki""","[""es"", ""es"", … ""fr""]","[""5""]","[""6581097""]","[""+1978-10-11T00:00:00Z""]","[""+1978-10-11T00:00:00Z""]",[],"[""155""]",[],[],"[""937857""]",[],[],[],[],[],[],[]


In [96]:
info_df.write_parquet(PROCESSED_DATA_DIR / 'wikidata_all.parquet')

Aliases not included in the pageviews data

Convert code to label description:
- https://stackoverflow.com/questions/59737076/how-to-get-a-label-of-a-property-from-wikidata
- https://stackoverflow.com/questions/72704205/how-to-convert-wiki-data-qid-to-entity-and-vice-versa-in-python

In [72]:
with pl.Config(tbl_rows=500):
    display(info_df.filter(pl.col('gender').list.len() > 1).transpose(include_header=True))

column,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30,column_31,column_32,column_33,column_34,column_35,…,column_505,column_506,column_507,column_508,column_509,column_510,column_511,column_512,column_513,column_514,column_515,column_516,column_517,column_518,column_519,column_520,column_521,column_522,column_523,column_524,column_525,column_526,column_527,column_528,column_529,column_530,column_531,column_532,column_533,column_534,column_535,column_536,column_537,column_538,column_539,column_540,column_541
str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],…,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""id""","[""Q7800135""]","[""Q108381040""]","[""Q55397010""]","[""Q5214843""]","[""Q2994979""]","[""Q107472755""]","[""Q4859279""]","[""Q8051479""]","[""Q1294046""]","[""Q18686095""]","[""Q7116433""]","[""Q55834726""]","[""Q20035643""]","[""Q106785836""]","[""Q7857048""]","[""Q85199656""]","[""Q8023575""]","[""Q3492524""]","[""Q89372815""]","[""Q11467933""]","[""Q708445""]","[""Q55731552""]","[""Q62129117""]","[""Q16300043""]","[""Q100994887""]","[""Q124218234""]","[""Q6256128""]","[""Q1347851""]","[""Q18518779""]","[""Q21062531""]","[""Q114632978""]","[""Q96694248""]","[""Q11548763""]","[""Q106095215""]","[""Q41173""]","[""Q89377845""]",…,"[""Q117552132""]","[""Q6653592""]","[""Q20528796""]","[""Q16731062""]","[""Q20712237""]","[""Q59171056""]","[""Q788298""]","[""Q55314957""]","[""Q29908714""]","[""Q65042596""]","[""Q107118557""]","[""Q275025""]","[""Q706927""]","[""Q83689574""]","[""Q1281867""]","[""Q7802962""]","[""Q117425346""]","[""Q19367437""]","[""Q107118477""]","[""Q704375""]","[""Q20508909""]","[""Q53951730""]","[""Q84429493""]","[""Q27244414""]","[""Q68631381""]","[""Q24700929""]","[""Q6574985""]","[""Q20683861""]","[""Q84196087""]","[""Q13129290""]","[""Q7098066""]","[""Q1033604""]","[""Q1368840""]","[""Q715407""]","[""Q97930223""]","[""Q21062831""]","[""Q42291398""]"
"""numeric_id""","[""7800135""]","[""108381040""]","[""55397010""]","[""5214843""]","[""2994979""]","[""107472755""]","[""4859279""]","[""8051479""]","[""1294046""]","[""18686095""]","[""7116433""]","[""55834726""]","[""20035643""]","[""106785836""]","[""7857048""]","[""85199656""]","[""8023575""]","[""3492524""]","[""89372815""]","[""11467933""]","[""708445""]","[""55731552""]","[""62129117""]","[""16300043""]","[""100994887""]","[""124218234""]","[""6256128""]","[""1347851""]","[""18518779""]","[""21062531""]","[""114632978""]","[""96694248""]","[""11548763""]","[""106095215""]","[""41173""]","[""89377845""]",…,"[""117552132""]","[""6653592""]","[""20528796""]","[""16731062""]","[""20712237""]","[""59171056""]","[""788298""]","[""55314957""]","[""29908714""]","[""65042596""]","[""107118557""]","[""275025""]","[""706927""]","[""83689574""]","[""1281867""]","[""7802962""]","[""117425346""]","[""19367437""]","[""107118477""]","[""704375""]","[""20508909""]","[""53951730""]","[""84429493""]","[""27244414""]","[""68631381""]","[""24700929""]","[""6574985""]","[""20683861""]","[""84196087""]","[""13129290""]","[""7098066""]","[""1033604""]","[""1368840""]","[""715407""]","[""97930223""]","[""21062831""]","[""42291398""]"
"""en_label""","[""Tian Lingzi""]","[""Devon Price""]","[""Rivers Solomon""]","[""Dana Stone""]","[""Constantine Gongyles""]","[""Marietu Tenuche""]","[""Barbara J. Sahakian""]","[""Yelena Komarova""]","[""James Barry""]","[""Miss Fame""]","[""Ozaawindib""]","[""Joss Favela""]","[""Ebele Oseye""]","[""Gray Johnson Poole""]","[""Tutu Chengcui""]","[""Lori Campbell""]","[""Wimund""]","[""B-Complex""]","[""Mathilde Stuyvesant""]","[""Nao-cola Yamazaki""]","[""Ivy Ling Po""]","[""Kristian Ranđelović""]","[""Laurin Hendrix""]","[""Pekka J. Korvenheimo""]","[""Maya the drag queen""]","[""Leilani Tominiko""]","[""John/Eleanor Rykener""]","[""John the Orphanotrophos""]","[""Suruli Manohar""]","[""N.D. Stevenson""]","[""Romaizah binti Haji Mohd Salleh""]","[""Brendan Allen""]","[""Yōko Mizuki""]","[""Anton Kryzhanovsky""]","[""Demi Lovato""]","[""Angela Miri""]",…,"[""Iz Hesketh""]","[""Liu Jishu""]","[""Romi Mankin""]","[""Janae Kroc""]","[""Conn O'Neill""]","[""Elisa Rae Shupe""]","[""Ira Schneider""]","[""Joan Rosenbaum""]","[""Gopi Shankar Madurai""]","[""Narthaki Nataraj""]","[""To’oto’oali’I Roger Stanley""]","[""Yumjaagiin Tsedenbal""]","[""Cao Teng""]","[""Jean Hewitt""]","[""Nicetas I of Constantinople""]","[""Tilsa Tsuchiya""]","[""Isabel Ruffell""]","[""Thomas(ine) Hall""]","[""Ymania Brown""]","[""Wang Zhen""]","[""Gohar Muradyan""]","[""Tunde Olaniran""]","[""Flor Amargo""]","[""SJ Sindu""]","[""River Gallo""]","[""Liu Chenggui""]","[""Tessa Violet""]","[""Ruth Baldacchino""]","[""Debbie Farhat""]","[""J. Beverley Smith""]","[""Ophrah Shemesh""]","[""Béla Hankó""]","[""Essie Summers""]","[""Le Van Duyet""]","[""Andrea Lawlor""]","[""Sheena Metal""]","[""Nadira Ilana""]"
"""en_wiki""","[""Tian_Lingzi""]","[""Devon_Price""]","[""Rivers_Solomon""]","[""Dana_Stone""]","[""Constantine_Gongyles""]","[""Marietu_Tenuche""]","[""Barbara_Sahakian""]","[""Yelena_Komarova""]","[""James_Barry_(surgeon)""]","[""Miss_Fame""]","[""Ozaawindib""]","[""Joss_Favela""]","[""Ebele_Oseye""]","[""Gray_Johnson_Poole""]","[""Tutu_Chengcui""]","[""Lori_Campbell""]","[""Wimund""]","[""B-Complex""]","[""Mathilde_Stuyvesant""]","[""Nao-Cola_Yamazaki""]","[""Ivy_Ling_Po""]","[""Kristian_Ranđelović""]","[""Laurin_Hendrix""]","[""Pekka_J._Korvenheimo""]","[""Maya_the_Drag_Queen""]","[""Leilani_Tominiko""]","[""John/Eleanor_Rykener""]","[""John_the_Orphanotrophos""]","[""Suruli_Manohar""]","[""ND_Stevenson""]","[""Romaizah_Mohd_Salleh""]","[""Brendan_Allen""]","[""Yoko_Mizuki""]","[""Anton_Krzyzanowski""]","[""Demi_Lovato""]","[""Angela_Miri""]",…,"[""Iz_Hesketh""]","[""Liu_Jishu""]","[""Romi_Mankin""]","[""Janae_Kroc""]","[""Conn_O'Neill_(prisoner)""]","[""Elisa_Rae_Shupe""]","[""Ira_Schneider""]","[""Joan_Rosenbaum""]","[""Gopi_Shankar_Madurai""]","[""Narthaki_Nataraj""]","[""Toʻotoʻoaliʻi_Roger_Stanley""]","[""Yumjaagiin_Tsedenbal""]","[""Cao_Teng""]","[""Jean_Hewitt""]","[""Nicetas_I_of_Constantinople""]","[""Tilsa_Tsuchiya""]","[""Isabel_Ruffell""]","[""Thomas(ine)_Hall""]","[""Ymania_Brown""]","[""Wang_Zhen_(eunuch)""]","[""Gohar_Muradyan""]","[""Tunde_Olaniran""]","[""Flor_Amargo""]","[""S._J._Sindu""]","[""River_Gallo""]","[""Liu_Chenggui""]","[""Tessa_Violet""]","[""Ruth_Baldacchino""]","[""Debbie_Farhat""]","[""J._Beverley_Smith""]","[""Ophrah_Shemesh""]","[""Béla_Hankó""]","[""Essie_Summers""]","[""Lê_Văn_Duyệt""]","[""Andrea_Lawlor""]","[""Sheena_Metal""]","[""Nadira_Ilana""]"
"""en_description…","[""Eunuch who had a stranglehold on power due to his close personal relationship with Emperor Xizong""]","[""American social psychologist and writer""]","[""American science fiction author (1989-)""]","[""American photojournalist (1939-1970)""]","[""Byzantine aristocrat and official""]","[""Nigerian academic""]","[""neuropsychologist""]","[""Olympic wrestler""]","[""19th century British surgeon known for medical reforms and personal life""]","[""American drag queen""]","[""19th-century Ojibwa warrior described as an ayaakwe""]","[""Mexican singer-songwriter""]","[""poet and fiction writer""]",,"[""eunuch during the reign of Emperor Xianzong of Tang""]","[""Cree-Métis educator""]","[""English bishop and warlord""]","[""Slovakian musician and DJ""]",,"[""Japanese novelist and essayist (1978-)""]","[""Hong Kong actress and singer""]","[""intersex transgender activist""]","[""American politician""]","[""Finnish diplomat""]","[""performer, singer, activist and drag queen""]","[""Samoan–New Zealand professional wrestler""]","[""Medieval English sex-worker""]","[""Chief Court eunuch (1000-1043)""]","[""actor""]","[""American comic writer and artist""]","[""politician in Brunei""]","[""American mixed martial artist""]","[""Japanese screenwriter (1910-2003)""]","[""intersex activist""]","[""American singer and actress""]","[""Nigerian academic""]",…,"[""English stage actor and drag queen""]","[""Commander of the Shence Armies""]","[""researcher""]","[""American powerlifter and bodybuilder""]","[""(1601-1622)""]","[""American writer""]","[""American artist""]","[""American museum curator""]","[""Indian indigenist and politician""]","[""Indian dancer""]","[""Samoan fa'afafine activist""]","[""Prime Minister and President of Mongolia (1916-1991)""]","[""2nd century Eastern Han dynasty eunuch official""]","[""english-American food writer and home economist""]","[""Patriarch of Constantinople""]","[""Peruvian painter and printmaker (died 1984)""]","[""classical philologist""]","[""English servant""]","[""trans rights activist""]","[""Ming Dynasty eunuch, died 1449""]","[""Armenian philologist, translator""]","[""musician from Flint, Michigan""]","[""Mexican musician""]","[""Sri Lankan American novelist and short story writer""]","[""American filmmaker, actor, model, and intersex rights activist""]","[""Song Dynasty official and expert on weights and measures""]","[""American singer""]","[""Maltese LGBT and intersex activist""]","[""American politician""]","[""Welsh historian and author""]","[""American-Israeli artist""]","[""Hungarian zoologist""]","[""New Zealand writer (1912-1998)""]","[""Vietnamese general, mandarin""]","[""writer and professor of English""]","[""American talk-show host""]","[""Malaysian writer, filmmaker and activist""]"
"""type""","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]",…,"[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]","[""item""]"
"""sitelinks""","[""{""enwiki"",""Tian Lingzi"",""Tian_Lingzi""}"", ""{""zhwiki"",""田令孜"",""田令孜""}"", … ""{""jawiki"",""田令孜"",""田令孜""}""]","[""{""enwiki"",""Devon Price"",""Devon_Price""}"", ""{""dewiki"",""Devon Price"",""Devon_Price""}"", … ""{""euwiki"",""Devon Price"",""Devon_Price""}""]","[""{""enwiki"",""Rivers Solomon"",""Rivers_Solomon""}"", ""{""frwiki"",""Rivers Solomon"",""Rivers_Solomon""}"", … ""{""eswiki"",""Rivers Solomon"",""Rivers_Solomon""}""]","[""{""enwiki"",""Dana Stone"",""Dana_Stone""}"", ""{""bgwiki"",""Дейна Стоун"",""Дейна_Стоун""}"", … ""{""frwiki"",""Dana Stone"",""Dana_Stone""}""]","[""{""frwiki"",""Constantin Gongylès"",""Constantin_Gongylès""}"", ""{""shwiki"",""Konstantin Gongil"",""Konstantin_Gongil""}"", … ""{""ruwiki"",""Константин Гонгил"",""Константин_Гонгил""}""]","[""{""enwiki"",""Marietu Tenuche"",""Marietu_Tenuche""}"", ""{""enwikiquote"",""Marietu Tenuche"",""Marietu_Tenuche""}"", … ""{""igwiki"",""Marietu Tenuche"",""Marietu_Tenuche""}""]","[""{""enwiki"",""Barbara Sahakian"",""Barbara_Sahakian""}"", ""{""azbwiki"",""باربارا ساکاکیان"",""باربارا_ساکاکیان""}"", … ""{""hywiki"",""Բարբարա Սահակյան"",""Բարբարա_Սահակյան""}""]","[""{""enwiki"",""Yelena Komarova"",""Yelena_Komarova""}"", ""{""plwiki"",""Ołena Komarowa"",""Ołena_Komarowa""}"", … ""{""azwiki"",""Yelena Komarova"",""Yelena_Komarova""}""]","[""{""enwiki"",""James Barry (surgeon)"",""James_Barry_(surgeon)""}"", ""{""eswiki"",""James Barry"",""James_Barry""}"", … ""{""gawiki"",""Margaret Ann Bulkley (James Barry)"",""Margaret_Ann_Bulkley_(James_Barry)""}""]","[""{""enwiki"",""Miss Fame"",""Miss_Fame""}"", ""{""nlwiki"",""Miss Fame"",""Miss_Fame""}"", … ""{""pawiki"",""ਮਿਸ ਫੇਮ"",""ਮਿਸ_ਫੇਮ""}""]","[""{""enwiki"",""Ozaawindib"",""Ozaawindib""}""]","[""{""eswiki"",""Joss Favela"",""Joss_Favela""}"", ""{""commonswiki"",""Category:Joss Favela"",""Category:Joss_Favela""}"", … ""{""enwiki"",""Joss Favela"",""Joss_Favela""}""]","[""{""enwiki"",""Ebele Oseye"",""Ebele_Oseye""}"", ""{""igwiki"",""Ebele Oseye"",""Ebele_Oseye""}""]","[""{""enwiki"",""Gray Johnson Poole"",""Gray_Johnson_Poole""}""]","[""{""enwiki"",""Tutu Chengcui"",""Tutu_Chengcui""}"", ""{""zhwiki"",""吐突承璀"",""吐突承璀""}""]","[""{""enwiki"",""Lori Campbell"",""Lori_Campbell""}""]","[""{""enwiki"",""Wimund"",""Wimund""}"", ""{""nowiki"",""Wimund"",""Wimund""}"", … ""{""eswiki"",""Wimund"",""Wimund""}""]","[""{""cswiki"",""B-Complex"",""B-Complex""}"", ""{""commonswiki"",""Category:B-Complex"",""Category:B-Complex""}"", … ""{""ukwiki"",""B-Complex"",""B-Complex""}""]","[""{""enwiki"",""Mathilde Stuyvesant"",""Mathilde_Stuyvesant""}""]","[""{""jawiki"",""山崎ナオコーラ"",""山崎ナオコーラ""}"", ""{""enwiki"",""Nao-Cola Yamazaki"",""Nao-Cola_Yamazaki""}"", ""{""cswiki"",""Nao-Cola Jamazaki"",""Nao-Cola_Jamazaki""}""]","[""{""enwiki"",""Ivy Ling Po"",""Ivy_Ling_Po""}"", ""{""frwiki"",""Ivy Ling Po"",""Ivy_Ling_Po""}"", … ""{""thwiki"",""หลิง ปัว"",""หลิง_ปัว""}""]","[""{""enwiki"",""Kristian Ranđelović"",""Kristian_Ranđelović""}"", ""{""ruwiki"",""Ранджелович, Кристиан"",""Ранджелович,_Кристиан""}"", … ""{""srwiki"",""Кристиан Ранђеловић"",""Кристиан_Ранђеловић""}""]","[""{""enwiki"",""Laurin Hendrix"",""Laurin_Hendrix""}""]","[""{""fiwiki"",""Pekka J. Korvenheimo"",""Pekka_J._Korvenheimo""}"", ""{""enwiki"",""Pekka J. Korvenheimo"",""Pekka_J._Korvenheimo""}""]","[""{""enwiki"",""Maya the Drag Queen"",""Maya_the_Drag_Queen""}"", ""{""pawiki"",""ਮਾਇਆ ਦ ਡਰੈਗ ਕੁਈਨ"",""ਮਾਇਆ_ਦ_ਡਰੈਗ_ਕੁਈਨ""}""]","[""{""enwiki"",""Leilani Tominiko"",""Leilani_Tominiko""}""]","[""{""ptwiki"",""John Rykener"",""John_Rykener""}"", ""{""trwiki"",""John Rykener"",""John_Rykener""}"", … ""{""enwiki"",""John/Eleanor Rykener"",""John/Eleanor_Rykener""}""]","[""{""dewiki"",""Johannes Orphanotrophos"",""Johannes_Orphanotrophos""}"", ""{""frwiki"",""Jean l'Orphanotrophe"",""Jean_l'Orphanotrophe""}"", … ""{""itwiki"",""Giovanni l'Orfanotrofo"",""Giovanni_l'Orfanotrofo""}""]","[""{""enwiki"",""Suruli Manohar"",""Suruli_Manohar""}"", ""{""tawiki"",""சுருளி மனோகர்"",""சுருளி_மனோகர்""}""]","[""{""arwiki"",""نويل ستيفنسون"",""نويل_ستيفنسون""}"", ""{""enwiki"",""ND Stevenson"",""ND_Stevenson""}"", … ""{""euwiki"",""ND Stevenson"",""ND_Stevenson""}""]","[""{""commonswiki"",""Category:Romaizah Mohd Salleh"",""Category:Romaizah_Mohd_Salleh""}"", ""{""enwiki"",""Romaizah Mohd Salleh"",""Romaizah_Mohd_Salleh""}""]","[""{""enwiki"",""Brendan Allen"",""Brendan_Allen""}"", ""{""ptwiki"",""Brendan Allen"",""Brendan_Allen""}"", … ""{""arwiki"",""بريندان ألين"",""بريندان_ألين""}""]","[""{""jawiki"",""水木洋子"",""水木洋子""}"", ""{""enwiki"",""Yoko Mizuki"",""Yoko_Mizuki""}"", … ""{""fawiki"",""یوکو میزوکی"",""یوکو_میزوکی""}""]","[""{""enwiki"",""Anton Krzyzanowski"",""Anton_Krzyzanowski""}""]","[""{""enwikiquote"",""Demi Lovato"",""Demi_Lovato""}"", ""{""plwikiquote"",""Demi Lovato"",""Demi_Lovato""}"", … ""{""fawikiquote"",""دمی لواتو"",""دمی_لواتو""}""]","[""{""enwiki"",""Angela Miri"",""Angela_Miri""}"", ""{""igwiki"",""Angela Miri"",""Angela_Miri""}"", … ""{""enwikiquote"",""Angela Miri"",""Angela_Miri""}""]",…,"[""{""enwiki"",""Iz Hesketh"",""Iz_Hesketh""}""]","[""{""enwiki"",""Liu Jishu"",""Liu_Jishu""}"", ""{""zhwiki"",""劉季述"",""劉季述""}"", ""{""jawiki"",""劉季述"",""劉季述""}""]","[""{""etwiki"",""Romi Mankin"",""Romi_Mankin""}"", ""{""enwiki"",""Romi Mankin"",""Romi_Mankin""}""]","[""{""plwiki"",""Janae Marie Kroc"",""Janae_Marie_Kroc""}"", ""{""cswiki"",""Janae Marie Kroc"",""Janae_Marie_Kroc""}"", … ""{""fawiki"",""جنای کروک"",""جنای_کروک""}""]","[""{""enwiki"",""Conn O'Neill (prisoner)"",""Conn_O'Neill_(prisoner)""}""]","[""{""arzwiki"",""جيمس شوبى"",""جيمس_شوبى""}"", ""{""enwiki"",""Elisa Rae Shupe"",""Elisa_Rae_Shupe""}"", … ""{""frwiki"",""Elisa Rae Shupe"",""Elisa_Rae_Shupe""}""]","[""{""dewiki"",""Ira Schneider"",""Ira_Schneider""}"", ""{""enwiki"",""Ira Schneider"",""Ira_Schneider""}"", … ""{""arzwiki"",""ايرا شنايدر"",""ايرا_شنايدر""}""]","[""{""enwiki"",""Joan Rosenbaum"",""Joan_Rosenbaum""}""]","[""{""enwiki"",""Gopi Shankar Madurai"",""Gopi_Shankar_Madurai""}"", ""{""pnbwiki"",""گوپی شنکر مدورائی"",""گوپی_شنکر_مدورائی""}"", … ""{""cawiki"",""Gopi Shankar Madurai"",""Gopi_Shankar_Madurai""}""]","[""{""tawiki"",""நர்த்தகி நடராஜ்"",""நர்த்தகி_நடராஜ்""}"", ""{""enwiki"",""Narthaki Nataraj"",""Narthaki_Nataraj""}"", … ""{""tewiki"",""నర్తకి నటరాజ్"",""నర్తకి_నటరాజ్""}""]","[""{""frwiki"",""To’oto’oali’I Roger Stanley"",""To’oto’oali’I_Roger_Stanley""}"", ""{""enwiki"",""Toʻotoʻoaliʻi Roger Stanley"",""Toʻotoʻoaliʻi_Roger_Stanley""}""]","[""{""commonswiki"",""Category:Yumjaagiin Tsedenbal"",""Category:Yumjaagiin_Tsedenbal""}"", ""{""enwiki"",""Yumjaagiin Tsedenbal"",""Yumjaagiin_Tsedenbal""}"", … ""{""kkwiki"",""Юмжагийн Цеденбал"",""Юмжагийн_Цеденбал""}""]","[""{""zhwiki"",""曹騰"",""曹騰""}"", ""{""dewiki"",""Cao Teng"",""Cao_Teng""}"", … ""{""thwiki"",""โจเท้ง"",""โจเท้ง""}""]","[""{""enwiki"",""Jean Hewitt"",""Jean_Hewitt""}""]","[""{""ptwiki"",""Nicetas I de Constantinopla"",""Nicetas_I_de_Constantinopla""}"", ""{""skwiki"",""Nikétas I."",""Nikétas_I.""}"", … ""{""cawiki"",""Nicetes I"",""Nicetes_I""}""]","[""{""enwiki"",""Tilsa Tsuchiya"",""Tilsa_Tsuchiya""}"", ""{""eswiki"",""Tilsa Tsuchiya"",""Tilsa_Tsuchiya""}"", … ""{""jawiki"",""ティルサ・ツチヤ"",""ティルサ・ツチヤ""}""]","[""{""enwiki"",""Isabel Ruffell"",""Isabel_Ruffell""}""]","[""{""enwiki"",""Thomas(ine) Hall"",""Thomas(ine)_Hall""}"", ""{""ruwiki"",""Холл, Томас(ин)"",""Холл,_Томас(ин)""}""]","[""{""enwiki"",""Ymania Brown"",""Ymania_Brown""}""]","[""{""zhwiki"",""王振"",""王振""}"", ""{""dewiki"",""Wang Zhen (Ming)"",""Wang_Zhen_(Ming)""}"", … ""{""ptwiki"",""Wang Zhen (eunuco)"",""Wang_Zhen_(eunuco)""}""]","[""{""hywiki"",""Գոհար Մուրադյան"",""Գոհար_Մուրադյան""}"", ""{""enwiki"",""Gohar Muradyan"",""Gohar_Muradyan""}"", … ""{""arzwiki"",""جوهر مراديان"",""جوهر_مراديان""}""]","[""{""enwiki"",""Tunde Olaniran"",""Tunde_Olaniran""}"", ""{""igwiki"",""Tunde Olaniran"",""Tunde_Olaniran""}""]","[""{""eswiki"",""Flor Amargo"",""Flor_Amargo""}"", ""{""frwiki"",""Flor Amargo"",""Flor_Amargo""}"", … ""{""commonswiki"",""Category:Flor Amargo"",""Category:Flor_Amargo""}""]","[""{""enwiki"",""S. J. Sindu"",""S._J._Sindu""}""]","[""{""enwiki"",""River Gallo"",""River_Gallo""}"", ""{""ruwiki"",""Галло, Ривер"",""Галло,_Ривер""}"", … ""{""arzwiki"",""ريفر جالو"",""ريفر_جالو""}""]","[""{""enwiki"",""Liu Chenggui"",""Liu_Chenggui""}"", ""{""zhwiki"",""劉承規"",""劉承規""}""]","[""{""commonswiki"",""Category:Tessa Violet"",""Category:Tessa_Violet""}"", ""{""enwiki"",""Tessa Violet"",""Tessa_Violet""}"", … ""{""ptwiki"",""Tessa Violet"",""Tessa_Violet""}""]","[""{""enwiki"",""Ruth Baldacchino"",""Ruth_Baldacchino""}"", ""{""ruwiki"",""Балдаккино, Рут"",""Балдаккино,_Рут""}"", … ""{""eowiki"",""Ruth Baldacchino"",""Ruth_Baldacchino""}""]","[""{""enwiki"",""Debbie Farhat"",""Debbie_Farhat""}""]","[""{""cywiki"",""J. Beverley Smith"",""J._Beverley_Smith""}"", ""{""enwiki"",""J. Beverley Smith"",""J._Beverley_Smith""}"", ""{""arzwiki"",""جيه بيفرلى سميث"",""جيه_بيفرلى_سميث""}""]","[""{""enwiki"",""Ophrah Shemesh"",""Ophrah_Shemesh""}"", ""{""commonswiki"",""Category:Ophrah Shemesh"",""Category:Ophrah_Shemesh""}"", ""{""arzwiki"",""اوفراه شيميش"",""اوفراه_شيميش""}""]","[""{""eowiki"",""Béla Hankó"",""Béla_Hankó""}"", ""{""huwiki"",""Hankó Béla"",""Hankó_Béla""}"", … ""{""enwiki"",""Béla Hankó"",""Béla_Hankó""}""]","[""{""enwiki"",""Essie Summers"",""Essie_Summers""}"", ""{""dewiki"",""Essie Summers"",""Essie_Summers""}""]","[""{""enwiki"",""Lê Văn Duyệt"",""Lê_Văn_Duyệt""}"", ""{""frwiki"",""Lê Văn Duyệt"",""Lê_Văn_Duyệt""}"", … ""{""ruwiki"",""Ле Ван Зует"",""Ле_Ван_Зует""}""]","[""{""enwiki"",""Andrea Lawlor"",""Andrea_Lawlor""}""]","[""{""enwiki"",""Sheena Metal"",""Sheena_Metal""}""]","[""{""enwiki"",""Nadira Ilana"",""Nadira_Ilana""}""]"
"""descriptions""","[""{""en"",""Eunuch who had a stranglehold on power due to his close personal relationship with Emperor Xizong""}""]","[""{""en"",""American social psychologist and writer""}"", ""{""nl"",""Amerikaans academicus""}"", … ""{""de"",""US-amerikanischer Psychologe und Autor""}""]","[""{""en"",""American science fiction author (1989-)""}"", ""{""nl"",""sciencefictionschrijver""}"", … ""{""it"",""scrittore di fantascienza statunitense (1989-)""}""]","[""{""fa"",""عکاس آمریکایی""}"", ""{""en"",""American photojournalist (1939-1970)""}"", … ""{""dag"",""United States of America artist ŋun nyɛ paɣa""}""]","[""{""en"",""Byzantine aristocrat and official""}"", ""{""tr"",""Bizanslı aristokrat ve subay""}""]","[""{""en"",""Nigerian academic""}"", ""{""ar"",""أكاديمية نيجيرية""}"", ""{""es"",""académica nigeriana""}""]","[""{""en"",""neuropsychologist""}"", ""{""nl"",""psycholoog""}""]","[""{""en"",""Olympic wrestler""}"", ""{""nl"",""amateurworstelaarster uit Azerbeidzjan""}"", … ""{""uk"",""борчиня, учасниця Олімпійських ігор""}""]","[""{""de"",""Arzt in der englischen Armee""}"", ""{""en"",""19th century British surgeon known for medical reforms and personal life""}"", … ""{""he"",""כירורג בריטי""}""]","[""{""en"",""American drag queen""}"", ""{""fr"",""drage queen américaine""}"", … ""{""dag"",""O nyɛla USA yiliyiinda""}""]","[""{""en"",""19th-century Ojibwa warrior described as an ayaakwe""}"", ""{""ru"",""воин-оджибва XIX века, описанный как аяакве""}""]","[""{""es"",""compositor y cantante mexicano""}"", ""{""ast"",""cantante mexicanu""}"", … ""{""et"",""Mehhiko laulja""}""]","[""{""en"",""poet and fiction writer""}"", ""{""nl"",""Amerikaans dichter""}"", … ""{""ig"",""Onye na-ede uri na onye ode akụkọ ifo""}""]",[],"[""{""en"",""eunuch during the reign of Emperor Xianzong of Tang""}"", ""{""nl"",""politicus""}""]","[""{""en"",""Cree-Métis educator""}"", ""{""ast"",""política canadiana""}""]","[""{""en"",""English bishop and warlord""}"", ""{""nl"",""monnik""}"", ""{""de"",""Schottischer Geistlicher und Rebell""}""]","[""{""en"",""Slovakian musician and DJ""}"", ""{""cs"",""slovenská kytaristka a hudební producentka""}"", … ""{""ru"",""словацкая диджей и продюсер в жанре драм-н-бейс""}""]",[],"[""{""en"",""Japanese novelist and essayist (1978-)""}"", ""{""ar"",""روائية يابانية""}"", … ""{""ja"",""日本の小説家、エッセイスト (1978-)""}""]","[""{""en"",""Hong Kong actress and singer""}"", ""{""ru"",""китайская, гонконгская и тайваньская киноактриса, певица""}"", … ""{""pap"",""aktor chines""}""]","[""{""en"",""intersex transgender activist""}""]","[""{""en"",""American politician""}"", ""{""fr"",""personnalité politique américaine""}"", … ""{""ryu"",""アミリカ合衆国ぬ政治家""}""]","[""{""ast"",""diplomática finlandesa""}"", ""{""es"",""diplomática finlandesa""}"", … ""{""fi"",""suomalainen diplomaatti""}""]","[""{""en"",""performer, singer, activist and drag queen""}""]","[""{""en"",""Samoan–New Zealand professional wrestler""}""]","[""{""en"",""Medieval English sex-worker""}"", ""{""es"",""trabajador/a sexual inglés/a medieval""}""]","[""{""de"",""Eunuch im byzantinischen Reich""}"", ""{""en"",""Chief Court eunuch (1000-1043)""}"", ""{""es"",""eunuco de la corte del Emperador bizantino Romano III""}""]","[""{""en"",""actor""}"", ""{""nl"",""Indiaas acteur (?-2014)""}"", … ""{""cy"",""actores""}""]","[""{""en"",""American comic writer and artist""}"", ""{""es"",""ilustrador y escritor de cómics estadounidense""}"", … ""{""ca"",""il·lustrador, artista i escriptor de còmics estatunidenc""}""]","[""{""en"",""politician in Brunei""}"", ""{""ar"",""سياسية برونية""}"", … ""{""ro"",""politiciană din Brunei""}""]","[""{""en"",""American mixed martial artist""}"", ""{""he"",""אמן לחימה משולבת אמריקאי""}"", … ""{""es"",""artista marcial mixto estadounidense""}""]","[""{""en"",""Japanese screenwriter (1910-2003)""}"", ""{""ja"",""日本の脚本家""}"", … ""{""tr"",""Japon senarist (1910 – 2003)""}""]","[""{""en"",""intersex activist""}""]","[""{""it"",""cantante, compositrice e attrice statunitense""}"", ""{""ilo"",""Amerikana a kumakanta""}"", … ""{""bug"",""Penyanyi sibawa aktris Amerika""}""]","[""{""en"",""Nigerian academic""}"", ""{""es"",""académica nigeriana""}"", … ""{""ig"",""agụmakwụkwọ Naijiria""}""]",…,"[""{""en"",""English stage actor and drag queen""}""]","[""{""en"",""Commander of the Shence Armies""}"", ""{""nl"",""politicus""}""]","[""{""nl"",""natuurkundige""}"", ""{""en"",""researcher""}"", … ""{""en-us"",""researcher""}""]","[""{""en"",""American powerlifter and bodybuilder""}"", ""{""nl"",""Amerikaans model""}"", … ""{""cs"",""americká powerlifterka a bodybuilderka""}""]","[""{""en"",""(1601-1622)""}"", ""{""uk"",""(1601-1622)""}""]","[""{""en"",""American writer""}"", ""{""nl"",""Amerikaans activist""}"", … ""{""ja"",""アメリカ合衆国の作家""}""]","[""{""de"",""US-amerikanischer Videokünstler und Fotograf""}"", ""{""en"",""American artist""}"", … ""{""hy"",""ամերիկացի արվեստագետ""}""]","[""{""nl"",""museumdirecteur""}"", ""{""en"",""American museum curator""}"", ""{""fr"",""Rosenbaum, Joan""}""]","[""{""en"",""Indian indigenist and politician""}"", ""{""de"",""indische intersexuelle Person, aktivistisch tätig für Intersex- und LGBT-Rechte""}"", … ""{""ca"",""indigenista i polític indi""}""]","[""{""en"",""Indian dancer""}"", ""{""nl"",""Indiaas danseres""}"", … ""{""eu"",""dantzari indiarra""}""]","[""{""en"",""Samoan fa'afafine activist""}"", ""{""nl"",""activist uit Samoa (?-2018)""}"", … ""{""fr"",""millitante samoane""}""]","[""{""it"",""politico mongolo""}"", ""{""de"",""mongolischer Staatsführer""}"", … ""{""pl"",""mongolski polityk""}""]","[""{""de"",""Eunuch der Han-Dynastie""}"", ""{""en"",""2nd century Eastern Han dynasty eunuch official""}"", … ""{""ja"",""中国後漢末期の宦官。中常侍・大長秋。""}""]","[""{""en"",""english-American food writer and home economist""}""]","[""{""el"",""Πατριάρχης Κωνσταντινουπόλεως""}"", ""{""en"",""Patriarch of Constantinople""}"", … ""{""ca"",""patriarca de Constantinoble""}""]","[""{""en"",""Peruvian painter and printmaker (died 1984)""}"", ""{""nl"",""kunstschilderes uit Peru (1928-1984)""}"", … ""{""ja"",""ペルーの画家""}""]","[""{""fr"",""philologue classique""}"", ""{""en"",""classical philologist""}""]","[""{""en"",""English servant""}"", ""{""fr"",""Servant(e) anglais(e) en Virginie coloniale""}"", ""{""nl"",""huishoudelijke hulp""}""]","[""{""en"",""trans rights activist""}""]","[""{""de"",""chinesischer Eunuch der Ming-Dynastie""}"", ""{""en"",""Ming Dynasty eunuch, died 1449""}"", … ""{""cs"",""čínský eunuch a politik""}""]","[""{""hy"",""հայ բանասեր, թարգմանիչ""}"", ""{""nl"",""vertaalster uit Armenië""}"", … ""{""fr"",""philologue arménienne""}""]","[""{""nl"",""muzikant""}"", ""{""en"",""musician from Flint, Michigan""}""]","[""{""en"",""Mexican musician""}"", ""{""es"",""cantante, letrista, arreglista y pianista de México""}"", … ""{""cy"",""cyfansoddwr a aned yn 1988""}""]","[""{""en"",""Sri Lankan American novelist and short story writer""}"", ""{""te"",""శ్రీలంక అమెరికన్ నవలా రచయిత్రి మరియు చిన్న కథా రచయిత్రి""}"", … ""{""dag"",""United States of America karimma ŋun nyɛ paɣa""}""]","[""{""en"",""American filmmaker, actor, model, and intersex rights activist""}"", ""{""nl"",""Amerikaans filmmaker""}"", … ""{""arz"",""ناشطه حقوق ال جى بى تى من امريكا""}""]","[""{""en"",""Song Dynasty official and expert on weights and measures""}"", ""{""nl"",""politicus""}"", ""{""ast"",""persona de la dinastía Song (CBDB=0001180)""}""]","[""{""de"",""US-amerikanische Vloggerin, Comedian, Schauspielerin und Model""}"", ""{""en"",""American singer""}"", … ""{""uk"",""американська співачка""}""]","[""{""en"",""Maltese LGBT and intersex activist""}"", ""{""eo"",""GLAT-a kaj interseksula aktivulo de Malto""}"", ""{""de"",""maltesische intersexuelle Person, die sich aktivistisch für Intersex- und LGBT-Rechte einsetzt""}""]","[""{""en"",""American politician""}"", ""{""bn"",""মার্কিন রাজনীতিবিদ""}"", … ""{""ryu"",""アミリカ合衆国ぬ政治家""}""]","[""{""nl"",""historicus""}"", ""{""en"",""Welsh historian and author""}"", … ""{""cy"",""Hanesydd ac awdur o Gymro""}""]","[""{""en"",""American-Israeli artist""}"", ""{""de"",""US-amerikanisch-israelische Künstlerin""}"", … ""{""dag"",""United States of America artist ŋun nyɛ paɣa""}""]","[""{""hu"",""(1886–1959) magyar zoológus, ichthyológus, ornitológus, muzeológus, egyetemi tanár, természettudományi szakíró""}"", ""{""nl"",""Hongaars ichtyoloog (1886-1959)""}"", … ""{""es"",""zoólogo húngaro""}""]","[""{""de"",""neuseeländische Schriftstellerin""}"", ""{""fa"",""نویسنده نیوزلندی""}"", … ""{""dv"",""ލިޔުންތެރިއެއް""}""]","[""{""en"",""Vietnamese general, mandarin""}"", ""{""nl"",""officier uit Vietnam (?-1832)""}"", … ""{""fr"",""général et mandarin vietnamien""}""]","[""{""en"",""writer and professor of English""}""]","[""{""nl"",""Amerikaans radiopresentatrice""}"", ""{""en"",""American talk-show host""}""]","[""{""en"",""Malaysian writer, filmmaker and activist""}"", ""{""cy"",""cyfarwyddwr ffilm""}""]"
"""labels""","[""{""en"",""Tian Lingzi""}"", ""{""zh"",""田令孜""}"", … ""{""zh-hans"",""田令孜""}""]","[""{""en"",""Devon Price""}"", ""{""nl"",""Devon Price""}"", … ""{""eu"",""Devon Price""}""]","[""{""en"",""Rivers Solomon""}"", ""{""nl"",""Rivers Solomon""}"", … ""{""zh-hans"",""里弗斯·所罗门""}""]","[""{""en"",""Dana Stone""}"", ""{""es"",""Dana Stone""}"", … ""{""pt-br"",""Dana Stone""}""]","[""{""fr"",""Constantin Gongylès""}"", ""{""sh"",""Konstantin Gongil""}"", … ""{""ru"",""Константин Гонгил""}""]","[""{""en"",""Marietu Tenuche""}"", ""{""nl"",""Marietu Tenuche""}"", … ""{""sl"",""Professor Marietu Ohunene Tenuche""}""]","[""{""en"",""Barbara J. Sahakian""}"", ""{""de"",""Barbara J. Sahakian""}"", … ""{""tr"",""Barbara J. Sahakian""}""]","[""{""en"",""Yelena Komarova""}"", ""{""fr"",""Yelena Komarova""}"", … ""{""az"",""Yelena Komarova""}""]","[""{""de"",""James Barry""}"", ""{""en"",""James Barry""}"", … ""{""az"",""Ceyms Berri""}""]","[""{""en"",""Miss Fame""}"", ""{""nl"",""Miss Fame""}"", … ""{""sl"",""Miss Fame""}""]","[""{""en"",""Ozaawindib""}"", ""{""fr"",""Ozaawindib""}"", … ""{""ru"",""Озаавиндиб""}""]","[""{""es"",""Joss Favela""}"", ""{""nl"",""Joss Favela""}"", … ""{""pt-br"",""Joss Favela""}""]","[""{""en"",""Ebele Oseye""}"", ""{""fr"",""Ebele Oseye""}"", … ""{""ig"",""Ebele Oseye""}""]","[""{""en"",""Gray Johnson Poole""}"", ""{""de"",""Gray Johnson Poole""}"", … ""{""sq"",""Gray Poole""}""]","[""{""en"",""Tutu Chengcui""}"", ""{""zh"",""吐突承璀""}"", … ""{""ast"",""Tutu Chengcui""}""]","[""{""en"",""Lori Campbell""}"", ""{""nl"",""Lori Campbell""}"", ""{""ast"",""Lori Campbell""}""]","[""{""en"",""Wimund""}"", ""{""nb"",""Wimund""}"", … ""{""sq"",""Wimund""}""]","[""{""sk"",""B-complex""}"", ""{""cs"",""B-Complex""}"", … ""{""uk"",""B-Complex""}""]","[""{""fr"",""Mathilde Stuyvesant""}"", ""{""pt"",""Mathilde Stuyvesant""}"", … ""{""pt-br"",""Mathilde Stuyvesant""}""]","[""{""ja"",""山崎ナオコーラ""}"", ""{""en"",""Nao-cola Yamazaki""}"", … ""{""tr"",""Nao-cola Yamazaki""}""]","[""{""nan"",""Lêng Pho""}"", ""{""zh-hans"",""凌波""}"", … ""{""pap"",""Ling Po""}""]","[""{""sq"",""Kristian Ranđelović""}"", ""{""en"",""Kristian Ranđelović""}"", … ""{""sr"",""Кристиан Ранђеловић""}""]","[""{""en"",""Laurin Hendrix""}"", ""{""fr"",""Laurin Hendrix""}"", … ""{""ast"",""Laurin Hendrix""}""]","[""{""fi"",""Pekka J. Korvenheimo""}"", ""{""en"",""Pekka J. Korvenheimo""}"", … ""{""cs"",""Pekka J. Korvenheimo""}""]","[""{""en"",""Maya the drag queen""}"", ""{""de"",""Maya the drag queen""}"", … ""{""pa"",""ਮਾਇਆ ਦ ਡਰੈਗ ਕੁਈਨ""}""]","[""{""en"",""Leilani Tominiko""}""]","[""{""en"",""John/Eleanor Rykener""}"", ""{""pt"",""John Rykener""}"", … ""{""ar"",""جون/إيليانور رايكنر""}""]","[""{""pt"",""João, o Eunuco""}"", ""{""de"",""Johannes Orphanotrophos""}"", … ""{""it"",""Giovanni l'Orfanotrofo""}""]","[""{""en"",""Suruli Manohar""}"", ""{""fr"",""Suruli Manohar""}"", … ""{""bn"",""সুরুলি মনোহর""}""]","[""{""en"",""N.D. Stevenson""}"", ""{""nl"",""N.D. Stevenson""}"", … ""{""eu"",""ND Stevenson""}""]","[""{""en"",""Romaizah binti Haji Mohd Salleh""}""]","[""{""de"",""Brendan Allen""}"", ""{""es"",""Brenda Allen""}"", … ""{""ar"",""بريندان ألين""}""]","[""{""ja"",""水木洋子""}"", ""{""en"",""Yōko Mizuki""}"", … ""{""ar"",""يوكو ميزوكي""}""]","[""{""en"",""Anton Kryzhanovsky""}"", ""{""de"",""Anton Kryzhanovsky""}"", … ""{""pt-br"",""Anton Kryzhanovsky""}""]","[""{""zh"",""黛咪·洛瓦特""}"", ""{""jv"",""Demi Lovato""}"", … ""{""dtp"",""Demi Lovato""}""]","[""{""en"",""Angela Miri""}"", ""{""nl"",""Angela Miri""}"", … ""{""sl"",""Angela Miri""}""]",…,"[""{""en"",""Iz Hesketh""}"", ""{""de"",""Iz Hesketh""}"", … ""{""pt-br"",""Iz Hesketh""}""]","[""{""en"",""Liu Jishu""}"", ""{""zh"",""劉季述""}"", … ""{""ja"",""劉季述""}""]","[""{""et"",""Romi Mankin""}"", ""{""es"",""Romi Mankin""}"", … ""{""ast"",""Romi Mankin""}""]","[""{""en"",""Janae Kroc""}"", ""{""cs"",""Janae Marie Kroc""}"", … ""{""fa"",""جنای کروک""}""]","[""{""en"",""Conn O'Neill""}"", ""{""nl"",""Conn O'Neill""}"", … ""{""ga"",""Conn Ó Néill""}""]","[""{""en"",""Elisa Rae Shupe""}"", ""{""nl"",""Elisa Rae Shupe""}"", … ""{""ja"",""エリサ・レー・シュウプ""}""]","[""{""de"",""Ira Schneider""}"", ""{""en"",""Ira Schneider""}"", … ""{""tr"",""Ira Schneider""}""]","[""{""en"",""Joan Rosenbaum""}"", ""{""nl"",""Joan Rosenbaum""}"", … ""{""cs"",""Joan Rosenbaum""}""]","[""{""en"",""Gopi Shankar Madurai""}"", ""{""nl"",""Gopi Shankar Madurai""}"", … ""{""ca"",""Gopi Shankar Madurai""}""]","[""{""ta"",""நர்த்தகி நடராஜ்""}"", ""{""en"",""Narthaki Nataraj""}"", … ""{""ga"",""Nartaki Natraj""}""]","[""{""en"",""To’oto’oali’I Roger Stanley""}"", ""{""nl"",""To’oto’oali’I Roger Stanley""}"", … ""{""sq"",""To’oto’oali’I Roger Stanley""}""]","[""{""zh-hans"",""尤睦佳·泽登巴尔""}"", ""{""zh-hant"",""尤睦佳·澤登巴爾""}"", … ""{""kk"",""Цеденбал Юмжагийн""}""]","[""{""zh-hans"",""曹腾""}"", ""{""zh-hant"",""曹騰""}"", … ""{""th"",""โจเท้ง""}""]","[""{""en"",""Jean Hewitt""}"", ""{""de"",""Jean Hewitt""}"", ""{""nl"",""Jean Hewitt""}""]","[""{""pt"",""Nicetas I de Constantinopla""}"", ""{""sk"",""Nikétas I.""}"", … ""{""ca"",""Nicetes I""}""]","[""{""en"",""Tilsa Tsuchiya""}"", ""{""es"",""Tilsa Tsuchiya""}"", … ""{""uk"",""Тільса Цучія""}""]","[""{""en"",""Isabel Ruffell""}"", ""{""fr"",""Isabel Ruffell""}""]","[""{""en"",""Thomas(ine) Hall""}"", ""{""fr"",""Thomas(ine) Hall""}"", … ""{""ru"",""Холл, Томас(ин)""}""]","[""{""en"",""Ymania Brown""}"", ""{""nl"",""Ymania Brown""}"", ""{""sq"",""Ymania Brown""}""]","[""{""zh-hans"",""王振""}"", ""{""zh-hant"",""王振""}"", … ""{""pt"",""Wang Zhen (eunuco)""}""]","[""{""hy"",""Գոհար Մուրադյան""}"", ""{""en"",""Gohar Muradyan""}"", … ""{""fr"",""Gohar Muradyan""}""]","[""{""en"",""Tunde Olaniran""}"", ""{""nl"",""Tunde Olaniran""}"", … ""{""ig"",""Tunde Olaniran""}""]","[""{""es"",""Flor Amargo""}"", ""{""en"",""Flor Amargo""}"", … ""{""es-419"",""Flor Amargo""}""]","[""{""en"",""SJ Sindu""}"", ""{""nl"",""SJ Sindu""}"", … ""{""dag"",""SJ Sindu""}""]","[""{""en"",""River Gallo""}"", ""{""ast"",""River Gallo""}"", … ""{""arz"",""ريفر جالو""}""]","[""{""en"",""Liu Chenggui""}"", ""{""zh"",""劉承規""}"", … ""{""zh-hans"",""刘承规""}""]","[""{""de"",""Tessa Violet""}"", ""{""en"",""Tessa Violet""}"", … ""{""yo"",""Tessa Violet""}""]","[""{""en"",""Ruth Baldacchino""}"", ""{""nl"",""Ruth Baldacchino""}"", … ""{""sq"",""Ruth Baldacchino""}""]","[""{""fr"",""Debbie Farhat""}"", ""{""es"",""Debbie Farhat""}"", … ""{""ast"",""Debbie Farhat""}""]","[""{""cy"",""J. Beverley Smith""}"", ""{""en"",""J. Beverley Smith""}"", … ""{""sk"",""J. Beverley Smith""}""]","[""{""en"",""Ophrah Shemesh""}"", ""{""it"",""Ophrah Shemesh""}"", … ""{""ga"",""Ophrah Shemesh""}""]","[""{""eo"",""Béla Hankó""}"", ""{""hu"",""Hankó Béla""}"", … ""{""tr"",""Béla Hankó""}""]","[""{""de"",""Essie Summers""}"", ""{""en"",""Essie Summers""}"", … ""{""he"",""סמרס, אסי""}""]","[""{""zh-hans"",""黎文悦""}"", ""{""zh-hant"",""黎文悅""}"", … ""{""ga"",""Le Van Duyet""}""]","[""{""en"",""Andrea Lawlor""}"", ""{""nl"",""Andrea Lawlor""}""]","[""{""en"",""Sheena Metal""}"", ""{""nl"",""Sheena Metal""}"", … ""{""ca"",""Sheena Metal""}""]","[""{""en"",""Nadira Ilana""}"", ""{""nl"",""Nadira Ilana""}"", … ""{""fr"",""Nadira Ilana""}""]"
"""aliases""","[""{""en"",""Lingzi Tian""}""]",[],[],"[""{""en"",""Dana Hazen Stone""}"", ""{""fr"",""Dana Hazen Stone""}""]","[""{""sh"",""Konstantin Gongiles""}""]","[""{""en"",""Marietu Ohunene Tenuche""}""]","[""{""da"",""Barbara Sahakian""}"", ""{""da"",""Barbara J Sahakian""}"", … ""{""de"",""Barbara Sahakian""}""]",[],"[""{""fr"",""James Miranda Stuart Barry""}"", ""{""fr"",""Margaret Ann Bulkley""}"", … ""{""he"",""מרגרט אן בלקלי""}""]","[""{""en"",""Kurtis Dam-Mikkelsen""}"", ""{""de"",""Kurtis Dam-Mikkelsen""}"", … ""{""sv"",""Kurtis Dam-Mikkelsen""}""]",[],"[""{""ast"",""José Alberto Inzunza Favela""}"", ""{""ca"",""José Alberto Inzunza Favela""}"", … ""{""tr"",""José Alberto Inzunza Favela""}""]","[""{""en"",""Ellease Southerland""}"", ""{""it"",""Ellease Southerland""}"", … ""{""ig"",""Ellease Southerland""}""]","[""{""en"",""Poole, Gray Johnson""}"", ""{""en"",""Gray Poole""}"", … ""{""pt-br"",""Gray Poole""}""]",[],[],"[""{""de"",""Wimond""}"", ""{""de"",""Wymond""}""]","[""{""sk"",""Matúš Lenický""}"", ""{""sk"",""Matia Lenická""}"", … ""{""ru"",""Матуш Леницкий""}""]","[""{""en"",""Mathilde, Princess Alexandre de Caraman Chimay""}""]","[""{""en"",""Yamazaki Nao-cola""}"", ""{""en"",""Naokōra Yamazaki""}"", … ""{""ja"",""やまざき ナオコーラ""}""]","[""{""ru"",""Лин По, Айви""}"", ""{""ru"",""Лин Бо""}"", … ""{""ga"",""Ling Po""}""]","[""{""ru"",""Ранджелович, Кристиан""}""]","[""{""en"",""Michael Laurin Hendrix""}""]","[""{""en"",""Pekka Juhani Korvenheimo""}"", ""{""fi"",""Pekka Juhani Korvenheimo""}""]","[""{""en"",""Alex Mathew""}""]","[""{""en"",""Candy Lee""}""]","[""{""en"",""Eleanor Rykener""}"", ""{""en"",""John Rykener""}"", … ""{""es"",""John Rykener""}""]","[""{""de"",""Orphanotrophos""}"", ""{""nl"",""Johannes Orphanotrophos""}""]",[],"[""{""ja"",""ノエル・スティーブンソン""}"", ""{""fr"",""Gingerhaze""}"", … ""{""ast"",""Noelle Stevenson""}""]","[""{""en"",""Datin Seri Paduka Dr Dayang Hajah Romaizah Binti Haji Mohd Salleh""}"", ""{""en"",""Romaizah Mohd Salleh""}""]","[""{""es"",""Brendan Allen""}"", ""{""es"",""Brendan Cody Allen""}"", … ""{""en"",""Brendan Cody Allen""}""]","[""{""en"",""Mizuki Yōko""}"", ""{""en"",""Yoko Mizuki""}"", … ""{""ko"",""요코 미즈키""}""]","[""{""en"",""Anton Krzyzanowski""}""]","[""{""zh"",""Demi Lovato""}"", ""{""zh"",""黛米·洛瓦托""}"", … ""{""bug"",""Demetria Devonne Lovato""}""]","[""{""en"",""Angela Freeman Miri""}"", ""{""ha"",""Angela Freeman Miri""}"", … ""{""sl"",""Professor Angela Freeman Miri""}""]",…,[],[],[],"[""{""pl"",""Matthew Raymond Kroczaleski""}"", ""{""en"",""Matt Kroczaleski""}"", … ""{""es"",""Janae Marie Kroczaleski""}""]","[""{""en"",""Conn 'na Creige' O'Neill""}""]","[""{""en"",""Jamie Shupe""}"", ""{""en"",""James Clifford Shupe""}"", … ""{""ja"",""エリサ・レー・シュープ""}""]",[],[],"[""{""zh"",""戈皮·香卡·马杜赖""}""]","[""{""en"",""Nartaki Natraj""}"", ""{""sq"",""Nartaki Natraj""}""]",[],"[""{""pl"",""Jumdżagijn Cedenbal""}"", ""{""pl"",""Jumdżagijn Cedenbał""}"", … ""{""pt-br"",""Yumzhagiin Tsedenbal""}""]","[""{""de"",""Jixing""}"", ""{""zh-hant"",""高皇帝""}"", … ""{""ja"",""高皇帝""}""]",[],"[""{""sk"",""Nikétas""}"", ""{""sh"",""Vaseljenski patrijarh Nikita I od Konstantinopola""}"", … ""{""ca"",""Nicetes I de Constantinoble""}""]","[""{""es"",""Tilsa Tsuchiya Castillo""}"", ""{""es"",""Tilsa Tschusiya""}"", … ""{""sl"",""Tilsa Tschusiya""}""]","[""{""en"",""Ian Ruffell""}"", ""{""en"",""I. Ruffell""}"", … ""{""fr"",""Ian Ruffell""}""]","[""{""en"",""Thomas Hall""}"", ""{""en"",""Thomasine Hall""}"", … ""{""fr"",""Thomas/ine Hall""}""]",[],"[""{""de"",""Wáng Zhēn""}"", ""{""en"",""Jingzhong""}"", … ""{""zh"",""旌忠""}""]","[""{""hy"",""Մուրադյան, Գոհար""}"", ""{""gl"",""Gohar Sarkis Muradyan""}"", … ""{""de"",""Gohar Mowradean""}""]",[],"[""{""en"",""Emma Mayte Carballo Hernández""}""]","[""{""en"",""S.J. Sindu""}"", ""{""fr"",""S.J. Sindu""}"", … ""{""sv"",""S.J. Sindu""}""]","[""{""ru"",""Галло, Ривер""}""]","[""{""zh"",""忠肅""}"", ""{""zh"",""劉承珪""}"", … ""{""lzh"",""劉承規""}""]","[""{""de"",""Meekakitty""}"", ""{""hu"",""Meekakitty""}"", … ""{""ru"",""Вайолет, Тесса""}""]","[""{""ru"",""Балдаккино, Рут""}""]",[],"[""{""en"",""Jenkyn Beverley Smith""}"", ""{""cs"",""Jenkyn Beverley Smith""}"", ""{""cy"",""Jenkyn Beverley Smith""}""]","[""{""he"",""עפרה שמש""}"", ""{""de"",""Opra Shemesh""}"", … ""{""nl"",""Opra Shemesh""}""]","[""{""en"",""Bela Hanko""}"", ""{""en"",""Hankó""}"", … ""{""tr"",""Bela Hanko""}""]","[""{""en"",""Ethel Snelson Summers""}"", ""{""en"",""Ethel Summers""}"", … ""{""he"",""סומרס, אסי""}""]","[""{""ko"",""예문열""}"", ""{""ko"",""레반두예트""}"", … ""{""ja"",""黎文悅""}""]",[],[],[]


In [60]:
(info_df['numeric_id'] == info_df['id'].str.replace('Q', '')).all()

True

# Time Series Clustering

In [None]:
views_df = pl.read_parquet(PROCESSED_DATA_DIR / 'pageviews_all.parquet')

In [98]:
views_df[:30]

title,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10,2015-11,2015-12,2016-01,2016-02,2016-03,2016-04,2016-05,2016-06,2016-07,2016-08,2016-09,2016-10,2016-11,2016-12,2017-01,2017-02,2017-03,2017-04,2017-05,2017-06,2017-07,2017-08,2017-09,2017-10,2017-11,2017-12,2018-01,2018-02,2018-03,2018-04,…,2020-12,2021-01,2021-02,2021-03,2021-04,2021-05,2021-06,2021-07,2021-08,2021-09,2021-10,2021-11,2021-12,2022-01,2022-02,2022-03,2022-04,2022-05,2022-06,2022-07,2022-08,2022-09,2022-10,2022-11,2022-12,2023-01,2023-02,2023-03,2023-04,2023-05,2023-06,2023-07,2023-08,2023-09,2023-10,2023-11,2023-12
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""Neophyte_II_of…",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,14.0,17.0,26.0,10.0,17.0,24.0,29.0,32.0,51.0,20.0,36.0,46.0,32.0,119.0,27.0,32.0,18.0,26.0,22.0,30.0,30.0,26.0,35,33,33,26,25,24,29,44.0,23,25
"""Olivia_Giovett…",26.0,34.0,52.0,20.0,41.0,40.0,19.0,21.0,27.0,23.0,32.0,21.0,32.0,29.0,57.0,47.0,31.0,21.0,24.0,15.0,42.0,17.0,45.0,30.0,25.0,34.0,36.0,22.0,22.0,22.0,16.0,28.0,30.0,34.0,22.0,19.0,…,39.0,30.0,32.0,34.0,37.0,29.0,14.0,26.0,24.0,36.0,39.0,35.0,20.0,63.0,33.0,39.0,19.0,19.0,30.0,20.0,14.0,21.0,35.0,52.0,25.0,25.0,33.0,20,18,22,31,26,16,20,59.0,41,19
"""Alexander_Gran…",9.0,48.0,24.0,23.0,32.0,24.0,12.0,20.0,19.0,7.0,8.0,14.0,18.0,27.0,11.0,23.0,12.0,19.0,15.0,17.0,11.0,13.0,26.0,14.0,16.0,18.0,13.0,20.0,13.0,26.0,16.0,19.0,32.0,14.0,18.0,12.0,…,5.0,13.0,9.0,12.0,11.0,9.0,14.0,19.0,9.0,7.0,5.0,11.0,4.0,11.0,9.0,7.0,9.0,9.0,4.0,5.0,12.0,72.0,6.0,10.0,10.0,10.0,2.0,7,10,7,2,7,4,9,10.0,1,5
"""Bernard_H._Rae…",17.0,16.0,21.0,20.0,19.0,18.0,10.0,9.0,17.0,5.0,8.0,10.0,5.0,28.0,19.0,11.0,17.0,14.0,13.0,19.0,16.0,13.0,24.0,18.0,10.0,10.0,14.0,13.0,14.0,19.0,5.0,15.0,26.0,12.0,13.0,10.0,…,2.0,1.0,4.0,1.0,5.0,5.0,3.0,5.0,2.0,2.0,3.0,4.0,,12.0,4.0,3.0,2.0,1.0,4.0,,1.0,3.0,8.0,6.0,3.0,1.0,5.0,1,3,5,7,3,1,2,,1,2
"""Leigh_Magar""",47.0,46.0,64.0,36.0,56.0,45.0,41.0,26.0,30.0,48.0,15.0,26.0,19.0,38.0,28.0,38.0,37.0,51.0,24.0,36.0,17.0,15.0,45.0,39.0,26.0,22.0,21.0,14.0,19.0,23.0,16.0,32.0,36.0,21.0,18.0,20.0,…,8.0,8.0,9.0,13.0,6.0,3.0,12.0,13.0,12.0,5.0,8.0,8.0,11.0,3.0,9.0,8.0,13.0,13.0,9.0,20.0,9.0,8.0,14.0,27.0,14.0,13.0,20.0,13,10,9,5,9,9,8,5.0,10,2
"""Carolyn_Stanfo…",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,221.0,294.0,359.0,286.0,650.0,202.0,170.0,169.0,83.0,97.0,107.0,112.0,70.0,78.0,88.0,64.0,54.0,78.0,84.0,74.0,131.0,74.0,86.0,180.0,105.0,75.0,76.0,73,105,56,50,89,86,59,58.0,50,42
"""Boncho_Novakov…",16.0,18.0,26.0,19.0,21.0,16.0,11.0,14.0,18.0,6.0,6.0,15.0,4.0,10.0,10.0,15.0,14.0,12.0,11.0,18.0,14.0,4.0,22.0,13.0,15.0,16.0,8.0,13.0,11.0,8.0,7.0,14.0,18.0,9.0,7.0,11.0,…,4.0,7.0,5.0,3.0,18.0,39.0,5.0,3.0,5.0,5.0,2.0,4.0,5.0,3.0,5.0,5.0,4.0,6.0,7.0,3.0,4.0,5.0,5.0,5.0,3.0,9.0,2.0,3,4,4,3,7,9,2,3.0,2,3
"""Margaret_Hodge…",5665.0,2932.0,2420.0,2482.0,3097.0,2693.0,1955.0,1867.0,3365.0,2059.0,1808.0,3435.0,4174.0,12893.0,4331.0,4311.0,3434.0,2573.0,2110.0,1829.0,1813.0,1953.0,1667.0,2523.0,2442.0,3777.0,1705.0,1577.0,5159.0,1650.0,3699.0,1623.0,2057.0,1368.0,2277.0,2749.0,…,2260.0,1819.0,2069.0,1605.0,1630.0,1890.0,2082.0,1391.0,1126.0,1351.0,2217.0,1430.0,4400.0,1788.0,2612.0,1680.0,1471.0,1743.0,1336.0,2097.0,1272.0,1918.0,2196.0,1194.0,1204.0,1536.0,2234.0,1909,1311,1261,3170,2097,1026,1335,5767.0,2327,3310
"""Carmen_Belén_R…",,,,,,25.0,26.0,31.0,27.0,60.0,35.0,32.0,29.0,20.0,20.0,51.0,23.0,31.0,37.0,31.0,59.0,46.0,51.0,35.0,28.0,31.0,34.0,34.0,25.0,42.0,27.0,39.0,118.0,56.0,36.0,46.0,…,17.0,24.0,56.0,80.0,25.0,32.0,31.0,13.0,24.0,25.0,18.0,27.0,27.0,18.0,46.0,54.0,20.0,24.0,30.0,12.0,14.0,12.0,25.0,27.0,10.0,31.0,36.0,63,29,12,10,15,36,20,15.0,20,17
"""M._Satish_Redd…",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,32.0,49.0,104.0,102.0,84.0,138.0,277.0,…,111.0,152.0,78.0,79.0,62.0,590.0,93.0,119.0,317.0,128.0,66.0,63.0,99.0,78.0,79.0,104.0,109.0,121.0,200.0,129.0,144.0,156.0,121.0,158.0,266.0,233.0,280.0,313,517,1188,155,119,136,128,90.0,124,158


In [118]:
X_train = views_df.select([col for col in views_df.columns if col != 'title']).fill_null(0).to_numpy()
X_train

array([[ 0,  0,  0, ..., 44, 23, 25],
       [26, 34, 52, ..., 59, 41, 19],
       [ 9, 48, 24, ..., 10,  1,  5],
       ...,
       [40, 29, 34, ...,  3,  1,  2],
       [23, 52, 38, ...,  8,  7,  6],
       [ 0,  0,  0, ..., 16,  8,  7]])

In [119]:
X_train.shape

(1976658, 104)

In [120]:
import numpy
import matplotlib.pyplot as plt

from tslearn.clustering import TimeSeriesKMeans
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, \
    TimeSeriesResampler

seed = 0
numpy.random.seed(seed)

sz = X_train.shape[1]

X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train)

# Euclidean k-means
print("Euclidean k-means")
km = TimeSeriesKMeans(n_clusters=3, verbose=True, random_state=seed,
n_jobs=-1)
y_pred = km.fit_predict(X_train)

plt.figure()
for yi in range(3):
    plt.subplot(3, 3, yi + 1)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.text(0.55, 0.85,'Cluster %d' % (yi + 1),
             transform=plt.gca().transAxes)
    if yi == 1:
        plt.title("Euclidean $k$-means")

# DBA-k-means
print("DBA k-means")
dba_km = TimeSeriesKMeans(n_clusters=3,
                          n_init=2,
                          metric="dtw",
                          verbose=True,
                          max_iter_barycenter=10,
                          random_state=seed,
                          n_jobs=-1)
y_pred = dba_km.fit_predict(X_train)

for yi in range(3):
    plt.subplot(3, 3, 4 + yi)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(dba_km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.text(0.55, 0.85,'Cluster %d' % (yi + 1),
             transform=plt.gca().transAxes)
    if yi == 1:
        plt.title("DBA $k$-means")


plt.tight_layout()
plt.show()

143.854 --> 76.592 --> 

KeyboardInterrupt: 

In [112]:
km.inertia_

70.60124743054607

In [114]:
y_pred

array([2, 1, 1, 0, 0, 2, 0, 1, 2, 1, 2, 2, 0, 0, 2, 0, 0, 1, 1, 2, 0, 1,
       2, 2, 0, 0, 1, 0, 0, 2, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 2, 0, 1, 0, 2, 1, 0, 2, 0, 2, 0, 1, 2,
       2, 2, 0, 0, 1, 1, 2, 2, 1, 1, 0, 1, 1, 0, 2, 0, 2, 0, 2, 1, 2, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 2, 2, 2])

In [115]:
y = views_df[:100].with_columns(pl.Series(name='cluster', values=y_pred))

https://github.com/aeon-toolkit/aeon/blob/4a58c4dba5bb37e43979368ad9cf092540e71786/examples/segmentation/segmentation_with_clasp.ipynb