<div align='center'><strong>Wikipedia Pageviews Project</strong>
<br />
<i>Netanel Madmoni</i>
</div>

----------------

# Introduction

In [1]:
#imports
import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path
from tqdm import tqdm
from functools import reduce
from itertools import product
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os
import json
import seaborn as sns
from rich import print
from IPython.display import Image

load_dotenv()

RAW_DATA_DIR = Path(os.getenv('RAW_DATA_DIR'))
PROCESSED_DATA_DIR = Path(os.getenv('PROCESSED_DATA_DIR'))

# Raw Data Acquisition

1. Get List of people on Wikipedia
```sql
SELECT DISTINCT en_wiki -- page title name in english wikipedia	
FROM  `rising-theater-416315.wikipedia_pageviews.wikidata`,
      UNNEST(instance_of) AS instance_of_struct

    
WHERE instance_of_struct.numeric_id = 5 -- instance_of = 5 => person
```

2. Get pagevoew data for those people

      ```sql
      SELECT title, DATETIME_TRUNC(datehour, MONTH) AS month, SUM(views) AS monthly_views

                  
      FROM  `rising-theater-416315.wikipedia_pageviews.pageviews_2023` a
            JOIN `rising-theater-416315.data_for_project.distinct_people` b
            ON a.title = b.en_wiki

      
      WHERE datehour IS NOT NULL
      AND wiki = "en"

      GROUP BY title, DATETIME_TRUNC(datehour, MONTH)
      ```

3. Get wikidata for those people
      ```sql
      SELECT *	
      FROM  `rising-theater-416315.wikipedia_pageviews.wikidata`,
      UNNEST(instance_of) AS instance_of_struct

    
      WHERE instance_of_struct.numeric_id = 5
      ```

# Data transformation

## Views Data

Are organized in csv files, one per year.

In [6]:
# Read raw data
dfs = []
for file in RAW_DATA_DIR.glob(r'monthly_views_*.csv'):
    print(f'{file.name} - {os.stat(file).st_size / 1024 ** 2:.2f} MB)')
    dfs.append(pl.read_csv(file))

print(f'Total files: {len(dfs)}')

monthly_views_2015.csv - 411.06 MB)
monthly_views_2016.csv - 658.91 MB)
monthly_views_2019.csv - 803.42 MB)
monthly_views_2020.csv - 851.54 MB)
monthly_views_2017.csv - 708.39 MB)
monthly_views_2022.csv - 949.62 MB)
monthly_views_2023.csv - 980.47 MB)
monthly_views_2021.csv - 904.32 MB)
monthly_views_2018.csv - 755.98 MB)
Total files: 0


In [3]:
try:
    views_df = pl.read_parquet(PROCESSED_DATA_DIR / 'pageviews_all.parquet')
except FileNotFoundError:
    # Pivot
    transformed_dfs = []
    for df in tqdm(dfs):
        transformed_df = (df.with_columns(pl.col('month')
                                        .map_elements(lambda s: f'{s[:4]}-{s[5:7]}')
                                        .alias('year_month'))
                        .pivot(index='title', columns='year_month', values='monthly_views')
        )
        transformed_dfs.append(transformed_df)
    # Join
    all_names = pl.concat([df.select('title') for df in dfs]).unique()
    print(f'{len(all_names):,}')
    views_df = all_names
    for i, df in tqdm(enumerate(transformed_dfs)):
        views_df = df.join(views_df, 'title', 'outer_coalesce')
    views_df = views_df.select(['title', *sorted([c for c in views_df.columns if c != 'title'])])
    views_df.write_parquet(f'{PROCESSED_DATA_DIR}/pageviews_all.parquet')

views_df

title,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10,2015-11,2015-12,2016-01,2016-02,2016-03,2016-04,2016-05,2016-06,2016-07,2016-08,2016-09,2016-10,2016-11,2016-12,2017-01,2017-02,2017-03,2017-04,2017-05,2017-06,2017-07,2017-08,2017-09,2017-10,2017-11,2017-12,2018-01,2018-02,2018-03,2018-04,…,2020-12,2021-01,2021-02,2021-03,2021-04,2021-05,2021-06,2021-07,2021-08,2021-09,2021-10,2021-11,2021-12,2022-01,2022-02,2022-03,2022-04,2022-05,2022-06,2022-07,2022-08,2022-09,2022-10,2022-11,2022-12,2023-01,2023-02,2023-03,2023-04,2023-05,2023-06,2023-07,2023-08,2023-09,2023-10,2023-11,2023-12
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""Neophyte_II_of…",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,14,17,26,10,17,24,29,32,51,20,36,46,32,119,27,32,18,26,22,30,30,26,35,33,33,26,25,24,29,44,23,25
"""Olivia_Giovett…",26,34,52,20,41,40,19,21,27,23,32,21,32,29,57,47,31,21,24,15,42,17,45,30,25,34,36,22,22,22,16,28,30,34,22,19,…,39,30,32,34,37,29,14,26,24,36,39,35,20,63,33,39,19,19,30,20,14,21,35,52,25,25,33,20,18,22,31,26,16,20,59,41,19
"""Alexander_Gran…",9,48,24,23,32,24,12,20,19,7,8,14,18,27,11,23,12,19,15,17,11,13,26,14,16,18,13,20,13,26,16,19,32,14,18,12,…,5,13,9,12,11,9,14,19,9,7,5,11,4,11,9,7,9,9,4,5,12,72,6,10,10,10,2,7,10,7,2,7,4,9,10,1,5
"""Bernard_H._Rae…",17,16,21,20,19,18,10,9,17,5,8,10,5,28,19,11,17,14,13,19,16,13,24,18,10,10,14,13,14,19,5,15,26,12,13,10,…,2,1,4,1,5,5,3,5,2,2,3,4,,12,4,3,2,1,4,,1,3,8,6,3,1,5,1,3,5,7,3,1,2,,1,2
"""Leigh_Magar""",47,46,64,36,56,45,41,26,30,48,15,26,19,38,28,38,37,51,24,36,17,15,45,39,26,22,21,14,19,23,16,32,36,21,18,20,…,8,8,9,13,6,3,12,13,12,5,8,8,11,3,9,8,13,13,9,20,9,8,14,27,14,13,20,13,10,9,5,9,9,8,5,10,2
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Edgar_Dibden""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,16,6,10,6,7,4,2
"""David_Mulready…",56,51,78,51,54,59,36,44,31,33,29,29,29,54,44,45,23,36,38,35,32,31,42,27,44,33,27,52,29,41,21,36,47,64,37,30,…,16,21,27,33,15,22,31,15,25,16,25,12,25,27,31,18,27,25,20,21,34,15,16,14,15,19,26,19,18,12,7,19,12,14,14,7,6
"""Bruno_Riem""",40,29,34,17,31,29,22,31,14,8,8,16,13,16,7,17,24,16,20,14,14,7,23,16,25,14,13,15,13,20,10,17,31,14,13,12,…,4,9,2,4,5,4,7,8,6,7,8,5,5,4,4,7,2,5,4,11,2,4,5,5,3,5,2,2,4,3,7,5,4,3,3,1,2
"""Bon_Spence""",23,52,38,26,40,23,16,22,21,12,18,13,31,10,30,27,17,14,11,19,18,10,29,27,24,17,24,19,19,26,21,27,48,17,17,19,…,13,5,15,10,8,14,9,13,16,8,10,12,7,14,8,11,6,6,4,10,12,14,10,14,10,12,5,4,10,17,11,8,8,10,8,7,6


### Additional Features Extraction

## Entity Data

### Reading the Data

Are organized in json files.

In [2]:
info_files = (RAW_DATA_DIR / 'wikidata').glob('*')
len(list(info_files))

400

For example...

In [86]:
info_files = (RAW_DATA_DIR / 'wikidata').glob('*')
with open(next(info_files)) as f:
    for line in f:
        d = json.loads(line)
        sites = [sitelink['site'] for sitelink in d['sitelinks']]
        if 'enwiki' in sites:
            print(d)
            break
    

In [91]:
try:
    info_df = pl.read_parquet(PROCESSED_DATA_DIR / 'wikidata_all.parquet')
except FileNotFoundError:
    info_list = []
    info_files = (RAW_DATA_DIR / 'wikidata').glob('*')
    for file in info_files:
        with open(file) as f:
            for line in f:
                d = json.loads(line)
                sites = [sitelink['site'] for sitelink in d['sitelinks']]
                if 'enwiki' in sites:
                    info_list.append(d)
    info_df = pl.DataFrame(info_list)

with pl.Config(tbl_rows=3):
    display(info_df)

id,numeric_id,en_label,en_wiki,en_description,type,sitelinks,descriptions,labels,aliases,instance_of,gender,date_of_birth,date_of_death,worked_at,country_of_citizenship,country,educated_at,occupation,instrument,genre,industry,subclass_of,coordinate_location,iso_3166_alpha3,member_of,from_fictional_universe
str,str,str,str,str,str,list[struct[3]],list[struct[2]],list[struct[2]],list[struct[2]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[2]],list[null],list[struct[1]],list[struct[1]]
"""Q17122148""","""17122148""","""Albrecht Josep…","""Albrecht_Josep…","""German screenw…","""item""","[{""dewiki"",""Albrecht Joseph"",""Albrecht_Joseph""}, {""enwiki"",""Albrecht Joseph"",""Albrecht_Joseph""}, … {""fawiki"",""آلبرشت یوزف"",""آلبرشت_یوزف""}]","[{""de"",""deutsch-amerikanischer Theater- und Filmschaffender""}, {""fa"",""فیلمنامه‌نویس، تدوینگر، و نویسنده آلمانی""}, … {""sv"",""tysk författare och filmklippare""}]","[{""de"",""Albrecht Joseph""}, {""fr"",""Albrecht Joseph""}, … {""ru"",""Элбрехт Джозеф""}]","[{""de"",""Al Joseph""}, {""en"",""Al Joseph""}, … {""sk"",""Al Joseph""}]","[{""5""}]","[{""6581097""}]","[{""+1901-11-20T00:00:00Z""}]","[{""+1901-11-20T00:00:00Z""}]",[],"[{""183""}]",[],[],"[{""28389""}, {""7042855""}, {""36180""}]",[],[],[],[],[],[],[],[]
"""Q100707809""","""100707809""","""Robert Cahaly""","""Robert_Cahaly""","""American polls…","""item""","[{""enwiki"",""Robert Cahaly"",""Robert_Cahaly""}, {""jawiki"",""ロバート・カヘリー"",""ロバート・カヘリー""}]","[{""en"",""American pollster and political consultant""}, {""zh"",""美国民调专家""}]","[{""en"",""Robert Cahaly""}, {""de"",""Robert Cahaly""}, … {""pt-br"",""Robert Cahaly""}]",[],"[{""5""}]","[{""6581097""}]",[],[],"[{""100704905""}]","[{""30""}]",[],"[{""1024426""}]","[{""16919156""}, {""8125919""}]",[],[],[],[],[],[],[],[]
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Q6128908""","""6128908""","""James Ashurst""","""James_Ashurst""","""English minist…","""item""","[{""enwiki"",""James Ashurst"",""James_Ashurst""}]","[{""en"",""English minister""}, {""nl"",""ambtenaar""}]","[{""en"",""James Ashurst""}, {""es"",""James Ashurst""}, … {""sq"",""James Ashurst""}]",[],"[{""5""}]","[{""6581097""}]","[{""+1605-01-01T00:00:00Z""}]","[{""+1605-01-01T00:00:00Z""}]",[],[],[],[],"[{""212238""}]",[],[],[],[],[],[],[],[]


Lets' look at the columns:

In [92]:
info_df.columns

['id',
 'numeric_id',
 'en_label',
 'en_wiki',
 'en_description',
 'type',
 'sitelinks',
 'descriptions',
 'labels',
 'aliases',
 'instance_of',
 'gender',
 'date_of_birth',
 'date_of_death',
 'worked_at',
 'country_of_citizenship',
 'country',
 'educated_at',
 'occupation',
 'instrument',
 'genre',
 'industry',
 'subclass_of',
 'coordinate_location',
 'iso_3166_alpha3',
 'member_of',
 'from_fictional_universe']

**Columns description** from the Wikidata Database (I've used the [Wikidata Property Explorer](https://prop-explorer.toolforge.org/)):
- `id`, `numeric_id` - entity identifiers.
- `en_wiki`, `en_label`, `en_description` - English wiki-name, label (human-readable name) and description of the entity.
- `type` - type of entity (item/property/etc. See https://www.wikidata.org/wiki/Wikidata:Identifiers).
- `sitelinks` - a list of *struct*s (dictionary-like data structure) containing site name, wiki-name, and label of the entity in all the wikis it is in.
- `descriptions`, `labels`, `aliases` - lists of structs containing site name, description of the entity in that site, label and aliases of entity in that site.
- `instance_of` - "that class[es] of which this subject is a particular example and member". In this dataset I've taken only entities that are an instance of `Human (Q5)`.
- `gender` - "sex or gender identity of human or animal. For human: male, female, non-binary, intersex, transgender female, transgender male, agender, etc."
- `date_of_birth`/`death` - "date on which the subject was born / died"
- `worked_at` - "location where persons or organisations were actively participating in employment, business or other work"
- `country_of_citizenship` - "the object is a country that recognizes the subject as its citizen"
- `country` - "sovereign state that this item is in ***(not to be used for human beings)***"
- `educated_at` - "educational institution attended by subject"
- `occupation` - "occupation of a person"
- `instrument` - "musical instrument that a person plays or teaches or used in a music occupation"
- `genre` - "creative work's genre or an artist's field of work"
- `industry` - "specific industry of company or organization"
- `subclass_of` - "this item is a subclass (subset) of that item; all instances of this item are instances of that item. different from P31 (instance of), e.g.: K2 is an instance of mountain; volcano is a subclass of mountain (and an instance of volcanic landform)"
- `coordinate_location` - "geocoordinates of the subject"
- `iso_3166_alpha3` - "identifier for a country in three-letter format"
- `member_of` - "organization, club or musical group to which the subject belongs"
- `from_fictional_universe` - "subject's fictional entity is in the object narrative".

### Cleaning the Data

Let's look at the columns and their data types.

In [3]:
print(dict(zip(info_df.columns, info_df.dtypes)))

One of the challenges in this dataset is the abundance of `List` and `Struct` data types. Meaning, each cell can have multiple values in it. This section will mostly focus on how to solve this problem.

✨ I shall treat the `List` columns differently than the "normal" columns when transforming and exploring the data.

**Questions for cleaning the data:**
1. Are there any duplicate rows?
2. Are there any columns that contain nothing but empty lists or null values? Are there columns that are *mostly* empty?
3. Are there any list/struct columns that always contain a single value (= lists that can be flattened)?
4. How many unique values does each column contains?

**Preliminary notes:**
* The values in `date_of_death` column are wrong (for some reason). I shall drop that column.
* According to [the Wikidata site](https://www.wikidata.org/wiki/Property:P17), this property should not be used for humans. I shall drop this columns as well.


---

* **Duplicate rows**

In [4]:
duplicated_df = info_df.filter(info_df['id'].is_duplicated()).sort('id')

print(f'There are {duplicated_df.height} rows carrying duplicate `id`s.')

In [5]:
(duplicated_df == duplicated_df.shift(-1))[::2].to_numpy().all()

True

✨ Insight: The rows carrying duplicate `id`s are completely equal in all columns. I will delete all the duplicate rows.

* **Empty columns**

In [56]:
list_columns = [c for c in info_df.columns if info_df[c].dtype == pl.List]

empty = []

for column in [c for c in info_df.columns if c not in list_columns]:
    empty.append([column,
                         (info_df[column]
                         .is_null()
                         .value_counts()
                         .filter(pl.col(column))
                         .select('count')
                         .to_numpy()).squeeze()])

for column in list_columns:
    empty.append([column,
                         ((info_df[column]
                         .list.len() == 0)
                         .value_counts()
                         .filter(pl.col(column))
                         .select('count')
                         .to_numpy()).squeeze()])

for item in empty:
    if not item[1].any():
        item[1] = 0
    item.append(item[1] / info_df.height * 100)

empty_df = (pd.DataFrame(empty, columns=['column', 'num_empty', 'percent_empty'])
            .sort_values('percent_empty', ascending=False)
            )
empty_df.style.format({'percent_empty': '{:.3f}%', 'num_empty': '{:,}'})

Unnamed: 0,column,num_empty,percent_empty
24,iso_3166_alpha3,1993448,100.000%
26,from_fictional_universe,1993443,100.000%
22,subclass_of,1993431,99.999%
23,coordinate_location,1993423,99.999%
21,industry,1993082,99.982%
16,country,1992209,99.938%
20,genre,1918158,96.223%
19,instrument,1904077,95.517%
25,member_of,1882243,94.421%
14,worked_at,1811036,90.849%


✨ Column `iso_3166_alpha3` is completely empty. I shall drop it.

✨ There are a few columns that are >90% empty. I will leave them there for now, as they might prove useful later on.




* **How many values in each list column?**

In [72]:
with pl.Config(tbl_rows=-1):
  display(info_df[list_columns]
          .with_columns([pl.col(c).list.n_unique() for c in list_columns])
          .describe()
          .filter(pl.col('statistic').is_in(['mean', 'std', 'min', '50%', 'max']))
          .transpose(include_header=True, column_names='statistic')
          .sort('max')
  )

column,mean,std,min,50%,max
str,f64,f64,f64,f64,f64
"""iso_3166_alpha…",0.0,0.0,0.0,0.0,0.0
"""from_fictional…",3e-06,0.001584,0.0,0.0,1.0
"""subclass_of""",9e-06,0.003167,0.0,0.0,2.0
"""coordinate_loc…",1.3e-05,0.003748,0.0,0.0,2.0
"""country""",0.000624,0.025093,0.0,0.0,3.0
"""gender""",1.00013,0.021377,0.0,1.0,4.0
"""industry""",0.000319,0.026319,0.0,0.0,4.0
"""date_of_birth""",0.942715,0.377177,0.0,1.0,8.0
"""date_of_death""",0.942715,0.377177,0.0,1.0,8.0
"""country_of_cit…",0.8589,0.564249,0.0,1.0,10.0


✨ The only list column that contains only one value is `from_fictional_universe`. I shall flatten it.

Other columns make sense to have more than one unique value per row, except for date of birth column. Are they simply duplicates?

In [89]:
dob_year_month = info_df['date_of_birth'].filter(info_df['date_of_birth'].list.len() > 1).list.eval(pl.element().struct[0].str.slice(1, 6))

print(f'Out of {dob_year_month.len():,} rows that have more than one date of birth,'
      f'{(dob_year_month.list.n_unique() == 1).sum() / dob_year_month.len() * 100:.2f}% '
      'have the same date of birth duplicated.')

✨ For our purposes, I'll simply take the first date of birth that appears in each row. 

* **Unique values**

In [53]:
unique_value_counts = []
for column in list_columns:
    unique_value_counts.append((column, info_df[column].list.explode().n_unique()))
for column in [c for c in info_df.columns if c not in list_columns]:
    unique_value_counts.append((column, info_df[column].n_unique()))
    
unique_df = (pd.DataFrame(unique_value_counts, columns=['column', 'unique_values'])
            .sort_values('unique_values')
            )
unique_df.style.format('{:,}', subset='unique_values')

Unnamed: 0,column,unique_values
26,type,1
18,iso_3166_alpha3,1
20,from_fictional_universe,6
16,subclass_of,19
17,coordinate_location,26
5,gender,45
15,industry,71
10,country,118
4,instance_of,357
13,instrument,767


✨ The column `type` contains only one value for all rows. I shall remove it.

Interesting points:
* apparently there are 45(!) different genders.
* Many columns have a lot of unique values.

✨ For columns with many unique values, I shall bin the less common one (replace them with *`other`*).

A summary table up until now:

In [57]:
pd.merge(unique_df, empty_df, on='column').style.format({'percent_empty': '{:.3f}%',
                                                         'num_empty': '{:,}',
                                                         'unique_values': '{:,}'})

Unnamed: 0,column,unique_values,num_empty,percent_empty
0,type,1,0,0.000%
1,iso_3166_alpha3,1,1993448,100.000%
2,from_fictional_universe,6,1993443,100.000%
3,subclass_of,19,1993431,99.999%
4,coordinate_location,26,1993423,99.999%
5,gender,45,302,0.015%
6,industry,71,1993082,99.982%
7,country,118,1992209,99.938%
8,instance_of,357,0,0.000%
9,instrument,767,1904077,95.517%


---

✨ **Observations:**

There are a few types of columns:
1. Redundant columns: columns that don't add information, either because they contain the same value throughout, have many missing values or do not contribute to our specific needs. The columns are:
    - `type` (same value for all rows)
    - `numeric_id` (contained in `id`)
    - `descriptions` (unneeded information)
    - `labels` (unneeded information)
    - `coordinate_location`(>99% empty)
    - `country` (according to the Wikidata site, this property should not be used for humans)
    - `date_of_death` (has wrong information)
    - `iso_3166_alpha3` (empty)

2. Columns with a single value per row: columns that (should) have zero or one values for each row. These columns are:
    - `id`
    - `en_label`
    - `en_wiki`
    - `en_description`
    - `date of birth`

3. Columns with *'interesting'* values per row: columns that contain zero, one or more values for each row, where every (major) value is *interesting* for our purpose. These columns are:
    - `instance_of`
    - `gender` (apparently an entity can have multiple genders)
    - `sitelinks`
    - `instrument`
    - `worked_at`
    - `country_of_citizenship`
    - `educated_at`
    - `occupation`
    - `genre`
    - `industry`
    - `subclass_of`
    - `member_of`
    - `from_fictional_universe`

4. Columns that have multiple values per row: columns that have multiple values per row, where an aggregation of these values is *interesting* for our purposes. These columns are:
    - `sitelinks`
    - `aliases` (We don't care what the aliases *are*, maybe just how many are there)
    - `instrument`
    - `worked_at`
    - `country_of_citizenship`
    - `educated_at`
    - `occupation`
    - `genre`
    - `industry`

 **Note** that a column might be a member of both the third group as well as the fourth group.

Handling strategy for each type of column:
1. Redundant columns --> drop columns.
2. Columns with a single value per row: --> leave as is (flatten if in a list or a struct).
3. Columns with *'interesting'* values per row --> encode into binary columns (bin uncommon values, or combine with more common ones) and drop original.
4. Columns that have multiple values per row --> create a column with the aggregate metric and drop original.

Types 1 and 2 are easy as the process is automatic. For types 3 and 4, we would have to look at the distribution of values in each of the columns. Before we can do that, though, it would be helpful to convert the codes in the dataset into human-readable labels. 

#### Converting Codes Into Labels

Many columns have 'numeric id's in them. These are numeric code to wikidata's property. To understand the data, we need to convert them to human-readable labels. I weill grab them straight from the wikidata website (the API has a small request limit, so I won't be using it).

First I'll get all the codes needed to be converted.

In [101]:
columns_with_codes = ['instance_of', 'gender', 'worked_at',
                      'country_of_citizenship', 'educated_at', 'occupation', 'instrument', 'genre', 'industry',
                      'subclass_of', 'member_of', 'from_fictional_universe']
unique_codes = [info_df[col].list.eval(pl.element().struct[0]).explode().unique().drop_nulls() for col in columns_with_codes]

In [102]:
sum([len(df) for df in unique_codes])

125026

In [123]:
import requests
from bs4 import BeautifulSoup
import time

In [112]:
def code_to_label(code):
    request = requests.get(f'https://www.wikidata.org/wiki/Q{code}')
    if request.status_code != 200:
        raise
    page = request.content
    soup = BeautifulSoup(page)
    label = soup.find('span', {'class': 'wikibase-title-label'}).text
    return label

In [113]:
code_to_label('47064')

'military personnel'

In [127]:
(PROCESSED_DATA_DIR / 'codes_labels').mkdir(exist_ok=True)

for s in unique_codes:
    code2labels = {}
    for code in tqdm(s):
        code2labels[code] = code_to_label(code)
    with open(PROCESSED_DATA_DIR / 'codes_labels' / f'{s.name}.json', 'w') as f:
        json.dump(code2labels, f)

# TODO: do the opposite, request only common codes
# TODO: utf-8?

100%|██████████| 357/357 [04:23<00:00,  1.35it/s]
100%|██████████| 44/44 [00:38<00:00,  1.14it/s]
  1%|          | 240/37768 [03:43<9:42:01,  1.07it/s] 


KeyboardInterrupt: 

---

With all that, let us clean the dataframe:

In [46]:
def clean_info_df(df: pl.DataFrame):
      columns_to_drop = ['type', 'numeric_id', 'descriptions', 'labels', 'iso_3166_alpha3', 'coordinate_location', 'country', 'iso_3166_alpha3']
      struct_columns = [c for c in df.columns if df[c].dtype == pl.List and c not in columns_to_drop]
      
     
      df = (df
            .filter(df['id'].is_first_distinct()) # Remove duplicate rows
            .drop(columns_to_drop) # drop redundant columns
            .with_columns([pl.col(col).list.eval(pl.element().struct[0]) for col in struct_columns]) # unnest structs
      )
      
      
      # columns_to_encode = [c for c in df.columns if df[c].dtype == pl.List and c not in
      #                         [*columns_to_drop, 'date_of_birth', 'date_of_death']]


      # unique_sitelinks = df['sitelinks'].explode().unique()

      # unique_parents = df['instance_of'].explode().unique()

      # sitelinks_columns = [pl.col('sitelinks').list.contains(site).alias(f'is_in_{site}') for site in unique_sitelinks]

      # parents_columns = [pl.col('instance_of').list.contains(parent).alias(f'is_instance_of_{parent}') for parent in unique_parents]



      # df = (df
      #       .with_columns([
      #       *sitelinks_columns,
      #       pl.col('aliases').list.len().alias('num_aliases'),
      #       *parents_columns,
      #       ]))

      return df


id,numeric_id,en_label,en_wiki,en_description,sitelinks,aliases,instance_of,gender,date_of_birth,date_of_death,worked_at,country_of_citizenship,country,educated_at,occupation,instrument,genre,industry,subclass_of,coordinate_location,member_of,from_fictional_universe
str,str,str,str,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[f64],list[str],list[str]
"""Q17122148""","""17122148""","""Albrecht Josep…","""Albrecht_Josep…","""German screenw…","""dewiki""","[""de"", ""en"", … ""sk""]","[""5""]","[""6581097""]","[""+1901-11-20T00:00:00Z""]","[""+1901-11-20T00:00:00Z""]",[],"[""183""]",[],[],"[""28389"", ""7042855"", ""36180""]",[],[],[],[],[],[],[]
"""Q17122148""","""17122148""","""Albrecht Josep…","""Albrecht_Josep…","""German screenw…","""enwiki""","[""de"", ""en"", … ""sk""]","[""5""]","[""6581097""]","[""+1901-11-20T00:00:00Z""]","[""+1901-11-20T00:00:00Z""]",[],"[""183""]",[],[],"[""28389"", ""7042855"", ""36180""]",[],[],[],[],[],[],[]
"""Q17122148""","""17122148""","""Albrecht Josep…","""Albrecht_Josep…","""German screenw…","""frwiki""","[""de"", ""en"", … ""sk""]","[""5""]","[""6581097""]","[""+1901-11-20T00:00:00Z""]","[""+1901-11-20T00:00:00Z""]",[],"[""183""]",[],[],"[""28389"", ""7042855"", ""36180""]",[],[],[],[],[],[],[]
"""Q17122148""","""17122148""","""Albrecht Josep…","""Albrecht_Josep…","""German screenw…","""arwiki""","[""de"", ""en"", … ""sk""]","[""5""]","[""6581097""]","[""+1901-11-20T00:00:00Z""]","[""+1901-11-20T00:00:00Z""]",[],"[""183""]",[],[],"[""28389"", ""7042855"", ""36180""]",[],[],[],[],[],[],[]
"""Q17122148""","""17122148""","""Albrecht Josep…","""Albrecht_Josep…","""German screenw…","""arzwiki""","[""de"", ""en"", … ""sk""]","[""5""]","[""6581097""]","[""+1901-11-20T00:00:00Z""]","[""+1901-11-20T00:00:00Z""]",[],"[""183""]",[],[],"[""28389"", ""7042855"", ""36180""]",[],[],[],[],[],[],[]
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Q521785""","""521785""","""Mauro Sérgio V…","""Maurinho_(foot…","""Brazilian foot…","""enwiki""","[""es"", ""es"", … ""fr""]","[""5""]","[""6581097""]","[""+1978-10-11T00:00:00Z""]","[""+1978-10-11T00:00:00Z""]",[],"[""155""]",[],[],"[""937857""]",[],[],[],[],[],[],[]
"""Q521785""","""521785""","""Mauro Sérgio V…","""Maurinho_(foot…","""Brazilian foot…","""arzwiki""","[""es"", ""es"", … ""fr""]","[""5""]","[""6581097""]","[""+1978-10-11T00:00:00Z""]","[""+1978-10-11T00:00:00Z""]",[],"[""155""]",[],[],"[""937857""]",[],[],[],[],[],[],[]
"""Q521785""","""521785""","""Mauro Sérgio V…","""Maurinho_(foot…","""Brazilian foot…","""ptwiki""","[""es"", ""es"", … ""fr""]","[""5""]","[""6581097""]","[""+1978-10-11T00:00:00Z""]","[""+1978-10-11T00:00:00Z""]",[],"[""155""]",[],[],"[""937857""]",[],[],[],[],[],[],[]
"""Q521785""","""521785""","""Mauro Sérgio V…","""Maurinho_(foot…","""Brazilian foot…","""arwiki""","[""es"", ""es"", … ""fr""]","[""5""]","[""6581097""]","[""+1978-10-11T00:00:00Z""]","[""+1978-10-11T00:00:00Z""]",[],"[""155""]",[],[],"[""937857""]",[],[],[],[],[],[],[]


In [96]:
info_df.write_parquet(PROCESSED_DATA_DIR / 'wikidata_all.parquet')

Aliases not included in the pageviews data

In [60]:
(info_df['numeric_id'] == info_df['id'].str.replace('Q', '')).all()

True

Convert code to label description:
- https://stackoverflow.com/questions/59737076/how-to-get-a-label-of-a-property-from-wikidata
- https://stackoverflow.com/questions/72704205/how-to-convert-wiki-data-qid-to-entity-and-vice-versa-in-python

1. Column-specific questions:
    1. Can an entity have more than one `gender`? Can an entity have no `gender`? If so - why?
    2. Can an entity have no `date_of_birth`? why?


### Additional Features Extraction

In [None]:
info_df.filter(info_df['en_description'].str.starts_with('fictional'))

id,numeric_id,en_label,en_wiki,en_description,type,sitelinks,descriptions,labels,aliases,instance_of,gender,date_of_birth,date_of_death,worked_at,country_of_citizenship,country,educated_at,occupation,instrument,genre,industry,subclass_of,coordinate_location,iso_3166_alpha3,member_of,from_fictional_universe
str,str,str,str,str,str,list[struct[3]],list[struct[2]],list[struct[2]],list[struct[2]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[1]],list[struct[2]],list[null],list[struct[1]],list[struct[1]]
"""Q16209139""","""16209139""","""Norman Gunston…","""Norman_Gunston…","""fictional char…","""item""","[{""enwiki"",""Norman Gunston"",""Norman_Gunston""}]","[{""nl"",""zanger""}, {""en"",""fictional character""}, {""uk"",""вигаданий персонаж""}]","[{""en"",""Norman Gunston""}, {""de"",""Norman Gunston""}, … {""sq"",""Norman Gunston""}]",[],"[{""5""}]","[{""6581097""}]",[],[],[],[],[],[],"[{""177220""}]","[{""17172850""}]",[],[],[],[],[],[],[]
"""Q121026451""","""121026451""","""Mae Paner""","""Mae_Paner""","""fictional char…","""item""","[{""enwiki"",""Mae Paner"",""Mae_Paner""}]","[{""en"",""fictional character""}, {""uk"",""вигаданий персонаж""}]","[{""en"",""Mae Paner""}, {""de"",""Mae Paner""}, … {""pt-br"",""Mae Paner""}]",[],"[{""5""}]","[{""6581072""}]",[],[],[],[],[],[],"[{""2259451""}, {""1476215""}]",[],[],[],[],[],[],[],[]
"""Q5460334""","""5460334""","""Flora Malherbe…","""Flora_Malherbe…","""fictional char…","""item""","[{""enwiki"",""Flora Malherbe"",""Flora_Malherbe""}]","[{""nl"",""monteur""}, {""en"",""fictional character on the American TV sitcom The Andy Griffith Show and its successor, Mayberry RFD""}]","[{""en"",""Flora Malherbe""}, {""es"",""Flora Malherbe""}, {""ast"",""Flora Malherbe""}]",[],"[{""5""}]","[{""6581072""}]",[],[],[],[],[],[],"[{""327029""}]",[],[],[],[],[],[],[],[]
"""Q4751616""","""4751616""","""Anaranya""","""Anaranya""","""fictional char…","""item""","[{""enwiki"",""Anaranya"",""Anaranya""}, {""ptwiki"",""Anaranya"",""Anaranya""}, … {""ruwiki"",""Анаранья"",""Анаранья""}]","[{""bn"",""হিন্দু পৌরাণিক চরিত্র""}, {""en"",""fictional character""}, {""uk"",""вигаданий персонаж""}]","[{""en"",""Anaranya""}, {""pt"",""Anaranya""}, … {""ru"",""Анаранья""}]",[],"[{""5""}]","[{""6581097""}]",[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
"""Q7844223""","""7844223""","""Tristram the Y…","""Tristan_the_Yo…","""fictional char…","""item""","[{""enwiki"",""Tristan the Younger"",""Tristan_the_Younger""}]","[{""en"",""fictional character in Arthurian romances""}]","[{""en"",""Tristram the Younger""}, {""ast"",""Tristram the Younger""}]",[],"[{""5""}]","[{""6581097""}]",[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Q7174933""","""7174933""","""Peter Jairus F…","""Peter_Jairus_F…","""fictionalized …","""item""","[{""enwiki"",""Peter Jairus Frigate"",""Peter_Jairus_Frigate""}]","[{""en"",""fictionalized version of the science fiction author Philip José Farmer in Riverworld series""}]","[{""en"",""Peter Jairus Frigate""}, {""de"",""Peter Jairus Frigate""}, … {""ast"",""Peter Jairus Frigate""}]",[],"[{""5""}]","[{""6581097""}]",[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
"""Q26409637""","""26409637""","""Amy Teo""","""Amy_Teo""","""fictional char…","""item""","[{""enwiki"",""Amy Teo"",""Amy_Teo""}]","[{""en"",""fictional character from the BBC medical drama Holby City""}]","[{""en"",""Amy Teo""}, {""nl"",""Amy Teo""}, … {""ast"",""Amy Teo""}]",[],"[{""5""}]","[{""6581072""}]",[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
"""Q10788162""","""10788162""","""Lý Ông Trọng""","""Ruan_Wengzhong…","""fictional char…","""item""","[{""viwiki"",""Lý Ông Trọng"",""Lý_Ông_Trọng""}, {""zhwiki"",""阮翁仲"",""阮翁仲""}, … {""thwiki"",""รฺเหวี่ยน เวิงจ้ง"",""รฺเหวี่ยน_เวิงจ้ง""}]","[{""en"",""fictional character""}, {""vi"",""nhân vật hư cấu""}, {""uk"",""вигаданий персонаж""}]","[{""vi"",""Lý Ông Trọng""}, {""zh"",""阮翁仲""}, … {""th"",""รฺเหวี่ยน เวิงจ้ง""}]","[{""vi"",""Lý Thân""}, {""vi"",""Uy mãnh Oanh liệt Phụ tín Đại vương""}, … {""vi"",""Vạn Tín Hầu""}]","[{""5""}]","[{""6581097""}]",[],[],[],"[{""7183""}]",[],[],[],[],[],[],[],[],[],[],[]
"""Q84607917""","""84607917""","""Anduin Wrynn""","""Anduin_Wrynn""","""fictional char…","""item""","[{""enwiki"",""Anduin Wrynn"",""Anduin_Wrynn""}, {""ukwiki"",""Андуїн Рінн"",""Андуїн_Рінн""}]","[{""en"",""fictional character who appears in the Warcraft series of video games by Blizzard Entertainment.""}, {""nl"",""personage uit Warcraft""}, {""it"",""personaggio di Warcraft""}]","[{""en"",""Anduin Wrynn""}, {""nl"",""Anduin Wrynn""}, … {""uk"",""Андуін Рінн""}]",[],"[{""15632617""}, {""5""}]","[{""6581097""}]",[],[],[],[],[],[],"[{""16658189""}]",[],[],[],[],[],[],[],"[{""26181639""}]"


# Time Series Clustering

In [None]:
views_df = pl.read_parquet(PROCESSED_DATA_DIR / 'pageviews_all.parquet')

In [98]:
views_df[:30]

title,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10,2015-11,2015-12,2016-01,2016-02,2016-03,2016-04,2016-05,2016-06,2016-07,2016-08,2016-09,2016-10,2016-11,2016-12,2017-01,2017-02,2017-03,2017-04,2017-05,2017-06,2017-07,2017-08,2017-09,2017-10,2017-11,2017-12,2018-01,2018-02,2018-03,2018-04,…,2020-12,2021-01,2021-02,2021-03,2021-04,2021-05,2021-06,2021-07,2021-08,2021-09,2021-10,2021-11,2021-12,2022-01,2022-02,2022-03,2022-04,2022-05,2022-06,2022-07,2022-08,2022-09,2022-10,2022-11,2022-12,2023-01,2023-02,2023-03,2023-04,2023-05,2023-06,2023-07,2023-08,2023-09,2023-10,2023-11,2023-12
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""Neophyte_II_of…",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,14.0,17.0,26.0,10.0,17.0,24.0,29.0,32.0,51.0,20.0,36.0,46.0,32.0,119.0,27.0,32.0,18.0,26.0,22.0,30.0,30.0,26.0,35,33,33,26,25,24,29,44.0,23,25
"""Olivia_Giovett…",26.0,34.0,52.0,20.0,41.0,40.0,19.0,21.0,27.0,23.0,32.0,21.0,32.0,29.0,57.0,47.0,31.0,21.0,24.0,15.0,42.0,17.0,45.0,30.0,25.0,34.0,36.0,22.0,22.0,22.0,16.0,28.0,30.0,34.0,22.0,19.0,…,39.0,30.0,32.0,34.0,37.0,29.0,14.0,26.0,24.0,36.0,39.0,35.0,20.0,63.0,33.0,39.0,19.0,19.0,30.0,20.0,14.0,21.0,35.0,52.0,25.0,25.0,33.0,20,18,22,31,26,16,20,59.0,41,19
"""Alexander_Gran…",9.0,48.0,24.0,23.0,32.0,24.0,12.0,20.0,19.0,7.0,8.0,14.0,18.0,27.0,11.0,23.0,12.0,19.0,15.0,17.0,11.0,13.0,26.0,14.0,16.0,18.0,13.0,20.0,13.0,26.0,16.0,19.0,32.0,14.0,18.0,12.0,…,5.0,13.0,9.0,12.0,11.0,9.0,14.0,19.0,9.0,7.0,5.0,11.0,4.0,11.0,9.0,7.0,9.0,9.0,4.0,5.0,12.0,72.0,6.0,10.0,10.0,10.0,2.0,7,10,7,2,7,4,9,10.0,1,5
"""Bernard_H._Rae…",17.0,16.0,21.0,20.0,19.0,18.0,10.0,9.0,17.0,5.0,8.0,10.0,5.0,28.0,19.0,11.0,17.0,14.0,13.0,19.0,16.0,13.0,24.0,18.0,10.0,10.0,14.0,13.0,14.0,19.0,5.0,15.0,26.0,12.0,13.0,10.0,…,2.0,1.0,4.0,1.0,5.0,5.0,3.0,5.0,2.0,2.0,3.0,4.0,,12.0,4.0,3.0,2.0,1.0,4.0,,1.0,3.0,8.0,6.0,3.0,1.0,5.0,1,3,5,7,3,1,2,,1,2
"""Leigh_Magar""",47.0,46.0,64.0,36.0,56.0,45.0,41.0,26.0,30.0,48.0,15.0,26.0,19.0,38.0,28.0,38.0,37.0,51.0,24.0,36.0,17.0,15.0,45.0,39.0,26.0,22.0,21.0,14.0,19.0,23.0,16.0,32.0,36.0,21.0,18.0,20.0,…,8.0,8.0,9.0,13.0,6.0,3.0,12.0,13.0,12.0,5.0,8.0,8.0,11.0,3.0,9.0,8.0,13.0,13.0,9.0,20.0,9.0,8.0,14.0,27.0,14.0,13.0,20.0,13,10,9,5,9,9,8,5.0,10,2
"""Carolyn_Stanfo…",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,221.0,294.0,359.0,286.0,650.0,202.0,170.0,169.0,83.0,97.0,107.0,112.0,70.0,78.0,88.0,64.0,54.0,78.0,84.0,74.0,131.0,74.0,86.0,180.0,105.0,75.0,76.0,73,105,56,50,89,86,59,58.0,50,42
"""Boncho_Novakov…",16.0,18.0,26.0,19.0,21.0,16.0,11.0,14.0,18.0,6.0,6.0,15.0,4.0,10.0,10.0,15.0,14.0,12.0,11.0,18.0,14.0,4.0,22.0,13.0,15.0,16.0,8.0,13.0,11.0,8.0,7.0,14.0,18.0,9.0,7.0,11.0,…,4.0,7.0,5.0,3.0,18.0,39.0,5.0,3.0,5.0,5.0,2.0,4.0,5.0,3.0,5.0,5.0,4.0,6.0,7.0,3.0,4.0,5.0,5.0,5.0,3.0,9.0,2.0,3,4,4,3,7,9,2,3.0,2,3
"""Margaret_Hodge…",5665.0,2932.0,2420.0,2482.0,3097.0,2693.0,1955.0,1867.0,3365.0,2059.0,1808.0,3435.0,4174.0,12893.0,4331.0,4311.0,3434.0,2573.0,2110.0,1829.0,1813.0,1953.0,1667.0,2523.0,2442.0,3777.0,1705.0,1577.0,5159.0,1650.0,3699.0,1623.0,2057.0,1368.0,2277.0,2749.0,…,2260.0,1819.0,2069.0,1605.0,1630.0,1890.0,2082.0,1391.0,1126.0,1351.0,2217.0,1430.0,4400.0,1788.0,2612.0,1680.0,1471.0,1743.0,1336.0,2097.0,1272.0,1918.0,2196.0,1194.0,1204.0,1536.0,2234.0,1909,1311,1261,3170,2097,1026,1335,5767.0,2327,3310
"""Carmen_Belén_R…",,,,,,25.0,26.0,31.0,27.0,60.0,35.0,32.0,29.0,20.0,20.0,51.0,23.0,31.0,37.0,31.0,59.0,46.0,51.0,35.0,28.0,31.0,34.0,34.0,25.0,42.0,27.0,39.0,118.0,56.0,36.0,46.0,…,17.0,24.0,56.0,80.0,25.0,32.0,31.0,13.0,24.0,25.0,18.0,27.0,27.0,18.0,46.0,54.0,20.0,24.0,30.0,12.0,14.0,12.0,25.0,27.0,10.0,31.0,36.0,63,29,12,10,15,36,20,15.0,20,17
"""M._Satish_Redd…",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,32.0,49.0,104.0,102.0,84.0,138.0,277.0,…,111.0,152.0,78.0,79.0,62.0,590.0,93.0,119.0,317.0,128.0,66.0,63.0,99.0,78.0,79.0,104.0,109.0,121.0,200.0,129.0,144.0,156.0,121.0,158.0,266.0,233.0,280.0,313,517,1188,155,119,136,128,90.0,124,158


In [118]:
X_train = views_df.select([col for col in views_df.columns if col != 'title']).fill_null(0).to_numpy()
X_train

array([[ 0,  0,  0, ..., 44, 23, 25],
       [26, 34, 52, ..., 59, 41, 19],
       [ 9, 48, 24, ..., 10,  1,  5],
       ...,
       [40, 29, 34, ...,  3,  1,  2],
       [23, 52, 38, ...,  8,  7,  6],
       [ 0,  0,  0, ..., 16,  8,  7]])

In [119]:
X_train.shape

(1976658, 104)

In [120]:
import numpy
import matplotlib.pyplot as plt

from tslearn.clustering import TimeSeriesKMeans
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, \
    TimeSeriesResampler

seed = 0
numpy.random.seed(seed)

sz = X_train.shape[1]

X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train)

# Euclidean k-means
print("Euclidean k-means")
km = TimeSeriesKMeans(n_clusters=3, verbose=True, random_state=seed,
n_jobs=-1)
y_pred = km.fit_predict(X_train)

plt.figure()
for yi in range(3):
    plt.subplot(3, 3, yi + 1)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.text(0.55, 0.85,'Cluster %d' % (yi + 1),
             transform=plt.gca().transAxes)
    if yi == 1:
        plt.title("Euclidean $k$-means")

# DBA-k-means
print("DBA k-means")
dba_km = TimeSeriesKMeans(n_clusters=3,
                          n_init=2,
                          metric="dtw",
                          verbose=True,
                          max_iter_barycenter=10,
                          random_state=seed,
                          n_jobs=-1)
y_pred = dba_km.fit_predict(X_train)

for yi in range(3):
    plt.subplot(3, 3, 4 + yi)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(dba_km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.text(0.55, 0.85,'Cluster %d' % (yi + 1),
             transform=plt.gca().transAxes)
    if yi == 1:
        plt.title("DBA $k$-means")


plt.tight_layout()
plt.show()

143.854 --> 76.592 --> 

KeyboardInterrupt: 

In [112]:
km.inertia_

70.60124743054607

In [114]:
y_pred

array([2, 1, 1, 0, 0, 2, 0, 1, 2, 1, 2, 2, 0, 0, 2, 0, 0, 1, 1, 2, 0, 1,
       2, 2, 0, 0, 1, 0, 0, 2, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 2, 0, 1, 0, 2, 1, 0, 2, 0, 2, 0, 1, 2,
       2, 2, 0, 0, 1, 1, 2, 2, 1, 1, 0, 1, 1, 0, 2, 0, 2, 0, 2, 1, 2, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 2, 2, 2])

In [115]:
y = views_df[:100].with_columns(pl.Series(name='cluster', values=y_pred))

https://github.com/aeon-toolkit/aeon/blob/4a58c4dba5bb37e43979368ad9cf092540e71786/examples/segmentation/segmentation_with_clasp.ipynb