In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
import json
# import matplotlib.pyplot as plt
# import shap

In [21]:
df = pd.read_parquet("../data/interim/dfi_ensemble_2022-07-01_2025-10-01_V.1.0.parquet")

# DF partition to simulate data drift in the future

In [22]:
df.columns

Index(['reporting_month', 'ISRC', 'spotify', 'release_type', 'continent',
       'zone', 'quantity', 'unit_price', 'mechanical_fee', 'share_rate',
       'revenue'],
      dtype='object')

In [23]:
print(df['reporting_month'].min())
print(df['reporting_month'].max())

2022/09/01
2025/10/01


## DFI Drift

In [24]:
dfi_drift = df[df['reporting_month'] >= '2023/10/01']

print(dfi_drift['reporting_month'].min())
print(dfi_drift['reporting_month'].max())

2023/10/01
2025/10/01


In [28]:
dfi_drift.to_parquet(
    "../data/interim/dfi_2023-10-01_2025-10-01_V.1.0.parquet",
    engine="pyarrow",   # recomendado
    index=False
)

## DFI

In [25]:
dfi = df[(df['reporting_month'] >= '2023/10/01') & (df['reporting_month'] < '2025/01/01')]

print(dfi['reporting_month'].min())
print(dfi['reporting_month'].max())

2023/10/01
2024/12/01


In [30]:
dfi.to_parquet(
    "../data/interim/dfi_2023-10-01_2024-12-01_V.1.0.parquet",
    engine="pyarrow",   # recomendado
    index=False
)

# Filter Spotify and music release only

In [26]:
dfi['spotify'].unique()

array(['Spotify', 'Other'], dtype=object)

**DROP COLUMNS IN DFI**

In [27]:
dfp = dfi[(dfi['spotify'] == 'Spotify') & (dfi['release_type'] == 'Music Release')].reset_index(drop = True)
dfp = dfp.drop(columns = ["spotify", "release_type", "unit_price", "mechanical_fee", "share_rate"])

print(dfp['reporting_month'].min())
print(dfp['reporting_month'].max())

2023/10/01
2024/12/01


**DROP COLUMNS IN DFI Drift**

In [28]:
dfp_drift = dfi_drift[(dfi_drift['spotify'] == 'Spotify') & (dfi_drift['release_type'] == 'Music Release')].reset_index(drop = True)
dfp_drift = dfp_drift.drop(columns = ["spotify", "release_type", "unit_price", "mechanical_fee", "share_rate"])

print(dfp_drift['reporting_month'].min())
print(dfp_drift['reporting_month'].max())

2023/10/01
2025/10/01


## Check columns that may have unseen values in test

In [29]:
for col in dfp.columns:
    unique_values_total = dfi[col].unique()
    
    unique_values_train = dfp.iloc[:-int(len(dfp)*0.2)][col].unique()
    unique_values_test = dfp.iloc[-int(len(dfp)*0.2):][col].unique()
    
    print(f"Unique values for {col}:")
    print(f"-> {len(unique_values_test)} over {len(unique_values_total)}")
    
    if len(set(unique_values_test) - set(unique_values_train)) == 0:
        print("All seen\n")
    else:
        print(f"NOT all seen. Unseen: {len(set(unique_values_test) - set(unique_values_train))}\n")

Unique values for reporting_month:
-> 3 over 15
NOT all seen. Unseen: 2

Unique values for ISRC:
-> 255 over 1363
NOT all seen. Unseen: 12

Unique values for continent:
-> 7 over 7
All seen

Unique values for zone:
-> 144 over 223
NOT all seen. Unseen: 2

Unique values for quantity:
-> 1651 over 5779
NOT all seen. Unseen: 432

Unique values for revenue:
-> 24583 over 275980
NOT all seen. Unseen: 19294



In [30]:
print(len(set(unique_values_test) - set(unique_values_train)))
print(len(set(unique_values_train) - set(unique_values_test)))

19294
79263


## Split data + add unknown values

In [31]:
dfp_train = dfp.iloc[:-int(len(dfp)*0.2)].copy()
dfp_test = dfp.iloc[-int(len(dfp)*0.2):].copy()

In [14]:
n_replace = int(len(dfp_train) * 0.05)

random_index_isrc = dfp_train.sample(n=n_replace, random_state=42).index
random_index_zone = dfp_train.sample(n=n_replace, random_state=43).index

dfp_train.loc[random_index_isrc, 'ISRC'] = "UNKNOWN"
dfp_train.loc[random_index_zone, 'zone'] = "UNKNOWN"

In [15]:
dfp_train.head()

Unnamed: 0,reporting_month,ISRC,continent,zone,quantity,revenue
0,2023/10/01,GB-LFP-20-86366,Europe,Belarus,13,7.64771e-07
1,2023/10/01,FR-2X4-23-79996,Africa,Namibia,3,7.64924e-07
2,2023/10/01,FR-X20-23-35046,Asia,Nepal,3,7.64924e-07
3,2023/10/01,CA-5KR-21-69877,Europe,Malta,3,7.64924e-07
4,2023/10/01,CA-5KR-21-25668,Africa,Tunisia,3,7.64924e-07


## FIX (adding UNKNOWN values)

In [32]:
json_train = {}
for col in ["reporting_month", "ISRC", "zone", "continent"]:
    json_train[col] = sorted(dfp_train[col].unique())

### Fixing dfp_test

In [33]:
for col in list(json_train.keys())[1:]:
    mask = ~dfp_test[col].isin(json_train[col])
    n_unknown = mask.sum()
    print(f"Column '{col}': {n_unknown} unknown elements ({n_unknown/len(df)*100:.2f}%)")
    dfp_test.loc[mask, col] = "UNKNOWN"

Column 'ISRC': 1204 unknown elements (0.13%)
Column 'zone': 2 unknown elements (0.00%)
Column 'continent': 0 unknown elements (0.00%)


### Fixing dfp_drift

In [34]:
for col in list(json_train.keys())[1:]:
    mask = ~dfp_drift[col].isin(json_train[col])
    n_unknown = mask.sum()
    print(f"Column '{col}': {n_unknown} unknown elements ({n_unknown/len(df)*100:.2f}%)")
    dfp_drift.loc[mask, col] = "UNKNOWN"

Column 'ISRC': 37423 unknown elements (4.02%)
Column 'zone': 53 unknown elements (0.01%)
Column 'continent': 0 unknown elements (0.00%)


In [118]:
print(dfp_train['reporting_month'].min(), dfp_train['reporting_month'].max())
print(dfp_test['reporting_month'].min(), dfp_test['reporting_month'].max())

2023/10/01 2024/10/01
2024/10/01 2024/12/01


## Save data

In [120]:
output_path = "../data/processed/iter1/train/dfp_2023-10-01_2024-10-01_V.1.0.parquet"
os.makedirs(os.path.dirname(output_path), exist_ok=True) 

dfp_train.to_parquet(
    output_path,
    engine="pyarrow",
    index=False
)

In [35]:
# We also save JSON file with the unique values shown on each column
with open(output_path.replace('.parquet', '.json'), 'w') as f:
    json.dump(json_train, f, indent=2)

In [121]:
output_path = "../data/processed/iter1/test/dfp_2024-10-01_2024-12-01_V.1.0.parquet"
os.makedirs(os.path.dirname(output_path), exist_ok=True) 

dfp_test.to_parquet(
    output_path,
    engine="pyarrow",   # recomendado
    index=False
)

In [127]:
output_path = "../data/processed/iter1/drift/dfp_2023-10-01_2025-10-01_V.1.0.parquet"
os.makedirs(os.path.dirname(output_path), exist_ok=True) 

dfp_drift.to_parquet(
    output_path,
    engine="pyarrow",   # recomendado
    index=False
)