# Imports and Data

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTENC

# Standard Data

In [11]:
df = pd.read_csv('Data/LFM-1b-DemoBiasSub-10k.csv')
df

Unnamed: 0,user_id,artist_id,track_id,play_count,country,age,gender,registered_unixtime
0,384,362,16567,16,UK,35,m,1035849600
1,384,2000,12303,9,UK,35,m,1035849600
2,384,2000,12308,40,UK,35,m,1035849600
3,384,2583,151308,15,UK,35,m,1035849600
4,384,2583,151314,43,UK,35,m,1035849600
...,...,...,...,...,...,...,...,...
350185,50871714,1893,1606576,7,BY,19,f,1342728447
350186,50871714,12189,992547,5,BY,19,f,1342728447
350187,50871714,48241,791386,5,BY,19,f,1342728447
350188,50871714,49735,3561201,5,BY,19,f,1342728447


In [12]:
# Gender distribution
print("Original Gender Distribution:")
print(df['gender'].value_counts())

Original Gender Distribution:
gender
m    265274
f     84916
Name: count, dtype: int64


# Resampling

In [14]:
# Separate majority and minority
df_m = df[df.gender == 'm']
df_f = df[df.gender == 'f']

n_majority = len(df_m)

# Oversample minority by duplication
df_minority_upsampled = df_f.sample(
    n=n_majority,
    replace=True,
    random_state=42
)

# Combine
df_resampled = pd.concat([df_m, df_minority_upsampled], axis=0).reset_index(drop=True)

df_resampled

Unnamed: 0,user_id,artist_id,track_id,play_count,country,age,gender,registered_unixtime
0,384,362,16567,16,UK,35,m,1035849600
1,384,2000,12303,9,UK,35,m,1035849600
2,384,2000,12308,40,UK,35,m,1035849600
3,384,2583,151308,15,UK,35,m,1035849600
4,384,2583,151314,43,UK,35,m,1035849600
...,...,...,...,...,...,...,...,...
530543,38677194,8530,3474835,7,TR,23,f,1302726750
530544,29692019,4115,29977,11,PL,-1,f,1274029323
530545,30300446,6122,54691,6,PL,19,f,1276098611
530546,13699564,3333,22976,53,UK,-1,f,1226618938


In [15]:
print("Resampling Gender Distribution:")
print(df_resampled['gender'].value_counts())

Resampling Gender Distribution:
gender
m    265274
f    265274
Name: count, dtype: int64


In [None]:
# Save the resampled dataset
df_resampled.to_csv('Data/LFM-1b-DemoBiasSub-10k-Resampled.csv', index=False)

# SMOTE

In [15]:
# Define the identifier columns that should not be oversampled.
identifier_cols = ['user_id', 'track_id']

# For oversampling, drop the identifier columns.
X_features = df.drop(columns=identifier_cols + ['gender']).copy()
y = df['gender']

# Identify categorical features among X_features (if any)
cat_features = X_features.select_dtypes(include=['object']).columns.tolist()

# Label encode categorical features
encoders = {}
for col in cat_features:
    le = LabelEncoder()
    X_features[col] = le.fit_transform(X_features[col])
    encoders[col] = le

# Get indices of these categorical features in X_features
categorical_indices = [X_features.columns.get_loc(col) for col in cat_features]

In [16]:
# Create SMOTENC instance without using nn_kwargs.
smotenc = SMOTENC(
    categorical_features=categorical_indices,
    random_state=42,
    n_jobs=-1,         # Use all available CPU cores, if supported.
    k_neighbors=5      # Adjust neighbors as needed.
)

# Perform oversampling on the feature set only.
X_features_res, y_res = smotenc.fit_resample(X_features, y)

# Convert the oversampled features back into a DataFrame.
df_features_res = pd.DataFrame(X_features_res, columns=X_features.columns)

# Prepare placeholder columns for identifiers; we will attempt to recover original identifier values.
df_features_res['user_id'] = np.nan
df_features_res['track_id'] = np.nan

# Prepare a DataFrame of the original features with identifier columns for matching.
original_features = X_features.copy()
original_features['user_id'] = df['user_id'].values
original_features['track_id'] = df['track_id'].values



In [17]:
# Merge the resampled data with the original identifiers by matching on the oversampled feature columns.
# Only rows that exactly match an original row will have their identifiers recovered.
merged = df_features_res.merge(
    original_features,
    on=list(X_features.columns),
    how='left',
    suffixes=('', '_orig')
)

# Use the original identifiers where available.
merged['user_id'] = merged['user_id'].combine_first(merged['user_id_orig'])
merged['track_id'] = merged['track_id'].combine_first(merged['track_id_orig'])
merged.drop(columns=['user_id_orig', 'track_id_orig'], inplace=True)

# Reconstruct the final DataFrame by attaching the oversampled target.
df_SMOTE = pd.concat([merged, pd.Series(y_res, name='gender').reset_index(drop=True)], axis=1)
df_SMOTE

Unnamed: 0,artist_id,play_count,country,age,registered_unixtime,user_id,track_id,gender
0,362,16,174,35,1035849600,384.0,16567.0,m
1,2000,9,174,35,1035849600,384.0,12303.0,m
2,2000,40,174,35,1035849600,384.0,12308.0,m
3,2583,15,174,35,1035849600,384.0,151308.0,m
4,2583,43,174,35,1035849600,384.0,151314.0,m
...,...,...,...,...,...,...,...,...
547658,695,6,123,28,1201201050,,,
547659,10374,5,10,22,1334171853,,,
547660,7738,11,147,18,1239713723,,,
547661,3082,20,73,22,1266514222,,,


In [18]:
print("SMOTE Gender Distribution:")
print(df_SMOTE['gender'].value_counts())

SMOTE Gender Distribution:
gender
m    265274
f    265274
Name: count, dtype: int64


In [None]:
# Save the debiased dataset
df_SMOTE.to_csv('Data/LFM-1b-DemoBiasSub-10k-SMOTE.csv', index=False)