# Imports and Data

In [4]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

# Standard Data

In [3]:
df = pd.read_csv('Data/LFM-1b-DemoBiasSub-10k.csv')
df

Unnamed: 0,user_id,artist_id,track_id,play_count,country,age,gender,registered_unixtime
0,384,362,16567,16,UK,35,m,1035849600
1,384,2000,12303,9,UK,35,m,1035849600
2,384,2000,12308,40,UK,35,m,1035849600
3,384,2583,151308,15,UK,35,m,1035849600
4,384,2583,151314,43,UK,35,m,1035849600
...,...,...,...,...,...,...,...,...
350185,50871714,1893,1606576,7,BY,19,f,1342728447
350186,50871714,12189,992547,5,BY,19,f,1342728447
350187,50871714,48241,791386,5,BY,19,f,1342728447
350188,50871714,49735,3561201,5,BY,19,f,1342728447


In [5]:
# Gender distribution
print("Original Gender Distribution:")
print(df['gender'].value_counts())

Original Gender Distribution:
gender
m    265274
f     84916
Name: count, dtype: int64


# Resampling

In [14]:
# Separate majority and minority
df_m = df[df.gender == 'm']
df_f = df[df.gender == 'f']

n_majority = len(df_m)

# Oversample minority by duplication
df_minority_upsampled = df_f.sample(
    n=n_majority,
    replace=True,
    random_state=42
)

# Combine
df_resampled = pd.concat([df_m, df_minority_upsampled], axis=0).reset_index(drop=True)

df_resampled

Unnamed: 0,user_id,artist_id,track_id,play_count,country,age,gender,registered_unixtime
0,384,362,16567,16,UK,35,m,1035849600
1,384,2000,12303,9,UK,35,m,1035849600
2,384,2000,12308,40,UK,35,m,1035849600
3,384,2583,151308,15,UK,35,m,1035849600
4,384,2583,151314,43,UK,35,m,1035849600
...,...,...,...,...,...,...,...,...
530543,38677194,8530,3474835,7,TR,23,f,1302726750
530544,29692019,4115,29977,11,PL,-1,f,1274029323
530545,30300446,6122,54691,6,PL,19,f,1276098611
530546,13699564,3333,22976,53,UK,-1,f,1226618938


In [15]:
print("Resampling Gender Distribution:")
print(df_resampled['gender'].value_counts())

Resampling Gender Distribution:
gender
m    265274
f    265274
Name: count, dtype: int64


In [16]:
# Save the resampled dataset
df_resampled.to_csv('Data/LFM-1b-DemoBiasSub-10k-Resampled.csv', index=False)

# SMOTE

In [7]:
# We want to treat all non-numeric features (except the target 'gender') as categorical.
# Get all columns of type object.
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
# Exclude the target column if it is object type.
cat_features = [col for col in categorical_cols if col != 'gender']

# Save the original unique values for each categorical column (for documentation purposes)
cat_maps = {}
for col in cat_features:
    cat_maps[col] = df[col].unique()

In [8]:
# One-Hot Encode the Features for SMOTE
# Separate features and target
X_orig = df.drop('gender', axis=1)
y = df['gender']

# Apply one-hot encoding for all categorical features.
# We use drop_first=False so that we have a complete set of dummy columns to later reconstruct the original data.
X_encoded = pd.get_dummies(X_orig, columns=cat_features, drop_first=False)

In [9]:
# Apply SMOTE to Oversample the Minority Class
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_encoded, y)

# Convert back to DataFrame/Series to ease further processing.
X_resampled = pd.DataFrame(X_resampled, columns=X_encoded.columns)
y_resampled = pd.Series(y_resampled, name='gender')

In [None]:
# Convert the One-Hot Encoded Categorical Features Back to Original Format
X_final = X_resampled.copy()

for col in cat_features:
    # Identify all dummy columns corresponding to the current categorical feature.
    # They should be named like 'column_value'.
    dummy_cols = [c for c in X_resampled.columns if c.startswith(col + "_")]
    if dummy_cols:
        # Retrieve the values for these dummy columns as a NumPy array.
        dummy_array = X_resampled[dummy_cols].values
        # For each row, find the index of the highest value among the dummies for this variable.
        max_idxs = np.argmax(dummy_array, axis=1)
        # Map the argmax index back to the category.
        recovered_cats = [dummy_cols[idx].split(f"{col}_")[1] for idx in max_idxs]
        # Insert the reconstructed categorical column into the final DataFrame.
        X_final[col] = recovered_cats
        # Drop the dummy columns, since we now have the categorical column.
        X_final.drop(columns=dummy_cols, inplace=True)

# Reconstruct the data
df_SMOTE = pd.concat([X_final, y_resampled], axis=1)

df_SMOTE

Unnamed: 0,user_id,artist_id,track_id,play_count,age,registered_unixtime,country,gender
0,384,362,16567,16,35,1035849600,UK,m
1,384,2000,12303,9,35,1035849600,UK,m
2,384,2000,12308,40,35,1035849600,UK,m
3,384,2583,151308,15,35,1035849600,UK,m
4,384,2583,151314,43,35,1035849600,UK,m
...,...,...,...,...,...,...,...,...
530543,9111161,1503,26596,12,2,1201203477,AD,f
530544,49156366,10323,203026,13,26,1334186289,AD,f
530545,17836483,6311,340520,10,18,1239713723,SE,f
530546,27126360,1325,253759,8,18,1266458379,AD,f


In [None]:
print("SMOTE Gender Distribution:")
print(df_SMOTE['gender'].value_counts())

SMOTE Gender Distribution:
gender
m    265274
f    265274
Name: count, dtype: int64


In [None]:
# Save the debiased dataset
df_SMOTE.to_csv('Data/LFM-1b-DemoBiasSub-10k-SMOTE.csv', index=False)