In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
from sklearn.metrics import roc_auc_score

from utils import *
from models import *
from tqdm import tqdm

import scipy.stats as stats

import pdb
import sys

# Exploring the Electronics DataFrame

In [2]:
df = pd.read_csv('data/marketBias/data/df_electronics_original.csv') 

In [3]:
# Summarise the raw frame: schema, non-null counts and memory footprint.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

# Statistical summary of the numerical columns
print(df.describe())

# Data type of each column
print(df.dtypes)

# How often each value of 'user_attr' occurs
print(df['user_attr'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1292954 entries, 0 to 1292953
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   item_id     1292954 non-null  int64  
 1   user_id     1292954 non-null  int64  
 2   rating      1292954 non-null  float64
 3   timestamp   1292954 non-null  object 
 4   model_attr  1292954 non-null  object 
 5   category    1292954 non-null  object 
 6   brand       331120 non-null   object 
 7   year        1292954 non-null  int64  
 8   user_attr   174124 non-null   object 
 9   split       1292954 non-null  int64  
dtypes: float64(1), int64(4), object(5)
memory usage: 98.6+ MB
None
            item_id       user_id        rating          year         split
count  1.292954e+06  1.292954e+06  1.292954e+06  1.292954e+06  1.292954e+06
mean   4.183588e+03  5.605127e+05  4.051482e+00  2.012938e+03  1.747587e-01
std    2.525346e+03  3.342374e+05  1.379732e+00  2.643513e+00  5.506810e-01

In [4]:
print(df['model_attr'].value_counts())

Female         481171
Male           429715
Female&Male    382068
Name: model_attr, dtype: int64


# Electronics Dataframe

In [5]:
import pandas as pd
import numpy as np
import pickle

# Load the CSV file
df_electronics = pd.read_csv('data/marketBias/data/df_electronics.csv') 

Electronics is another review dataset collected from the Electronics
category on Amazon with Clothing as an auxiliary category. This
dataset is built on top of the public Amazon 2018 Dataset [24] and
further processed to facilitate the research goals in this paper. We
regard the gender as the target marketing bias on this dataset.

In [6]:
# Filter out rows where 'user_attr' is NaN
df_electronics = df_electronics.dropna(subset=['user_attr'])

In [7]:
print(df_electronics['model_attr'].value_counts())

Female         65846
Male           56870
Female&Male    51408
Name: model_attr, dtype: int64


In [8]:
# Rename columns to the names BPRTrainLoader expects.
# 'rating' already has the expected name, so it needs no entry in the mapping.
bpr_column_names = {'user_id': 'userid', 'item_id': 'itemid', 'user_attr': 'gender'}
df_electronics = df_electronics.rename(columns=bpr_column_names)

In [9]:
#Change rating to integers
df_electronics['rating'] = df_electronics['rating'].astype(int)

In [10]:
# Re-index users and items as dense, zero-based integer codes so that
# 'userid' spans 0..n_users-1 and 'itemid' spans 0..n_items-1.
user_codes, unique_users = pd.factorize(df_electronics['userid'])
item_codes, unique_items = pd.factorize(df_electronics['itemid'])

# Write the dense codes back over the original id columns
df_electronics['userid'] = user_codes
df_electronics['itemid'] = item_codes

In [11]:
# Get number of users and number of items
n_users = df_electronics['userid'].nunique()
n_items = df_electronics['itemid'].nunique()

In [12]:
# Check userid and itemid
print(df_electronics['userid'].min(), df_electronics['userid'].max())
print(df_electronics['itemid'].min(), df_electronics['itemid'].max())

0 132392
0 8187


In [13]:
# Ensure the DataFrame is sorted by 'userid' to maintain order
df_electronics = df_electronics.sort_values(by='userid').reset_index(drop=True)

# Integer-encode 'gender' unless it is already numeric. Testing for a
# non-numeric dtype (rather than dtype == 'object') also covers pandas'
# dedicated string dtypes, which the object check would skip silently.
if not pd.api.types.is_numeric_dtype(df_electronics['gender']):
    # Keep the uniques: gender_labels[code] is the original label of each code,
    # so the 0/1 values stay interpretable later on.
    df_electronics['gender'], gender_labels = pd.factorize(df_electronics['gender'])

In [14]:
counts = df_electronics['gender'].value_counts()
print(counts)

1    87424
0    86700
Name: gender, dtype: int64


In [15]:
df_electronics.head()

Unnamed: 0,itemid,userid,rating,timestamp,model_attr,category,brand,year,gender,split
0,0,0,2,1999-12-01,Female,Portable Audio & Video,,1999,0,0
1,323,0,4,2008-01-05,Female,Camera & Photo,Gary Fong,2008,0,0
2,457,0,5,2008-10-30,Female,Camera & Photo,,2008,0,0
3,1218,0,5,2012-01-10,Female,Portable Audio & Video,,2011,0,0
4,1140,0,3,2012-03-15,Female,Camera & Photo,Kodak,2011,0,0


In [16]:
# Item side features: one row per distinct combination of the item columns,
# re-indexed from 0 so the frame can be used on its own.
item_feature_columns = ['itemid', 'model_attr', 'category', 'brand']
original_item_side_features = (
    df_electronics[item_feature_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)


In [17]:
original_item_side_features.head()

Unnamed: 0,itemid,model_attr,category,brand
0,0,Female,Portable Audio & Video,
1,323,Female,Camera & Photo,Gary Fong
2,457,Female,Camera & Photo,
3,1218,Female,Portable Audio & Video,
4,1140,Female,Camera & Photo,Kodak


In [18]:
# Create user side features
original_user_side_features = df_electronics[['userid', 'gender']].drop_duplicates()

# Optionally, you might want to reset the index if you plan to work with this DataFrame directly
#original_user_side_features.reset_index(drop=True, inplace=True)


In [19]:
# Order the per-user rows by 'userid' and rebuild a 0..n-1 index.
# NOTE(review): row position equals the dense user id only if there is exactly
# one row per user — TODO confirm (e.g. no user with two gender values).
original_user_side_features = original_user_side_features.sort_values(by='userid').reset_index(drop=True)

# Integer-encode 'gender' only while it is still stored with object dtype.
# NOTE(review): 'gender' looks to be factorized on df_electronics in an
# earlier cell, in which case this branch is skipped — confirm before relying on it.
if original_user_side_features['gender'].dtype == 'object':
    original_user_side_features['gender'], _ = pd.factorize(original_user_side_features['gender'])

In [20]:
original_user_side_features.head()

Unnamed: 0,userid,gender
0,0,0
1,1,1
2,2,1
3,3,0
4,4,1


In [21]:
counts = original_user_side_features['gender'].value_counts()
print(counts)

0    71043
1    61350
Name: gender, dtype: int64


In [22]:
# Create a dictionary of user side features (if needed in this format)
user_side_features = {
    'userid': original_user_side_features['userid'].values,
    'gender': original_user_side_features['gender'].values,
}

In [23]:
from sklearn.model_selection import train_test_split

# Splitting the data into training and testing sets
train_df, test_df = train_test_split(df_electronics, test_size=0.2, random_state=42)

In [24]:
# Build user->items and item->users adjacency lists for the train and test splits
from collections import defaultdict


def _interaction_maps(interactions):
    """Return (user->items, item->users) defaultdict(list) maps for one split."""
    user_to_items = defaultdict(list)
    item_to_users = defaultdict(list)
    for record in interactions.itertuples(index=False):
        user_to_items[record.userid].append(record.itemid)
        item_to_users[record.itemid].append(record.userid)
    return user_to_items, item_to_users


train_u2i, train_i2u = _interaction_maps(train_df)
test_u2i, test_i2u = _interaction_maps(test_df)


In [25]:
# Reduce each split to a plain dict mapping column name -> NumPy array,
# the format used for the pickled train/test sets.
interaction_columns = ('userid', 'itemid', 'rating')

train_set = {column: train_df[column].values for column in interaction_columns}
test_set = {column: test_df[column].values for column in interaction_columns}

In [26]:
type(train_set['rating'][0])

numpy.int64

In [27]:
import pickle

# Bundle every processed artefact into one dictionary and pickle it.
# 'train_set' / 'test_set' hold the column -> ndarray dicts prepared for
# pickling (not the raw train_df / test_df DataFrames), and 'n_users' reuses
# the value computed earlier, the same way 'n_items' does.
# NOTE(review): the following cell writes a different layout (eight consecutive
# pickles) to this same path and therefore replaces this file — confirm which
# layout downstream code should read.
data_to_pickle = {
    'train_u2i': dict(train_u2i),
    'test_u2i': dict(test_u2i),
    'train_i2u': dict(train_i2u),
    'test_i2u': dict(test_i2u),
    'train_set': train_set,
    'test_set': test_set,
    'user_side_features': user_side_features,
    'n_users': n_users,
    'n_items': n_items
}

output_path = 'data/marketBias/process/process.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(data_to_pickle, f)

print(f"Data has been processed and saved to {output_path}")

Data has been processed and saved to data/marketBias/process/process.pkl


In [28]:
# Write the processed objects to a .pkl file as eight consecutive pickles.
# The order is part of the file format: the loader reads them back with one
# pickle.load() call per object, in this same order.
output_path = 'data/marketBias/process/process.pkl'
objects_in_order = (
    train_u2i,
    train_i2u,
    test_u2i,
    test_i2u,
    train_set,
    test_set,
    user_side_features,
    (n_users, n_items),
)
with open(output_path, 'wb') as f:
    for obj in objects_in_order:
        pickle.dump(obj, f)

print(f"Data has been processed and saved to {output_path}")


Data has been processed and saved to data/marketBias/process/process.pkl


In [29]:
##### load dataset
# The file holds eight consecutive pickles; read them back in the order they
# were written. The last one is the (n_users, n_items) pair.
with open('./data/marketBias/process/process.pkl', 'rb') as f:
    (
        train_u2i,
        train_i2u,
        test_u2i,
        test_i2u,
        train_set,
        test_set,
        user_side_features,
        (n_users, n_items),
    ) = [pickle.load(f) for _ in range(8)]

In [30]:
# Basic size statistics of the processed dataset
print(len(train_u2i))
print(len(train_i2u))

print(type(train_set))
print(train_set['userid'].shape[0])
print(f"Number of Users {n_users}")
print(f"Number of Items {n_items}")

# Interactions: total length of every user's item list, train plus test
num_interactions_train = sum(map(len, train_u2i.values()))
num_interactions_test = sum(map(len, test_u2i.values()))
total_interactions = num_interactions_train + num_interactions_test

print(f"Number of interactions: {total_interactions}")

# Density: observed interactions divided by all possible (user, item) pairs
density = total_interactions / (n_users * n_items)

print(f"Density {density}")
print(f"Density percentage {density * 100}")

# Names of the available user side features
for feature_name in user_side_features:
    print(feature_name)



110587
7840
<class 'dict'>
139299
Number of Users 132393
Number of Items 8188
Number of interactions: 174124
Density 0.0001606259754146209
Density percentage 0.01606259754146209
userid
gender


In [None]:
print(type(user_side_features['gender']))
print(np.unique(user_side_features['gender'], return_counts=True))

In [None]:
print(train_set)

In [None]:
##### load dataset
# Read the eight consecutive pickles of the ML-1M file in order; the last one
# is the (n_users, n_items) pair.
with open('./data/ml-1m/process/process.pkl', 'rb') as f:
    (
        MLtrain_u2i,
        MLtrain_i2u,
        MLtest_u2i,
        MLtest_i2u,
        MLtrain_set,
        MLtest_set,
        MLuser_side_features,
        (MLn_users, MLn_items),
    ) = [pickle.load(f) for _ in range(8)]

In [None]:
type(MLuser_side_features)

In [None]:
print(MLtrain_set)


In [None]:
print(type(MLtrain_u2i))
print(type(MLtrain_i2u))
print(type(MLtest_u2i))
print(type(MLtest_i2u))
print(type(MLtrain_set))
print(type(MLtest_set))
print(type(MLuser_side_features))
print(type(MLn_users))
print(type(MLn_items))

In [None]:
type(MLtrain_set['rating'][0])

In [None]:
type(train_set['rating'][0])


In [None]:
import numpy as np

def _count_nan(value):
    """Count the NaN entries held by one value, descending into containers."""
    if isinstance(value, dict):
        return sum(_count_nan(item) for item in value.values())
    if isinstance(value, np.ndarray):
        # A NaN never compares equal to itself. Comparing the whole array at
        # once is vectorised and works for any number of dimensions, whereas
        # iterating a multi-dimensional array yields sub-arrays whose truth
        # value is ambiguous.
        return int(np.count_nonzero(value != value))
    if isinstance(value, (list, tuple)):
        # Recurse so NaNs inside nested sequences are counted as well.
        return sum(_count_nan(item) for item in value)
    # Plain scalar (or any other object): it is NaN iff it differs from itself.
    return int(bool(value != value))


def count_nan_in_dict(d):
    """Return the number of NaN values stored anywhere in the dictionary `d`.

    Values may be scalars, lists, tuples, NumPy arrays of any shape, or nested
    dictionaries; containers are searched recursively.

    Parameters
    ----------
    d : dict
        Dictionary to inspect. Keys are ignored; only values are examined.

    Returns
    -------
    int
        Total count of NaN entries found (0 for an empty dictionary).
    """
    return sum(_count_nan(value) for value in d.values())

nan_count = count_nan_in_dict(train_set)
print(f"Number of NaN values in the dictionary: {nan_count}")

In [None]:
print(MLtrain_u2i[0])
print(MLtrain_i2u[0])
print(MLtest_u2i[0])
print(MLtest_i2u[0])
print(MLtrain_set)
print(MLtest_set)
print(MLuser_side_features)
print(MLn_users)
print(MLn_items)

In [None]:
print(MLtrain_set["itemid"].max())
print(MLtrain_set["userid"].max())

In [None]:
MLtest_set["itemid"].max()

In [None]:
for feature_name in MLuser_side_features.keys():
    print(feature_name)

In [None]:
MLuser_side_features['userid']


In [None]:
# Show the first 10 entries of MLtrain_u2i, then any later entry whose key is >= 6038
for position, (key, value) in enumerate(MLtrain_u2i.items()):
    if position < 10 or key >= 6038:
        print(f"key: {key}", f"value: {value}")


In [None]:
print(type(train_u2i))
print(type(train_i2u))
print(type(test_u2i))
print(type(test_i2u))
print(type(train_set))
print(type(test_set))
print(type(user_side_features))
print(type(n_users))
print(type(n_items))

In [None]:
def _preview_mapping(label, mapping, limit=10):
    """Print `label`, then at most `limit` (key, value) pairs of `mapping`."""
    print(label)
    for shown, (key, value) in enumerate(mapping.items()):
        if shown >= limit:
            break
        print(key, value)


# First few entries of each interaction mapping (one helper call per mapping
# instead of four copies of the same loop)
_preview_mapping("train_u2i", train_u2i)
_preview_mapping("train_i2u", train_i2u)
_preview_mapping("test_u2i", test_u2i)
_preview_mapping("test_i2u", test_i2u)

# Remaining loaded objects
print(train_set)
print(test_set)
print(user_side_features)
print(n_users)
print(n_items)