In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Column names of the headerless TSV, in file order.
columns = [
    'user_id',
    'country',
    'age',
    'sex',
    'track_name',
    'artist',
    'timestamp',
]
# Explicit dtypes for every column except 'timestamp', which has no entry here.
column_types = dict(
    user_id=np.int32,
    country=str,
    age=np.int32,
    sex=str,
    track_name=str,
    artist=str,
)

In [3]:
# Load the first 50M rows of the headerless, tab-separated listening log.
# Column names come from `columns` (defined alongside `column_types`) so the
# schema is declared in exactly one place; malformed lines are skipped.
# NOTE(review): the absolute path is machine-specific — consider a configurable
# data directory.
df = pd.read_table(
    '/data/LFM-2b.tsv',
    nrows=50000000,
    header=None,
    names=columns,
    dtype=column_types,
    parse_dates=['timestamp'],
    on_bad_lines='skip',
)

In [4]:
# Collapse the raw listening events into one row per (user, track, artist).
# "interactions" is the number of events in the group; the user attributes are
# taken from the first event of that group.
df = (
    df.groupby(["user_id", "track_name", "artist"], as_index=False)
    .agg({"timestamp": "count", "country": "first", "sex": "first", "age": "first"})
    .rename(columns={"timestamp": "interactions"})
)

# Accepted values for the user attributes.
valid_sexes = ["m", "f", "n"]  # Replace with your valid sexes
valid_age_range = (0, 80)  # Replace with your valid age range (bounds inclusive)
valid_rows = df["sex"].isin(valid_sexes) & df["age"].between(
    valid_age_range[0], valid_age_range[1]
)

# Keep valid rows, turn empty country strings into NaN and drop rows without a
# country. This is one chain that returns a new frame: the previous
# `df["country"].replace(..., inplace=True)` was a chained in-place update on a
# filtered frame, which triggers chained-assignment warnings and has no effect
# under pandas Copy-on-Write.
df = (
    df[valid_rows]
    .assign(country=lambda frame: frame["country"].replace("", np.nan))
    .dropna(subset=["country"])
)

# Keep only songs, i.e. (track_name, artist) pairs, whose interactions summed
# over all remaining users reach at least 100.
track_artist_counts = df.groupby(["track_name", "artist"])["interactions"].transform(
    "sum"
)
df = df[track_artist_counts >= 100]

# Keep only users whose interactions summed over the remaining songs reach at
# least 10.
user_id_counts = df.groupby("user_id")["interactions"].transform("sum")
df = df[user_id_counts >= 10]

# Safety net: the groupby above already yields one row per
# (user_id, track_name, artist), so this is not expected to remove anything.
df = df.drop_duplicates(
    subset=["user_id", "track_name", "artist", "country", "age", "sex"]
)

# Distinct (user_id, age, sex, country) combinations, indexed by user_id.
user_data = (
    df.loc[:, ["user_id", "age", "sex", "country"]]
    .drop_duplicates()
    .set_index("user_id")
)

# One encoder per categorical column so each keeps its own label mapping.
sex_encoder = LabelEncoder()
country_encoder = LabelEncoder()

# Fit each encoder on its column and store the resulting integer codes
# next to the original string values.
user_data = user_data.assign(
    sex_encoded=sex_encoder.fit_transform(user_data["sex"]),
    country_encoded=country_encoder.fit_transform(user_data["country"]),
)
user_data.to_csv("users.csv")


# Total interactions per distinct (track_name, artist) pair.
df_grouped = df.groupby(["track_name", "artist"], as_index=False).agg(
    {"interactions": "sum"}
)

# The positional row index doubles as the unique "song_id" of each pair.
df_grouped = df_grouped.assign(song_id=lambda songs: songs.index)

# Item catalogue: everything except the aggregated interaction count.
result_df = df_grouped.drop(columns="interactions")
result_df.to_csv("items.csv", index=False)


# Attach "song_id" to every interaction row by joining on the
# (track_name, artist) pair.
merged_df = pd.merge(df, result_df, on=["track_name", "artist"])

# User x song matrix of interaction counts; pairs that never occurred
# become 0 instead of NaN.
pivot_df = merged_df.pivot(
    index="user_id", columns="song_id", values="interactions"
).fillna(0)
def interactions_to_relevency(value, threshold=10):
    """Map an interaction count to a binary relevance label.

    Args:
        value: Number of interactions for a single (user, song) cell.
        threshold: Counts strictly greater than this are relevant.
            Defaults to 10, the cut-off that was previously hard-coded.

    Returns:
        1 if ``value > threshold``, otherwise 0. A NaN value compares as
        not greater and therefore yields 0.
    """
    return 1 if value > threshold else 0

# Binarise the matrix: a cell is relevant (1) when the user has more than 10
# interactions with the song, else 0 — the same rule as
# interactions_to_relevency. A vectorised comparison replaces the per-cell
# DataFrame.applymap call, which is deprecated in recent pandas and slow on a
# matrix this wide.
pivot_df = pivot_df.gt(10).astype("int64")

# Drop users (rows) that have no relevant song at all.
pivot_df = pivot_df.loc[pivot_df.any(axis=1)]
# 80% of the rows (users) form the training set; the other 20% are held out.
train_df, temp_df = train_test_split(pivot_df, random_state=42, test_size=0.2)
train_df.to_csv("user_interactions_train.csv")

# The hold-out is halved into validation and test rows (10% each overall).
valid_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Validation rows are halved again into a "tr" and a "te" part.
valid_df_tr, valid_df_te = train_test_split(
    valid_df, random_state=42, test_size=0.5
)
valid_df_tr.to_csv("user_interactions_validation_tr.csv")
valid_df_te.to_csv("user_interactions_validation_te.csv")

# Test rows get the same "tr" / "te" halving.
test_df_tr, test_df_te = train_test_split(
    test_df, random_state=42, test_size=0.5
)
test_df_tr.to_csv("user_interactions_test_tr.csv")
test_df_te.to_csv("user_interactions_test_te.csv")


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  pivot_df=pivot_df.applymap(interactions_to_relevency)


In [None]:
# Read back the user attribute table written earlier as users.csv.
# NOTE(review): this rebinds `temp_df`, which the split cell uses for the 20%
# hold-out frame — consider a dedicated name.
temp_df = pd.read_csv(filepath_or_buffer='./users.csv')

In [6]:
train_df

song_id,0,1,2,3,4,5,6,7,8,9,...,45749,45750,45751,45752,45753,45754,45755,45756,45757,45758
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
440,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
865,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2903,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4508,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3273,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2211,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2282,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2631,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
train_df

song_id,0,1,2,3,4,5,6,7,8,9,...,45749,45750,45751,45752,45753,45754,45755,45756,45757,45758
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1555,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3936,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3431,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3787,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2942,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2987,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1966,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2043,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
