In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Column names, in file order, for the listening-events table.
columns = [
    "user_id",
    "country",
    "age",
    "sex",
    "track_name",
    "artist",
    "timestamp",
]

# Explicit dtypes for every column except "timestamp", which has no entry here.
column_types = dict(
    user_id=np.int32,
    country=str,
    age=np.int32,
    sex=str,
    track_name=str,
    artist=str,
)

In [None]:
# Load the raw listening events (one row per play).
# - The file has no header row, so names come from `columns` (defined in the
#   configuration cell) instead of a second, hand-copied list that could drift.
# - nrows caps the read at 150M rows.
# - on_bad_lines='skip' silently drops malformed lines, so the resulting row
#   count may be lower than the number of lines read.
df = pd.read_table(
    '/data/LFM-2b.tsv',
    nrows=150000000,
    header=None,
    names=columns,
    dtype=column_types,
    parse_dates=['timestamp'],
    on_bad_lines='skip',
)

In [None]:
# Collapse the raw events to one row per (user, track, artist). The number of
# non-null timestamps in each group becomes "interactions"; the user attributes
# are taken from the first value in the group.
df = (
    df.groupby(["user_id", "track_name", "artist"], as_index=False)
    .agg({"timestamp": "count", "country": "first", "sex": "first", "age": "first"})
    .rename(columns={"timestamp": "interactions"})
)

# Accepted demographic values and popularity thresholds.
valid_sexes = ["m", "f", "n"]
valid_age_range = (0, 80)  # both bounds inclusive (Series.between default)
min_track_interactions = 100  # minimum total plays for a (track, artist) pair
min_user_interactions = 10  # minimum total plays for a user

valid_rows = df["sex"].isin(valid_sexes) & df["age"].between(
    valid_age_range[0], valid_age_range[1]
)
df = df[valid_rows]

# Treat an empty country string as missing and drop those rows. This is done
# by reassignment rather than `df["country"].replace(..., inplace=True)`: the
# chained in-place form operates on a temporary Series and does not update
# `df` under pandas copy-on-write.
df = df.assign(country=df["country"].replace("", np.nan)).dropna(subset=["country"])

# Keep only (track, artist) pairs with at least `min_track_interactions` plays
# in total across all remaining users.
track_artist_counts = df.groupby(["track_name", "artist"])["interactions"].transform(
    "sum"
)
df = df[track_artist_counts >= min_track_interactions]

# Keep only users with at least `min_user_interactions` plays in total across
# the remaining tracks. This runs after the track filter, so a track's total
# may fall below its threshold again once users are removed.
user_id_counts = df.groupby("user_id")["interactions"].transform("sum")
df = df[user_id_counts >= min_user_interactions]

# No drop_duplicates is needed here: the groupby above already yields exactly
# one row per (user_id, track_name, artist), and the filters only remove rows.

# Build the per-user attribute table and write it to users.csv.

# Countries with this many users or fewer are merged into the "OTHER" bucket.
min_users_per_country = 500

# One row per user. Deduplicating on user_id (rather than on the full attribute
# tuple) keeps the index unique even if a user appears with inconsistent
# attributes; in that case the first occurrence wins.
user_data = (
    df[["user_id", "age", "sex", "country"]]
    .drop_duplicates(subset="user_id")
    .set_index("user_id")
)

# Separate encoders so each mapping can be inverted independently later.
sex_encoder = LabelEncoder()
country_encoder = LabelEncoder()

user_data["sex_encoded"] = sex_encoder.fit_transform(user_data["sex"])

# Number of users per country.
country_counts = user_data["country"].value_counts()

# Countries that have more than `min_users_per_country` users.
selected_countries = country_counts[
    country_counts > min_users_per_country
].index.tolist()

# Replace every other country with the literal "OTHER" (vectorised instead of a
# per-row membership test against a Python list).
user_data["country"] = user_data["country"].where(
    user_data["country"].isin(selected_countries), "OTHER"
)

user_data["country_encoded"] = country_encoder.fit_transform(user_data["country"])
user_data.to_csv("users.csv")


# Build the item catalogue: one row per (track_name, artist) pair.
song_totals = df.groupby(["track_name", "artist"])["interactions"].sum()
df_grouped = song_totals.reset_index()

# The positional row index of the catalogue serves as the integer "song_id".
df_grouped["song_id"] = df_grouped.index

# Only the identifying columns are written out; the summed interactions are
# left behind in df_grouped.
result_df = df_grouped.loc[:, ["track_name", "artist", "song_id"]]
result_df.to_csv("items.csv", index=False)


def _split_fold_in(interactions, rng):
    """Split a user x song matrix into a fold-in part and a held-out part.

    Every cell is independently assigned, with equal probability, to one of
    the two parts.

    Parameters
    ----------
    interactions : pandas.DataFrame
        Matrix of interaction counts (rows: users, columns: songs).
    rng : numpy.random.Generator
        Source of randomness; pass a seeded generator for reproducible output.

    Returns
    -------
    (fold_in, held_out) : tuple of pandas.DataFrame
        ``fold_in`` keeps the original counts in the cells assigned to it and
        0.0 elsewhere. ``held_out`` is 1.0 where a cell assigned to it has a
        positive count and 0.0 elsewhere. Both are float frames with the same
        index and columns as ``interactions``.
    """
    # int8 draws keep the temporary mask small for large matrices.
    to_fold_in = rng.integers(0, 2, size=interactions.shape, dtype=np.int8).astype(bool)
    fold_in = interactions.where(to_fold_in, 0).astype(float)
    held_out = ((interactions > 0) & ~to_fold_in).astype(float)
    return fold_in, held_out


# One seed for every random step so all output files are reproducible.
SEED = 42

# Attach "song_id" to every (user, track, artist) row.
merged_df = df.merge(result_df, on=["track_name", "artist"])

# Dense user x song matrix of interaction counts; pairs that never occurred
# become 0.
# NOTE(review): this is a dense matrix, so memory grows with users * songs.
pivot_df = merged_df.pivot(
    index="user_id", columns="song_id", values="interactions"
).fillna(0)

# Drop users whose row is entirely zero.
pivot_df = pivot_df.loc[~(pivot_df == 0).all(axis=1)]

# Split users into train (80%), validation (10%) and test (10%).
train_df, temp_df = train_test_split(pivot_df, test_size=0.2, random_state=SEED)
train_df.to_csv("user_interactions_train.csv")

valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED)

# A seeded generator replaces the unseeded np.random.choice calls, which made
# the *_tr / *_te files differ on every run.
rng = np.random.default_rng(SEED)

# Validation users: fold-in counts (tr) and binary held-out targets (te).
valid_df_tr, valid_df_te = _split_fold_in(valid_df, rng)
valid_df_tr.to_csv("user_interactions_validation_tr.csv")
valid_df_te.to_csv("user_interactions_validation_te.csv")

# Test users: same procedure, continuing the same random stream.
test_df_tr, test_df_te = _split_fold_in(test_df, rng)
test_df_tr.to_csv("user_interactions_test_tr.csv")
test_df_te.to_csv("user_interactions_test_te.csv")
