In [1]:
import pandas as pd
import pickle
from collections import defaultdict
import numpy as np
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict

In [2]:
rating_path = "../Data/Ciao/Dataset/rating.txt"

# Lecture ligne par ligne en splitant proprement
data = []
with open(rating_path, "r", encoding="utf-8") as file:
    for line in file:
        parts = line.strip().split("::::")
        if len(parts) >= 4:
            try:
                user = parts[0]
                product = parts[1]
                rating = float(parts[3])  # On prend le 4e champ (index 3)
                time = parts[5]
                data.append((user, product, rating, time))
            except ValueError:
                continue  # Ignorer les lignes corrompues

# Création du DataFrame
ratings_df = pd.DataFrame(data, columns=["UserID", "ProductID", "Rating", "time"])
ratings_df = ratings_df[ratings_df["Rating"] <= 50]

In [3]:
# ---------------------------
# 2. Création des mappings : user id -> index, product id -> index
# ---------------------------
user_ids = ratings_df['UserID'].unique().tolist()
prod_ids = ratings_df['ProductID'].unique().tolist()

user2idx = {uid: idx for idx, uid in enumerate(user_ids)}
prod2idx = {pid: idx for idx, pid in enumerate(prod_ids)}

# Nombre d'utilisateurs et d'items
num_users = len(user_ids)
num_prods = len(prod_ids)
print(f"Nombre d'utilisateurs: {num_users}, Nombre de produits: {num_prods}")

Nombre d'utilisateurs: 10984, Nombre de produits: 112802


In [4]:
trustnetwork_path = "../Data/Ciao/Dataset/trustnetwork.txt"

user_social = []
# Lecture du fichier trustnetwork avec le séparateur "::::"
social_adj_lists = {}
with open(trustnetwork_path, "r", encoding="utf-8") as f:
    for line in tqdm(f, total=145826):
        line = line.strip()
        if not line:
            continue  # ignorer les lignes vides
        # Séparer les utilisateurs avec "::::"
        users = line.split("::::")
        u, v, _ = users
        if u not in user_social:
            user_social.append(u)

  0%|          | 0/145826 [00:00<?, ?it/s]

In [5]:
# Filtrage si points dans le social mais pas dans le rating
user_to_drop = []
users_rating = np.unique(ratings_df["UserID"]).tolist()

for user in tqdm(users_rating, total=len(users_rating)):
    if user not in user_social:
        user_to_drop.append(user)

  0%|          | 0/10984 [00:00<?, ?it/s]

In [6]:
ratings_df = ratings_df[~ratings_df["UserID"].isin(user_to_drop)]

In [None]:
# trustnetwork_path = "../Data/Ciao/Dataset/trustnetwork.txt"

# # Lecture du fichier trustnetwork avec le séparateur "::::"
# social_adj_lists = {}
# with open(trustnetwork_path, "r", encoding="utf-8") as f:
#     for line in tqdm(f, total=145826):
#         line = line.strip()
#         if not line:
#             continue  # ignorer les lignes vides
#         # Séparer les utilisateurs avec "::::"
#         users = line.split("::::")
#         u, v, _ = users

#         if u not in user2idx or v not in user2idx:
#             continue

#         u_idx = user2idx[u]
#         v_idx = user2idx[v]

#         if u_idx not in social_adj_lists:
#             social_adj_lists[u_idx] = {v_idx}
#         else:
#             social_adj_lists[u_idx].add(v_idx)

In [7]:
# ---------------------------
# 5. Construction du graphe social (social_adj_lists)
# ---------------------------
# Chemin vers le fichier trustnetwork
trustnetwork_path = "../Data/Ciao/Dataset/trustnetwork.txt"

corrupted_users = []
# Lecture du fichier trustnetwork avec le séparateur "::::"
social_adj_lists = {}
with open(trustnetwork_path, "r", encoding="utf-8") as f:
    for line in tqdm(f, total=145826):
        line = line.strip()
        if not line:
            continue  # ignorer les lignes vides
        # Séparer les utilisateurs avec "::::"
        users = line.split("::::")
        u, v, _ = users
        
        # Filtrage si dasn le social mais pas dans le rating 
        if u not in user2idx:
            continue

        if v not in user2idx:
            if u not in corrupted_users:
                corrupted_users.append(u)
            continue
        
        u_idx = user2idx[u]
        v_idx = user2idx[v]

        if u_idx not in social_adj_lists:
            social_adj_lists[u_idx] = {v_idx}
        else:
            social_adj_lists[u_idx].add(v_idx)

  0%|          | 0/145826 [00:00<?, ?it/s]

In [8]:
#Filtrage des points dans le social mais pas rating
for key in corrupted_users:
    if key not in social_adj_lists:
        user2idx.pop(key, None)  # or use del if you're sure key exists

In [9]:
ratings_df = ratings_df[~ratings_df["UserID"].isin(corrupted_users)]

In [10]:
# ---------------------------
# 6. Détermination des ratings_list
# ---------------------------

ratings_values = np.unique(ratings_df["Rating"]).tolist()
ratings_dict = {rating: idx for idx, rating in enumerate(ratings_values)}

print(f"ratings_dict : {ratings_dict}")

ratings_dict : {0.0: 0, 10.0: 1, 20.0: 2, 25.0: 3, 30.0: 4, 35.0: 5, 40.0: 6, 45.0: 7, 50.0: 8}


In [11]:
# Mapping des ratings
ratings_df["Rating"] = ratings_df["Rating"].map(ratings_dict)

In [12]:
len(user2idx)

6211

In [12]:
# ---------------------------
# 3. Création des historiques pour utilisateurs et items
# ---------------------------
# On va stocker pour chaque utilisateur la liste des produits et des ratings
history_u_lists = {}
history_ur_lists = {}
# Pour chaque produit, on stocke la liste des utilisateurs et des ratings
history_v_lists = {}
history_vr_lists = {}

# On trie le DataFrame par temps (pour pouvoir séparer train/test par utilisateur)
ratings_df = ratings_df.sort_values(by='time')

# Pour construire les historiques, on parcourt chaque ligne
for _, row in ratings_df.iterrows():
    u = user2idx[row['UserID']]
    v = prod2idx[row['ProductID']]
    r = row['Rating']
    
    # Ajout dans l'historique utilisateur
    if u not in history_u_lists:
        history_u_lists[u] = [v]
        history_ur_lists[u] = [r]
    else:
        history_u_lists[u].append(v)
        history_ur_lists[u].append(r)
    
    # Ajout dans l'historique produit
    if v not in history_v_lists:
        history_v_lists[v] = [u]
        history_vr_lists[v] = [r]
    else:
        history_v_lists[v].append(u)
        history_vr_lists[v].append(r)

In [13]:
# ---------------------------
# 4. Séparation train/test
# ---------------------------
# Pour chaque utilisateur, on va prendre la dernière interaction pour le test
train_u, train_v, train_r = [], [], []
test_u, test_v, test_r = [], [], []

# Regroupons par utilisateur (à partir du DataFrame trié)
grouped = ratings_df.groupby('UserID')
for uid, group in grouped:
    group = group.sort_values(by='time')
    u_idx = user2idx[uid]
    interactions = group[['ProductID', 'Rating']].values  # chaque ligne: [product, rating]
    
    if len(interactions) == 0:
        continue
    # Le dernier devient test
    for prod, rating in interactions[:-1]:
        train_u.append(u_idx)
        train_v.append(prod2idx[prod])
        train_r.append(rating)
        
    # Le dernier pour test
    prod, rating = interactions[-1]
    test_u.append(u_idx)
    test_v.append(prod2idx[prod])
    test_r.append(rating)

print(f"Nombre d'interactions train: {len(train_u)}, test: {len(test_u)}")

Nombre d'interactions train: 74600, test: 6178


In [32]:
6131 not in history_u_lists

True

In [29]:
for t in np.unique(train_u).tolist():
    if t not in history_u_lists:
        print("error")
        break

In [27]:
for uid, group in grouped:
    group = group.sort_values(by='time')
    interactions = group[['ProductID', 'Rating']].values
    print(uid)
    print(group)

10269
       UserID                                       ProductID  Rating  \
254632  10269                                   lordofthe.net       6   
254633  10269                    Tomb Raider: Chronicles (PC)       1   
254626  10269                               The Goonies (DVD)       6   
254621  10269                                    Ciao Surveys       8   
254649  10269                          Command & Conquer (PC)       8   
254650  10269                   Half-Life: Counterstrike (PC)       6   
254648  10269                     AD&D Rulebook - 3rd Edition       6   
254647  10269                           Monstrous Manual MCC1       6   
254646  10269                                   Kill Dr Lucky       8   
254645  10269                              Cardiff University       8   
254644  10269                           The Sixth Sense (DVD)       2   
254643  10269      Tom Clancy's Rainbow Six: Rogue Spear (PC)       6   
254642  10269                                

In [16]:
# # ---------------------------
# # 5. Construction du graphe social (social_adj_lists)
# # ---------------------------
# # Chemin vers le fichier trustnetwork
# trustnetwork_path = "../Data/Ciao/Dataset/trustnetwork.txt"

# # Lecture du fichier trustnetwork avec le séparateur "::::"
# social_adj_lists = {}
# with open(trustnetwork_path, "r", encoding="utf-8") as f:
#     for line in tqdm(f, total=145826):
#         line = line.strip()
#         if not line:
#             continue  # ignorer les lignes vides
#         # Séparer les utilisateurs avec "::::"
#         users = line.split("::::")
#         u, v, _ = users
        
#         if u not in user2idx or v not in user2idx:
#              continue
        
#         u_idx = user2idx[u]
#         v_idx = user2idx[v]

#         if u_idx not in social_adj_lists:
#             social_adj_lists[u_idx] = {v_idx}
#         else:
#             social_adj_lists[u_idx].add(v_idx)

In [14]:
# Ordering dict
history_v_lists = dict(sorted(history_v_lists.items()))
history_vr_lists = dict(sorted(history_vr_lists.items()))

In [None]:
# ---------------------------
# 7. Création du dictionnaire final et sauvegarde
# ---------------------------
data_dict = {
    "history_u_lists": history_u_lists,
    "history_ur_lists": history_ur_lists,
    "history_v_lists": history_v_lists,
    "history_vr_lists": history_vr_lists,
    "train_u": train_u,
    "train_v": train_v,
    "train_r": train_r,
    "test_u": test_u,
    "test_v": test_v,
    "test_r": test_r,
    "social_adj_lists": social_adj_lists,
    "ratings_list": ratings_dict
}

# Sauvegarde en pickle
with open("../Data/Ciao/Dataset/processed_data.pkl", "wb") as f:
    pickle.dump(data_dict, f)

print("Transformation terminée. Les données ont été enregistrées dans 'processed_data.pkl'.")


Transformation terminée. Les données ont été enregistrées dans 'processed_data.pkl'.


In [16]:
keys = [k for k, v in user2idx.items() if v == 6715]
print(keys)

['5030783']


In [18]:
'5030783' in corrupted_users

False

In [17]:
social_adj_lists[6715]

KeyError: 6715