In [1]:
# Data handling and numerics.
import pandas as pd
import numpy as np
# Plotting and graph libraries.
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

import joblib
# Load the interaction table from a joblib file in the working directory.
# NOTE(review): joblib files are pickle-based; only load this from a trusted source.
small_matrix_with_metadata = joblib.load('small_matrix_with_metadata.joblib')
# Print the loaded table.
print(small_matrix_with_metadata)

         user_id  video_id  play_duration  video_duration  \
0             14       148           4381            6067   
1             14       183          11635            6100   
2             14      3649          22422           10867   
3             14      5262           4479            7908   
4             14      8234           4602           11000   
...          ...       ...            ...             ...   
4269394     7162      9177           5315           37205   
4269395     7162      4987          10085            8167   
4269396     7162      7988          50523           49319   
4269397     7162      6533           2190            8000   
4269398     7162      6523          11909            7255   

                            time  watch_ratio    categories  category_count  \
0        2020-07-05 05:27:48.378     0.722103  [11, 28, 19]               3   
1        2020-07-05 05:28:00.057     1.907377          [28]               1   
2        2020-07-05 05:29:09.4

In [2]:
# Per-user engagement statistics: one row per user_id.
user_features = small_matrix_with_metadata.groupby('user_id').agg(
    video_count=('video_id', 'count'),
    avg_watch_ratio=('watch_ratio', 'mean'),
    total_play_time=('play_duration', 'sum'),
    # Number of interactions with watch_ratio >= 1. The vectorised
    # Series.sum() replaces the builtin sum(), which walked every group
    # element by element in Python.
    complete_views=('watch_ratio', lambda x: (x >= 1).sum()),
    avg_play_progress=('avg_play_progress', 'mean')
).reset_index()

# Per-user category preferences.
# First explode the 'categories' lists so each (interaction, category) pair
# gets its own row.
user_categories = small_matrix_with_metadata.explode('categories')
# Then count the occurrences of each category for each user.
category_preferences = user_categories.groupby(['user_id', 'categories']).size().reset_index(name='category_count')
# Pivot to one row per user and one column per category. Each
# (user_id, categories) pair occurs exactly once in category_preferences, so
# pivot_table's default 'mean' aggregation passes the counts through
# unchanged; pairs that never occur are filled with 0.
category_matrix = category_preferences.pivot_table(
    index='user_id',
    columns='categories',
    values='category_count',
    fill_value=0
)
print(category_matrix)

# Rename the columns from raw category ids to 'category_<id>'.
category_matrix.columns = [f'category_{int(col)}' for col in category_matrix.columns]
# Merge the category counts into the per-user features.
user_features = user_features.merge(category_matrix.reset_index(), on='user_id', how='left')
print(user_features)

categories    0      1     2    3     4     5      6      7      8      9   \
user_id                                                                      
14           9.0  156.0  28.0  4.0  36.0  96.0  407.0  199.0  490.0  314.0   
19          10.0  161.0  28.0  5.0  36.0  98.0  396.0  203.0  486.0  313.0   
21          11.0  162.0  28.0  7.0  37.0  96.0  392.0  204.0  486.0  306.0   
23          10.0  147.0  24.0  3.0  34.0  84.0  329.0  198.0  423.0  287.0   
24          11.0  141.0  24.0  5.0  32.0  83.0  356.0  186.0  410.0  284.0   
...          ...    ...   ...  ...   ...   ...    ...    ...    ...    ...   
7142         9.0  155.0  27.0  6.0  33.0  93.0  401.0  201.0  485.0  311.0   
7147         9.0  161.0  30.0  4.0  35.0  98.0  396.0  205.0  494.0  299.0   
7153        10.0  157.0  28.0  4.0  36.0  96.0  398.0  200.0  486.0  309.0   
7159         7.0  153.0  27.0  5.0  33.0  94.0  396.0  198.0  476.0  314.0   
7162        10.0  147.0  26.0  6.0  34.0  94.0  390.0  187.0  48

In [3]:
# Per-video popularity features: one row per video_id.
video_features = small_matrix_with_metadata.groupby('video_id').agg(
    view_count=('user_id', 'count'),
    avg_watch_ratio=('watch_ratio', 'mean'),
    # Share of interactions with watch_ratio >= 1. The mean of the boolean
    # mask equals sum(mask) / len(mask) but stays vectorised instead of
    # iterating each group in Python.
    complete_view_ratio=('watch_ratio', lambda x: (x >= 1).mean()),
    # 'first' is used because these values are expected to be constant per
    # video (per the original author's note).
    like_engagement=('total_likes', 'first'),
    comment_engagement=('total_comments', 'first'),
    share_engagement=('total_shares', 'first'),
    daily_popularity=('avg_daily_likes', 'first')
).reset_index()

# Min-max scale the popularity features. The scaled values overwrite the raw
# columns in place, so 'view_count' etc. no longer hold raw counts after this
# point.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
popularity_features = ['view_count', 'like_engagement', 'comment_engagement', 'share_engagement','daily_popularity']
video_features[popularity_features] = scaler.fit_transform(video_features[popularity_features])
print(video_features)

      video_id  view_count  avg_watch_ratio  complete_view_ratio  \
0          103    0.862617         0.804232             0.211221   
1          109    0.937093         1.064075             0.473004   
2          120    0.954447         1.364567             0.684839   
3          122    0.895155         0.862472             0.253779   
4          128    0.947939         0.796798             0.188722   
...        ...         ...              ...                  ...   
3322     10506    0.955893         1.238260             0.573453   
3323     10519    0.963847         1.522804             0.784024   
3324     10552    0.890817         0.333647             0.025580   
3325     10589    0.953001         1.089004             0.379955   
3326     10595    0.952278         1.334487             0.665419   

      like_engagement  comment_engagement  share_engagement  daily_popularity  
0            0.026932            0.000902          0.009634          0.002985  
1            0.010807  

In [4]:
# Attach the per-user and per-video feature tables to every interaction row,
# then derive a custom user/video interaction feature.
#
# Column-name clashes are resolved by the merge suffixes: the left-hand column
# keeps its name and the right-hand one gets '_user' / '_video'. Both
# user_features and video_features carry an 'avg_watch_ratio' column, so after
# the second merge the unsuffixed 'avg_watch_ratio' is the one that was already
# present on the left and the per-video average becomes 'avg_watch_ratio_video'.
# NOTE(review): this makes the affinity denominator the per-user average,
# provided the raw table has no 'avg_watch_ratio' column of its own - confirm.
interactions_with_features = (
    small_matrix_with_metadata
    .merge(user_features, how='left', on='user_id', suffixes=('', '_user'))
    .merge(video_features, how='left', on='video_id', suffixes=('', '_video'))
    .assign(
        # Ratio of this interaction's watch_ratio to the merged average.
        user_video_affinity=lambda frame: frame['watch_ratio'] / frame['avg_watch_ratio']
    )
)
print(interactions_with_features)

         user_id  video_id  play_duration  video_duration  \
0             14       148           4381            6067   
1             14       183          11635            6100   
2             14      3649          22422           10867   
3             14      5262           4479            7908   
4             14      8234           4602           11000   
...          ...       ...            ...             ...   
4269394     7162      9177           5315           37205   
4269395     7162      4987          10085            8167   
4269396     7162      7988          50523           49319   
4269397     7162      6533           2190            8000   
4269398     7162      6523          11909            7255   

                            time  watch_ratio    categories  category_count  \
0        2020-07-05 05:27:48.378     0.722103  [11, 28, 19]               3   
1        2020-07-05 05:28:00.057     1.907377          [28]               1   
2        2020-07-05 05:29:09.4

In [5]:
# Build the user x video interaction matrix.
from scipy.sparse import csr_matrix

# Dense pivot: rows are user_id, columns are video_id, cells hold watch_ratio.
# pivot_table's default aggregation is the mean, so repeated (user, video)
# pairs would be averaged; pairs with no interaction are filled with 0.
user_item_matrix = pd.pivot_table(
    small_matrix_with_metadata,
    values='watch_ratio',
    index='user_id',
    columns='video_id',
    fill_value=0,
)

# Sparse (CSR) copy of the dense pivot; zero cells are not stored.
user_item_sparse = csr_matrix(user_item_matrix.to_numpy())

# Label -> positional index lookups, for translating matrix rows/columns back
# to user and video ids later on.
user_index = dict(zip(user_item_matrix.index, range(len(user_item_matrix.index))))
item_index = dict(zip(user_item_matrix.columns, range(len(user_item_matrix.columns))))

In [6]:
# Show the sparse matrix, then the user and item label -> position lookups.
print(user_item_sparse)
print(user_index)
print(item_index)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4269399 stored elements and shape (1411, 3327)>
  Coords	Values
  (0, 0)	0.429126213592233
  (0, 1)	1.482039397450753
  (0, 2)	0.7287378330939843
  (0, 3)	0.4778097390589685
  (0, 4)	0.4393329499712478
  (0, 5)	1.1502445842068485
  (0, 6)	1.8163166004379208
  (0, 7)	0.7811217510259918
  (0, 8)	1.5543964232488825
  (0, 9)	2.3074857043839887
  (0, 10)	0.4628717948717948
  (0, 11)	1.2699900738943422
  (0, 12)	1.2418279569892474
  (0, 13)	1.2871640459462474
  (0, 14)	0.3054767015163054
  (0, 15)	0.7221031811438932
  (0, 16)	1.0677795908121
  (0, 18)	1.883699402507784
  (0, 19)	2.6829807388695928
  (0, 20)	0.7048090304623473
  (0, 21)	2.00125
  (0, 22)	0.9491173416407062
  (0, 23)	0.8829732773625474
  (0, 24)	0.8064637354378054
  (0, 25)	1.6327391221008245
  :	:
  (1410, 3299)	1.43193849256365
  (1410, 3300)	0.5029257123473542
  (1410, 3301)	1.460816611760008
  (1410, 3302)	1.623469982514086
  (1410, 3303)	0.8524003720397796
  (1

In [7]:
# Temporal features (only when the interaction table has a 'time' column).
if 'time' in small_matrix_with_metadata.columns:
    # Parse the timestamps into a separate 'datetime' column.
    small_matrix_with_metadata['datetime'] = pd.to_datetime(small_matrix_with_metadata['time'])

    # Extract calendar components; dayofweek is 0 = Monday ... 6 = Sunday,
    # so 5 and 6 mark the weekend.
    small_matrix_with_metadata['hour'] = small_matrix_with_metadata['datetime'].dt.hour
    small_matrix_with_metadata['day_of_week'] = small_matrix_with_metadata['datetime'].dt.dayofweek
    small_matrix_with_metadata['is_weekend'] = small_matrix_with_metadata['day_of_week'].isin([5, 6]).astype(int)

    # Per-user share of interactions falling in each part of the day, plus the
    # weekend share. The flags are computed once for the whole table and then
    # averaged per user; the mean of a 0/1 flag equals sum(flag) / len(flag),
    # without iterating each group in Python.
    interaction_hour = small_matrix_with_metadata['hour']
    day_part_flags = pd.DataFrame({
        'user_id': small_matrix_with_metadata['user_id'],
        'morning_ratio': (interaction_hour >= 6) & (interaction_hour < 12),
        'afternoon_ratio': (interaction_hour >= 12) & (interaction_hour < 18),
        'evening_ratio': (interaction_hour >= 18) & (interaction_hour < 22),
        'night_ratio': (interaction_hour >= 22) | (interaction_hour < 6),
        'weekend_ratio': small_matrix_with_metadata['is_weekend'],
    })
    time_features = day_part_flags.groupby('user_id').mean().reset_index()

    # Merge into the per-user features. Columns left over from a previous run
    # of this cell are dropped first, so re-running does not create
    # '_x' / '_y' duplicates.
    time_feature_columns = [col for col in time_features.columns if col != 'user_id']
    user_features = (
        user_features
        .drop(columns=time_feature_columns, errors='ignore')
        .merge(time_features, on='user_id', how='left')
    )

# Trend features: recent vs. older popularity (only when a 'date' column exists).
if 'date' in small_matrix_with_metadata.columns:
    # NOTE(review): if 'date' is stored as an integer such as YYYYMMDD,
    # pd.to_datetime would read it as an epoch offset - confirm the raw format.
    small_matrix_with_metadata['date'] = pd.to_datetime(small_matrix_with_metadata['date'])

    # The median date splits interactions into "old" and "recent".
    median_date = small_matrix_with_metadata['date'].median()

    # 1 when strictly after the median date, otherwise 0.
    small_matrix_with_metadata['is_recent'] = (small_matrix_with_metadata['date'] > median_date).astype(int)

    # Interaction counts per video, split by is_recent. Reindexing to [0, 1]
    # guarantees both columns exist even when one value never occurs, which
    # would otherwise make the two-name column assignment below fail.
    popularity_trend = (
        small_matrix_with_metadata
        .groupby(['video_id', 'is_recent'])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=[0, 1], fill_value=0)
    )
    popularity_trend.columns = ['old_popularity', 'recent_popularity']
    # The +1 in the denominator avoids division by zero for videos with no
    # "old" interactions.
    popularity_trend['popularity_trend'] = popularity_trend['recent_popularity'] / (popularity_trend['old_popularity'] + 1)

    # Merge into the per-video features, again dropping stale columns from a
    # previous run so the cell can be re-executed safely.
    video_features = (
        video_features
        .drop(columns=list(popularity_trend.columns), errors='ignore')
        .merge(popularity_trend.reset_index(), on='video_id', how='left')
    )

In [8]:
# Show the per-user feature table.
print(user_features)

# Show the per-video feature table.
print(video_features)

      user_id  video_count  avg_watch_ratio  total_play_time  complete_views  \
0          14         3130         1.038889         30588221            1282   
1          19         3127         0.861937         24805452             912   
2          21         3105         0.993632         28406535            1211   
3          23         2842         1.153308         31042465            1053   
4          24         2737         0.991386         25510680             858   
...       ...          ...              ...              ...             ...   
1406     7142         3086         0.818212         23395716             813   
1407     7147         3113         0.964179         27416751            1285   
1408     7153         3049         0.853446         23665449             731   
1409     7159         3048         0.819445         23214871             951   
1410     7162         3006         1.122409         31412756            1589   

      avg_play_progress  category_0  ca

In [9]:
# Persist every engineered artefact for the next stage.
import scipy.sparse as sp
import pickle

# Feature tables are written as CSV without the DataFrame index.
csv_outputs = {
    'user_features_engineered.csv': user_features,
    'video_features_engineered.csv': video_features,
    'interactions_with_features.csv': interactions_with_features,
}
for csv_path, feature_frame in csv_outputs.items():
    feature_frame.to_csv(csv_path, index=False)

# Sparse interaction matrix, for the collaborative-filtering algorithms.
sp.save_npz('user_item_matrix_sparse.npz', user_item_sparse)

# Label -> position lookups that go with the sparse matrix.
index_mappings = {'user_index': user_index, 'item_index': item_index}
with open('user_item_indices.pkl', 'wb') as f:
    pickle.dump(index_mappings, f)