In [1]:
import os
import requests
import pandas as pd


def get_score(row, crit):
    """Return the score stored for criterion `crit` in `row`.

    `row["criteria_scores"]` is scanned in order and the `"score"` of the first
    entry whose `"criteria"` equals `crit` is returned. When no entry matches,
    the result is None.
    """
    matching_scores = (
        entry["score"]
        for entry in row["criteria_scores"]
        if entry["criteria"] == crit
    )
    return next(matching_scores, None)

def api_get_tournesol_scores():
    """Return a dataframe of all Tournesol videos, with one column per criterion.

    The result is cached on disk as a feather file. When the cache file exists
    it is read back and returned as is. Otherwise the videos are fetched from
    the Tournesol API, each video's nested ``criteria_scores`` list is flattened
    into one column per name in ``CRITERIA``, and the frame is written to the
    cache before being returned.

    Returns:
        pandas.DataFrame: one row per video returned by the API.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
        requests.Timeout: if the API does not answer within the timeout.
    """
    true_scores_path = "GNNRank/data/tournesol/true_scores.feather"
    if os.path.exists(true_scores_path):
        return pd.read_feather(true_scores_path)

    response = requests.get(
        "https://api.tournesol.app/video/?limit=9999&unsafe=true",
        timeout=60,  # never hang the notebook forever on a stalled connection
    )
    # Fail here with a clear HTTP error rather than later on a missing "results" key.
    response.raise_for_status()
    df = pd.DataFrame.from_dict(response.json()["results"])

    # NOTE(review): CRITERIA is not defined in this cell; it must exist before the
    # first call that misses the cache.
    for crit in CRITERIA:
        # Bind `crit` as a default so the lambda does not depend on the loop variable.
        df[crit] = df.apply(lambda x, crit=crit: get_score(x, crit), axis=1)

    df = df.drop(columns=["criteria_scores"])

    # TODO: keep only the columns [uid, publication_date, views, language, duration,
    # largely_recommended, reliability, importance, engaging, pedagogy, layman_friendly,
    # entertaining_relaxing, better_habits, diversity_inclusion, backfire_risk],
    # i.e. drop [name, description, uploader, video_id, rating_n_ratings, rating_n_contributors].

    # The cache directory may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(true_scores_path), exist_ok=True)
    df.to_feather(true_scores_path)
    return df

In [3]:
# Load the Tournesol video scores: read from the feather cache when it exists,
# otherwise fetched from the Tournesol API (and then cached).
df = api_get_tournesol_scores()

In [2]:
import numpy as np
from youtube_dl import YoutubeDL
from pandarallel import pandarallel

# Enrich `df` with YouTube metadata (category, tags, like count), cached as a feather file.
metadata_path = "GNNRank/data/tournesol/true_scores_metadata.feather"
if os.path.isfile(metadata_path):
    df = pd.read_feather(metadata_path)
else:
    # TODO: not the right library (find a multi-threading library!) because this task is IO-bound.
    pandarallel.initialize(nb_workers=20, progress_bar=True)

    def convert_yt_id_to_url(yt_id):
        """Convert 'yt:WPPPFqsECz0' to 'https://www.youtube.com/watch?v=WPPPFqsECz0'.

        Raises:
            ValueError: if `yt_id` does not start with 'yt:'.
        """
        if yt_id.startswith("yt:"):
            return f"https://www.youtube.com/watch?v={yt_id[3:]}"
        else:
            raise ValueError(f"{yt_id} is not a valid youtube id")

    def extract_info(uid, ydl):
        """Return `(category, tags, like_count)` for the video identified by `uid`.

        `ydl` is the extractor object whose `extract_info` method is queried.
        On any failure the uid is printed and the placeholder `('', [], np.nan)`
        is returned, so that one bad video does not abort the whole run.
        Missing or empty fields fall back to the same placeholder values.
        """
        try:
            info_dict = ydl.extract_info(convert_yt_id_to_url(uid), download=False)
        except Exception:
            # `Exception` rather than a bare `except`, so Ctrl-C still stops a long run.
            print(f"uid={uid}")
            return '', [], np.nan
        if not info_dict:
            print(f"uid={uid}")
            return '', [], np.nan
        categories = info_dict.get('categories') or ['']
        tags = info_dict.get('tags') or []
        like_count = info_dict.get('like_count')
        return categories[0], tags, np.nan if like_count is None else like_count

    ydl_opts = {
        'quiet': True,
        # Counter-intuitive: for errors NOT to interrupt the process, 'ignoreerrors' must be False.
        # NOTE(review): presumably the failure is then raised as an exception, which
        # extract_info catches - confirm against the youtube_dl documentation.
        'ignoreerrors': False,
    }
    with YoutubeDL(ydl_opts) as ydl:
        # Only query the rows that have no metadata yet (or every row with a uid on a first run).
        if 'category' in df.columns:
            idx = df['category'].isna() & df['tags'].isna()
        else:
            idx = ~df['uid'].isna()
        if idx.any():
            results = df[idx].parallel_apply(lambda x: extract_info(x['uid'], ydl), axis=1)
            for position, column in enumerate(('category', 'tags', 'like_count')):
                # Index-aligned Series: a plain list of tag lists would be interpreted as 2-D data.
                values = results.map(lambda r, position=position: r[position])
                if column in df.columns:
                    df.loc[idx, column] = values
                else:
                    df[column] = values
    # The cache directory may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(metadata_path), exist_ok=True)
    df.to_feather(metadata_path)

# TODO: number of subscribers of the channel; fetch the transcript for a complexity analysis...
# TODO: scrape the comments and the number of comments.

ModuleNotFoundError: No module named 'pandarallel'

In [None]:
# NOTE(review): `data` is not defined in the cells shown here; presumably it is the
# `df` built above - confirm before running this cell.

# Convert the publication date to a (fractional) number of months since 1970-01-01 and
# add noise to it (std = 1 month).
# - A month is spelled out as its mean Gregorian length (365.2425 / 12 = 30.436875 days),
#   because the ambiguous 'M' unit of np.timedelta64 is rejected by recent pandas versions.
# - utc=True keeps the subtraction valid whether or not the stored dates carry a timezone.
data['date'] = (
    pd.to_datetime(data['publication_date'], utc=True) - pd.Timestamp('1970-01-01', tz='UTC')
) / pd.Timedelta(days=30.436875) + np.random.normal(0, 1, len(data))
data['date'] = data['date'].astype(float)
data = data.drop(columns=['publication_date'])

# Add noise to duration (std = 20 seconds) to avoid the channel always doing 10:00, ...
data['duration'] = data['duration'] + np.random.normal(0, 1, len(data)) * 20

# One-hot encoding of the channel, categories and tags, language, or project on a line, or embedding
# categorical ⇒ continuous : https://towardsdatascience.com/categorical-embeddings-with-catboost-9f87ceda76a2
# the less frequent values are encoded as 'other'

# add noise to number of views, likes, ... to avoid overfitting if multiple epochs
# data['views'] = data['views'] + np.random.normal(0, 1, len(data)) * 1000
# data['like_count'] = data['like_count'] + np.random.normal(0, 1, len(data)) * 50
# compute ratios/feature engineering BEFORE adding noise