In [1]:
import os
import requests
import pandas as pd

# Names of the Tournesol rating criteria. Each name is matched against the
# "criteria" field of a video's criteria_scores entries and becomes one score
# column of the dataframe built by api_get_tournesol_scores().
CRITERIA = [
    "largely_recommended",
    "reliability",
    "importance",
    "engaging",
    "pedagogy",
    "layman_friendly",
    "entertaining_relaxing",
    "better_habits",
    "diversity_inclusion",
    "backfire_risk",
]

def get_score(row, crit):
    """Look up the score recorded for one criterion in a video row.

    Scans ``row["criteria_scores"]`` (an iterable of mappings holding a
    ``"criteria"`` name and a ``"score"`` value) and returns the ``"score"``
    of the first entry whose ``"criteria"`` equals ``crit``.

    Returns ``None`` when no entry matches.
    """
    matching_scores = (
        entry["score"]
        for entry in row["criteria_scores"]
        if entry["criteria"] == crit
    )
    # The first match wins; fall back to None when the criterion is absent.
    return next(matching_scores, None)

def api_get_tournesol_scores():
    """Return a dataframe with one row per Tournesol video and one column per criterion.

    The result is cached in ``./true_scores.feather``: when that file exists it
    is read and returned without any network access. Otherwise the video list
    is downloaded from the Tournesol API, every entry of ``CRITERIA`` is pulled
    out of the ``criteria_scores`` field into its own column, the unused
    columns are dropped and the dataframe is written to the cache file.

    Returns:
        pandas.DataFrame: the cached or freshly downloaded scores.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            API answers with an HTTP error status.
    """
    true_scores_path = "./true_scores.feather"
    if os.path.exists(true_scores_path):
        return pd.read_feather(true_scores_path)

    response = requests.get(
        "https://api.tournesol.app/video/?limit=9999&unsafe=true",
        # Without a timeout, requests waits indefinitely on a stalled connection.
        timeout=60,
    )
    # Fail loudly on a 4xx/5xx answer instead of hitting an opaque KeyError on
    # "results" below.
    response.raise_for_status()
    df = pd.DataFrame.from_dict(response.json()["results"])

    # Flatten the nested criteria_scores list into one column per criterion.
    for crit in CRITERIA:
        df[crit] = df.apply(lambda x: get_score(x, crit), axis=1)

    # Drop the free-text and bookkeeping columns, as well as the nested
    # criteria_scores list that has just been flattened; every other column
    # (uid, publication_date, views, language, duration, the criteria, ...)
    # is kept.
    df = df.drop(
        [
            'name',
            'description',
            'uploader',
            'video_id',
            'rating_n_ratings',
            'rating_n_contributors',
            'criteria_scores',
        ],
        axis=1,
    )

    df.to_feather(true_scores_path)
    return df

In [2]:
# Load the Tournesol scores: read from the local feather cache when it exists,
# otherwise downloaded from the API and cached.
df = api_get_tournesol_scores()

In [3]:
# Show the scores dataframe as the cell output.
df

Unnamed: 0,uid,publication_date,views,language,duration,tournesol_score,largely_recommended,reliability,importance,engaging,pedagogy,layman_friendly,entertaining_relaxing,better_habits,diversity_inclusion,backfire_risk
0,yt:WPPPFqsECz0,2019-12-08T13:30:01Z,14053105.0,en,601.0,64.530014,64.530014,37.374056,54.912836,13.074208,53.062916,69.817297,38.526232,71.009608,30.390545,50.018398
1,yt:XhRbt3R41hs,2022-09-21T14:35:27Z,7387.0,fr,897.0,74.126637,74.126637,60.757514,65.313911,35.098801,50.013407,41.956847,3.505430,73.329356,52.551105,9.933947
2,yt:F1Hq8eVOMHs,2021-11-30T15:01:34Z,7277893.0,en,728.0,71.893912,71.893912,47.696768,68.740991,23.289683,61.494861,66.457654,59.826584,76.613237,2.344735,-36.895233
3,yt:CHoXZO7WFDA,2022-09-09T16:00:15Z,906.0,fr,1351.0,55.163593,55.163593,25.886208,53.962325,56.735831,21.418215,42.698395,13.843014,43.827616,52.637715,38.269454
4,yt:5eW6Eagr9XA,2022-08-02T16:51:13Z,8128333.0,en,1079.0,55.190966,55.190966,45.523548,34.094963,13.985425,51.128767,52.516581,52.306238,36.173648,-2.770159,44.275093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,yt:B4_TG5jv1jY,2022-05-08,1333843.0,fr,633.0,-0.437447,-0.437447,-0.442897,-4.457344,-0.438145,-0.437086,0.000000,4.771152,-3.439854,-0.444741,-3.410529
9995,yt:0fGIGSgRnPg,2023-01-23T17:30:00Z,15392.0,fr,694.0,-4.902161,-4.902161,-18.096523,-12.419257,6.720593,6.841779,8.453078,,,,4.658793
9996,yt:GkU5W8aznQo,2016-05-04,708564.0,fr,405.0,0.000000,0.000000,0.000000,2.244094,-2.298223,0.000000,-4.057497,-4.648210,-2.296059,0.000000,2.308180
9997,yt:GzFG0Cdh8D8,2021-04-19,5533523.0,en,1273.0,,,-2.454155,-2.579469,-1.641094,0.139629,-0.278198,-1.087168,-0.473947,0.868065,-1.248271


In [None]:
import numpy as np
from youtube_dl import YoutubeDL
from pandarallel import pandarallel

metadata_path = "./true_scores_metadata.feather"
if os.path.isfile(metadata_path):
    # Metadata was already fetched by a previous run: reuse the cached dataframe.
    df = pd.read_feather(metadata_path)
else:
    # NOTE: pandarallel is not the right library here: the task is IO-bound, so
    # a multi-threading library would be a better fit (still to be found).
    pandarallel.initialize(nb_workers=20, progress_bar=True)

    def convert_yt_id_to_url(yt_id):
        """Convert 'yt:WPPPFqsECz0' to 'https://www.youtube.com/watch?v=WPPPFqsECz0'.

        Raises:
            ValueError: if ``yt_id`` does not start with ``"yt:"``.
        """
        if yt_id.startswith("yt:"):
            return f"https://www.youtube.com/watch?v={yt_id[3:]}"
        else:
            raise ValueError(f"{yt_id} is not a valid youtube id")

    def extract_info(uid, ydl):
        """Return ``(category, tags, like_count)`` for one video.

        Args:
            uid: video identifier of the form ``"yt:<id>"``.
            ydl: object exposing ``extract_info(url, download=False)`` (the
                ``YoutubeDL`` instance created in this cell).

        Returns:
            A tuple ``(first category, list of tags, like count)``. When the
            lookup fails or a field is missing, the corresponding placeholder
            from ``('', [], np.nan)`` is returned instead, so that one bad
            video never aborts the whole run.
        """
        try:
            info_dict = ydl.extract_info(convert_yt_id_to_url(uid), download=False)
        except Exception:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt / SystemExit can still stop the run.
            print(f"uid={uid}")
            return '', [], np.nan
        if not info_dict:
            # No metadata came back: treat it like a failed lookup.
            print(f"uid={uid}")
            return '', [], np.nan
        # Metadata fields may be absent, None or empty for some videos.
        categories = info_dict.get('categories') or ['']
        tags = info_dict.get('tags') or []
        like_count = info_dict.get('like_count')
        return categories[0], tags, np.nan if like_count is None else like_count

    ydl_opts = {
        'quiet': True,
        # Counter-intuitive: for errors NOT to interrupt the process, this must
        # be False. Presumably errors are then raised and caught in
        # extract_info above - TODO confirm against the youtube_dl docs.
        'ignoreerrors': False,
    }
    with YoutubeDL(ydl_opts) as ydl:
        # Rows still lacking metadata; on a first pass (no 'category' column
        # yet) this is every row that has a uid.
        idx = df['category'].isna() & df['tags'].isna() if 'category' in df.columns else ~df['uid'].isna()
        if idx.any():
            results = df[idx].parallel_apply(lambda x: extract_info(x['uid'], ydl), axis=1)
            pending_index = df.index[idx]
            # Assign index-aligned Series to a single column label. Passing a
            # list of lists (the tags) to a list-of-columns indexer would be
            # interpreted by pandas as a 2-D value.
            for position, column in enumerate(['category', 'tags', 'like_count']):
                column_values = pd.Series([r[position] for r in results], index=pending_index)
                df.loc[pending_index, column] = column_values
    df.to_feather(metadata_path)

# TODO: also collect the channel's subscriber count and the number of comments.

In [None]:
# Convert the publication date to a number of months since 1970-01-01 and add
# Gaussian noise to it (std = 1 month).

# Seeded generator so the noisy features are reproducible across runs; set the
# seed to None to draw fresh noise on every run.
NOISE_SEED = 0
noise_rng = np.random.default_rng(NOISE_SEED)

# Average length of a Gregorian month in days. np.timedelta64(1, 'M') is a
# non-linear unit that cannot be used to divide a nanosecond timedelta, so a
# fixed-length month is used instead.
DAYS_PER_MONTH = 365.2425 / 12

# publication_date mixes full UTC timestamps ("2019-12-08T13:30:01Z") with bare
# dates ("2022-05-08"). Parse each value on its own and normalise everything to
# UTC, so that mixed formats and mixed timezone-awareness cannot break the
# parsing or the subtraction below.
published = pd.to_datetime(
    df['publication_date'].map(lambda value: pd.to_datetime(value, utc=True)),
    utc=True,
)
epoch = pd.Timestamp('1970-01-01', tz='UTC')
df['date'] = (published - epoch) / pd.Timedelta(days=DAYS_PER_MONTH) + noise_rng.normal(0, 1, len(df))
df['date'] = df['date'].astype(float)
data = df.drop(columns=['publication_date'])

# Add noise to duration (std = 20 seconds) to avoid the channel always doing 10:00, ...
data['duration'] = data['duration'] + noise_rng.normal(0, 1, len(data)) * 20

# One-hot encoding of the channel, categories and tags, language, or project on a line, or embedding
# categorical ⇒ continuous : https://towardsdatascience.com/categorical-embeddings-with-catboost-9f87ceda76a2
# the less frequent values are encoded as 'other'

# add noise to number of views, likes, ... to avoid overfitting if multiple epochs
# data['views'] = data['views'] + np.random.normal(0, 1, len(data)) * 1000
# data['like_count'] = data['like_count'] + np.random.normal(0, 1, len(data)) * 50
# compute ratios/feature engineering BEFORE adding noise