In [25]:
import os
import requests
import pandas as pd
import numpy as np
import pickle

CRITERIA = [
    "largely_recommended",
    "reliability",
    "importance",
    "engaging",
    "pedagogy",
    "layman_friendly",
    "entertaining_relaxing",
    "better_habits",
    "diversity_inclusion",
    "backfire_risk",
]

def get_score(row, crit):
    for item in row["criteria_scores"]:
        if item["criteria"] == crit:
            return item["score"]

true_scores_path = "./true_scores.feather"

def api_get_tournesol_scores():
    """Get a dataframe with all videos from tournesol.."""
    if os.path.exists(true_scores_path):
        return pd.read_feather(true_scores_path)
    else:
        response = requests.get(
            f"https://api.tournesol.app/video/?limit=20000&unsafe=true"
        ).json()
        df = pd.DataFrame.from_dict(response["results"])

        for crit in CRITERIA:
            df[crit] = df.apply(lambda x: get_score(x, crit), axis=1)

        # keep only columns [uid, publication_date, views, language,duration,largely_recommended,reliability,importance,engaging,pedagogy,layman_friendly,entertaining_relaxing,better_habits,diversity_inclusion,backfire_risk]
        # i.e., drop ['name', 'description', 'uploader', 'video_id', rating_n_ratings,rating_n_contributors, criteria_scores]

        df = df.drop(['name', 'description', 'uploader', 'video_id', 'rating_n_ratings', 'rating_n_contributors', 'criteria_scores'], axis=1)

        df.to_feather(true_scores_path)
        return df

In [26]:
df = api_get_tournesol_scores()

In [27]:
df

Unnamed: 0,uid,publication_date,views,language,duration,tournesol_score,largely_recommended,reliability,importance,engaging,pedagogy,layman_friendly,entertaining_relaxing,better_habits,diversity_inclusion,backfire_risk
0,yt:WPPPFqsECz0,2019-12-08T13:30:01Z,14053105.0,en,601.0,66.701176,66.701176,35.970920,59.795943,20.566453,59.321214,73.192768,40.141001,75.161422,35.165539,56.291224
1,yt:XhRbt3R41hs,2022-09-21T14:35:27Z,7387.0,fr,897.0,75.456470,75.456470,64.325034,67.152979,34.519085,52.753667,41.522293,3.831615,75.355569,57.935965,8.088080
2,yt:F1Hq8eVOMHs,2021-11-30T15:01:34Z,7277893.0,en,728.0,73.130258,73.130258,47.169886,71.973155,26.669174,65.454968,71.308568,61.440627,79.391979,10.774976,-43.828059
3,yt:CHoXZO7WFDA,2022-09-09T16:00:15Z,906.0,fr,1351.0,60.085803,60.085803,27.077277,57.981589,59.007726,14.919222,39.300481,15.246405,50.907638,55.404859,44.785123
4,yt:cebFWOlx848,2016-07-21T14:00:03Z,3558310.0,en,505.0,54.353129,54.353129,32.474433,48.056419,36.660664,41.422048,55.370706,57.519469,64.168029,-25.580968,44.967269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19606,yt:Qrbw7ippuD0,2022-01-01,4078.0,fr,461.0,-58.647835,-58.647835,-60.127642,-47.304960,-29.625536,-63.358527,-28.352092,-53.621938,-53.105238,-56.787566,-51.500229
19607,yt:_CSMVQXyjU4,2022-02-06,2442.0,en,116.0,-72.111964,-72.111964,15.595034,-71.013476,-71.872018,-61.107881,-64.946550,-58.393954,-68.187890,-63.848142,-13.396379
19608,yt:-VdXi2LMPyE,2021-12-19,62743.0,fr,794.0,-70.022193,-70.022193,-58.061254,-69.487782,-53.109897,-65.016206,-58.251880,-47.178260,-69.889333,-30.211972,-13.005777
19609,yt:WzvnHbTH0v8,2020-09-08,914540.0,fr,675.0,-58.029473,-58.029473,-62.468430,-52.826996,-59.885927,-64.478566,-48.061383,-55.069572,-57.099281,-44.136548,-58.824974


In [28]:
# import numpy as np
# from youtube_dl import YoutubeDL
# from pandarallel import pandarallel
#
# metadata_path = "./true_scores_metadata.feather"
# if os.path.isfile(metadata_path):
#     df = pd.read_feather(metadata_path)
# else:
#     # pas la bonne librairie (trouver une librairie de multi-threading !) pcq ma tâche est IO-bound.
#     pandarallel.initialize(nb_workers=20, progress_bar=True)
#
#     def convert_yt_id_to_url(yt_id):
#         """convert 'yt:WPPPFqsECz0' to 'https://www.youtube.com/watch?v=WPPPFqsECz0'"""
#         if yt_id.startswith("yt:"):
#             return f"https://www.youtube.com/watch?v={yt_id[3:]}"
#         else:
#             raise ValueError(f"{yt_id} is not a valid youtube id")
#
#     def extract_info(uid, ydl):
#         try:
#             info_dict = ydl.extract_info(convert_yt_id_to_url(uid), download=False)
#         except:
#             print(f"uid={uid}")
#             return '', [], np.nan
#         return info_dict['categories'][0], info_dict['tags'], info_dict['like_count']
#
#     ydl_opts = {
#         'quiet': True,
#         'ignoreerrors': False,  # ABSURDE : POUR QUE LES ERREURS N'INTERROMPT PAS LE PROCESSUS, IL FAUT DIRE IGNORE_ERRORS = FALSE !?!?
#     }
#     with YoutubeDL(ydl_opts) as ydl:
#         idx = df['category'].isna() & df['tags'].isna() if 'category' in df.columns else ~df['uid'].isna()
#         results = df[idx].parallel_apply(lambda x: extract_info(x['uid'], ydl), axis=1)
#         df.loc[idx, ['category']] = [r[0] for r in results]
#         df.loc[idx, ['tags']] = [r[1] for r in results]
#         df.loc[idx, ['like_count']] = [r[2] for r in results]
#     df.to_feather(metadata_path)
#
# # todo : nb d'abonnés de la chaîne, nb de commentaires.
# todo : this code no more working

# Features

In [None]:
# convert date to nb of monthes since 1970
df['date'] = pd.to_datetime(df['publication_date'].str[:10]).apply(lambda x: x - pd.to_datetime('1970-01-01')) / np.timedelta64(1, 'M')
df['date'] = df['date'].astype(float)
df = df.drop(columns=['publication_date'])
df['uid'] = df['uid'].str[3:]

In [33]:
more_than = 2
for criterion in CRITERIA:
    # get dict of video ids for the criterion
    with open(f'./video_ids_{criterion}_{more_than}.pickle', 'rb') as handle:
        video_ids = pickle.load(handle)  # Dict[video_id: str] = index in np array
    video_ids_set = set(video_ids.keys())
    df_criteria = df[df['uid'].isin(video_ids_set)]

    # re-order according to video_ids index
    df_criteria['index'] = df_criteria['uid'].apply(lambda x: video_ids[x])
    df_criteria = df_criteria.sort_values(by='index')

    # replace column scores for this criterion by the rank of the videos according to this criterion
    df_criteria[criterion] = df_criteria[criterion].rank(method='dense', ascending=False).astype(int)

    # export to x.npy
    np.save(f'x_{criterion}_mt{more_than}.npy', df_criteria[['date', 'views', 'duration']].to_numpy())

    # export y.npy
    np.save(f'y_{criterion}_mt{more_than}.npy', df_criteria[criterion].to_numpy())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_criteria['index'] = df_criteria['uid'].apply(lambda x: video_ids[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_criteria['index'] = df_criteria['uid'].apply(lambda x: video_ids[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_criteria['index'] = df_criteria['uid'].apply(lambda x: vid

In [31]:
criterion

'backfire_risk'

In [30]:
df_criteria

Unnamed: 0,uid,views,language,duration,tournesol_score,largely_recommended,reliability,importance,engaging,pedagogy,layman_friendly,entertaining_relaxing,better_habits,diversity_inclusion,backfire_risk,date,index
16355,-1FvAEaE0fc,1723067.0,de,467.0,-21.132857,-21.132857,-25.958183,-20.739765,-23.739011,-17.145341,23.326377,1.483756,-3.290241,12.775827,6147.0,606.895419,0
15642,-3Dn7coSFQc,2970.0,fr,150.0,-1.517643,-1.517643,6.152152,17.394409,34.420948,-26.939165,-41.492335,-24.265193,-8.443612,-1.186827,6775.0,605.646933,1
16029,-4qVv1tzZDU,535630.0,fr,253.0,-20.820962,-20.820962,-19.095467,-17.543920,-25.107167,-11.778914,-18.182725,-3.347612,-20.062409,18.383015,1162.0,592.932093,2
17887,-5_qq4hX0eQ,11028.0,fr,949.0,-44.566316,-44.566316,-0.458301,-47.509255,-49.833980,20.312486,7.990917,32.229806,-48.764870,-27.751775,4647.0,626.378365,3
9366,-6q04I3xdaQ,25653.0,fr,943.0,-23.934877,-23.934877,-34.970147,25.941889,28.296046,-32.810149,30.215488,-23.886093,26.465291,34.713555,6321.0,627.725415,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18268,zygtkJfNkJg,366541.0,fr,7405.0,-30.989888,-30.989888,1.338272,-39.797626,-32.320081,-50.162639,-37.420809,-18.282531,-7.980385,38.909652,4827.0,629.926693,7476
18355,zyiBUZJDYmc,260115.0,fr,2511.0,-4.679709,-4.679709,-32.022317,-10.227579,-4.207110,-34.821470,-31.006468,-24.849249,-21.998123,-15.517603,5042.0,626.509785,7477
14992,zysL_lkdtys,5772278.0,en,539.0,28.769200,28.769200,-26.702920,20.188369,-23.536769,-24.584375,-4.051361,-27.118440,12.781870,-1.274921,4703.0,624.308507,7478
16423,zyzlVdLChXM,14001.0,fr,2816.0,-23.425994,-23.425994,-11.862752,-19.668288,-21.989009,-29.363019,10.554801,1.605010,-26.732743,21.463659,4140.0,633.409310,7479


In [32]:
video_ids

{'-1FvAEaE0fc': 0,
 '-3Dn7coSFQc': 1,
 '-4qVv1tzZDU': 2,
 '-5_qq4hX0eQ': 3,
 '-6q04I3xdaQ': 4,
 '-79HGfWmH_w': 5,
 '-8Q2OojFPK0': 6,
 '-9Jv4aO9y70': 7,
 '-9xNrBQpFt4': 8,
 '-ARJjb6tZrc': 9,
 '-BdZPFzH2JY': 10,
 '-BnSRVqKZ6g': 11,
 '-CNyiSwzCIg': 12,
 '-DfX3_CO2bU': 13,
 '-FBwZtuJtMw': 14,
 '-FaWBtxJ37M': 15,
 '-GCCk6Qk0pQ': 16,
 '-GToXhPm2Zo': 17,
 '-G_nUBKrjjQ': 18,
 '-GjOKU3t2ng': 19,
 '-H-ERnb8H3M': 20,
 '-I_xQJYoUtk': 21,
 '-JAFb2bYJSs': 22,
 '-JRURYTfBXQ': 23,
 '-KEoDTe19gs': 24,
 '-LEg3TU9-kU': 25,
 '-LKVUarhtvE': 26,
 '-LUTdWQZY-Q': 27,
 '-MTRxRO5SRA': 28,
 '-O5kNPlUV7w': 29,
 '-Od0UgFojYY': 30,
 '-OqrZG-EBaQ': 31,
 '-PuSllvcdRc': 32,
 '-QRYwHrH3CU': 33,
 '-RAdShCdvh0': 34,
 '-RdOwhmqP5s': 35,
 '-SBNs8KuQlc': 36,
 '-SgP7rTSSWo': 37,
 '-TQsgm2tPW0': 38,
 '-TxzW4eklEU': 39,
 '-UDLorjyWg0': 40,
 '-UNjkHeLrhU': 41,
 '-VdXi2LMPyE': 42,
 '-VxR1UzdrQM': 43,
 '-WQ2QnLEEYM': 44,
 '-XKoel3M5lI': 45,
 '-XN3aZ3ZKWY': 46,
 '-YW072jxCv0': 47,
 '-YkpZ-_e4mk': 48,
 '-Z0S0Z8lUTg': 49,
 '-Z7fO7mA