# Refinements

## Setup

In [None]:
import json
import os
from tqdm import tqdm
from colorama import Style,Fore
from geopy.geocoders import Nominatim
from gliner import GLiNER
import time
from collections import Counter

In [None]:
def openJson(path):
    """Load and return the JSON content stored at ``path``.

    When nothing exists at ``path`` yet, a file holding an empty JSON list is
    created first, so the call then returns ``[]``.
    """
    file_is_missing = not os.path.exists(path)
    if file_is_missing:
        # Seed the missing file with an empty list.
        with open(path, "w", encoding="utf-8") as new_file:
            new_file.write(json.dumps([]))

    with open(path, "r", encoding="utf-8") as existing_file:
        return json.load(existing_file)

def saveJson(path,data):
    """Serialize ``data`` as JSON into the file at ``path``, overwriting it.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) and the
    output is indented by two spaces.
    """
    dump_options = {"ensure_ascii": False, "indent": 2}
    with open(path, "w", encoding="utf-8") as output_file:
        json.dump(data, output_file, **dump_options)

## Language Detection

In [None]:
from langdetect import detect

In [None]:
def languageDetection(videosPath,savePath):
    """Tag every video with a coarse language label and save the result.

    Reads the list of videos from ``videosPath``, runs ``detect`` on the
    lower-cased title and description of each one and stores the outcome in
    ``video['langue']``: ``'fr'`` when French is detected, ``'autre'``
    otherwise. The updated list is written to ``savePath``.

    A video whose detection fails (for example missing fields or text with no
    usable features) is reported and also labelled ``'autre'``, so that every
    saved video carries a ``'langue'`` key for the later steps that read it.

    Args:
        videosPath: path of the JSON file holding the list of videos.
        savePath: path of the JSON file to write.
    """
    videos = openJson(videosPath)

    for video in tqdm(videos,'Language Detection...'):
        try:
            # The space keeps the last word of the title from being glued to
            # the first word of the description.
            text = (video['titre_video'] + ' ' + video['description']).lower()
            detected = detect(text)
        except Exception:
            # Not a bare ``except`` so that Ctrl-C can still stop the loop.
            print(Style.BRIGHT+Fore.YELLOW+f"\n Detection failed for video : {video.get('id_video')}"+Style.RESET_ALL)
            detected = None

        video['langue'] = 'fr' if detected == 'fr' else 'autre'

    saveJson(savePath,videos)
  

In [None]:
# Label every collected video with its language; the result is written to videosR1.json.
languageDetection('../../collecting/jsons/videos.json','./jsons/videosR1.json')

## Add channel location

In [None]:
def locationAdding(channelsPath,videosPath,savePath):
    """Refine the ``'fr'`` language label of videos with their channel's country.

    For every video whose ``'langue'`` is exactly ``'fr'``, the country stored
    in the ``'localisation'`` field of its channel is appended, giving labels
    such as ``'fr-FR'``. Videos with another label, without a label, or whose
    channel has an empty location are left untouched. A video whose channel
    is not present in the channels file is reported and left untouched.

    Args:
        channelsPath: path of the JSON file holding the list of channels.
        videosPath: path of the JSON file holding the list of videos.
        savePath: path of the JSON file to write with the refined videos.
    """
    videosR1 = openJson(videosPath)
    channels = openJson(channelsPath)

    # Channel id -> country, built once for constant-time lookups.
    channels_countries = {
        channel['id_chaine']: channel.get('localisation') for channel in channels
    }

    for video in tqdm(videosR1,'locationAdding...'):
        # ``.get`` so that a video without a language label is skipped
        # instead of aborting the whole run with a KeyError.
        if video.get('langue') != 'fr':
            continue

        channelId = video.get('id_chaine')
        if channelId not in channels_countries:
            print(Style.BRIGHT+Fore.YELLOW+f"\n probleme while locationAdding video: {video.get('id_video')}"+Style.RESET_ALL)
            continue

        country = channels_countries[channelId]
        if country:
            video['langue'] += f'-{country}'

    saveJson(savePath,videosR1)

In [None]:
# Append the channel country to the 'fr' videos of videosR1.json; the result is written to videosR2.json.
locationAdding('../../collecting/jsons/channels.json','./jsons/videosR1.json','./jsons/videosR2.json')

## Add channel location with detection-heuristic

- Some videos are detected as French, but we cannot tell whether they are fr-FR because their channel has an unknown location.
- The idea is to detect the channel's location so that we can decide for its videos.

### 1. Separate channels with unknown country

In [None]:
# Load the collected channels; the trailing expression displays how many there are.
channels = openJson("../../collecting/jsons/channels.json")
len(channels)

In [None]:
# Keep only the channels whose 'localisation' field is an empty string,
# persist them, then display how many were selected.
channels_location_Unknown = [
    channel for channel in channels if channel['localisation'] == ""
]
saveJson("./jsons/channels_location_Unknown.json",channels_location_Unknown)
len(channels_location_Unknown)

### 2.Helper functions

In [None]:
# GLiNER checkpoint used by getNer to extract location mentions from text.
NER = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")
# Nominatim geocoder used by getGeocoding to resolve a place name to a country code.
# NOTE(review): "geoapi" is a generic user agent; Nominatim's usage policy
# presumably expects an application-specific one — confirm.
geolocator = Nominatim(user_agent="geoapi")

In [None]:
# Entity labels (in French) passed to NER.predict_entities by getNer; they all
# describe some kind of geographic mention.
labels = [
    "Localisation",
    "Ville",
    "Commune",
    "Pays",
    "Zone géographique",
    "Continent",
    "Région",
    "Département",
    "Code postal",
    "Quartier",
    "Adresse",
    "Lieu-dit",
    "Coordonnées GPS",
    "Latitude",
    "Longitude",
    "Territoire",
    "Aire urbaine",
    "Espace rural",
    "Zone rurale",
    "Zone urbaine",
    "Périmètre géographique",
    "Localité",
]

def getNer(context):
    """Return the texts of the location entities detected in ``context``.

    ``NER.predict_entities`` is queried with the module-level ``labels``.
    Only entities whose score is strictly greater than 0.5 are kept, in the
    order the model returned them. A falsy prediction result gives ``[]``.
    """
    predictions = NER.predict_entities(context, labels)
    if not predictions:
        return []
    return [entity['text'] for entity in predictions if entity['score'] > 0.5]

def getGeocoding(locations):
    """Resolve place names to upper-case country codes with the geocoder.

    Args:
        locations: iterable of place-name strings (may contain repetitions).

    Returns:
        A ``(countries, details)`` tuple. ``countries`` holds one country code
        per resolved entry of ``locations`` (repetitions included, so it can
        be used for a majority vote); ``details`` maps each resolved place
        name to its country code. Places that cannot be resolved are skipped.

    A failing request only skips the place it was made for: the error is
    reported, the function pauses for one second and carries on with the
    remaining places.
    """
    countries = []
    details = {}
    # Place name -> country code (or None), so a name that appears several
    # times is only sent to the geocoding service once.
    resolved = {}

    for location in locations:
        if location not in resolved:
            try:
                address = geolocator.geocode(location, language='fr', addressdetails=True, timeout=10)
            except Exception as error:
                print("Error with GeoCoding", error)
                time.sleep(1)
                continue  # not cached: a later repetition may succeed
            # NOTE(review): no delay between successful requests; Nominatim's
            # usage policy presumably limits the request rate — confirm.
            raw_address = address.raw.get('address', {}) if address else {}
            country_code = raw_address.get('country_code')
            resolved[location] = country_code.upper() if country_code else None

        code = resolved[location]
        if code is not None:
            countries.append(code)
            details[location] = code

    return countries,details

def countRepetitions(countries):
    """Return a ``Counter`` giving how many times each country code occurs.

    Args:
        countries: list of country codes, repetitions included.
    """
    # Counter counts the elements of a sequence directly; no manual loop needed.
    return Counter(countries)

def getVidoesdata(channelId,videos):
    """Concatenate the titles and descriptions of one channel's videos.

    Every entry of ``videos`` whose ``'id_chaine'`` equals ``channelId``
    contributes its title and then its description, each followed by a
    newline, in input order. Returns an empty string when nothing matches.
    """
    pieces = []
    for entry in videos:
        if entry['id_chaine'] != channelId:
            continue
        pieces.append(entry['titre_video'] + '\n')
        pieces.append(entry['description'] + '\n')
    return ''.join(pieces)

def findCountry(locations):
    """Pick the most frequent country among the geocoded ``locations``.

    Returns a ``(country, details)`` tuple: ``details`` is the mapping
    produced by ``getGeocoding`` and ``country`` is the country code with the
    highest count, or ``''`` when no location could be resolved.
    """
    resolved, details = getGeocoding(locations)
    if not resolved:
        return '', details
    if len(resolved) == 1:
        return resolved[0], details
    ranking = countRepetitions(resolved).most_common(1)
    return ranking[0][0], details

def RefineChannel(channel,videos):
    """Guess the country of ``channel`` from its own text, else from its videos.

    The channel name and bio are searched for location mentions first. If any
    are found, the country is derived from them alone (the videos are not
    consulted, even when that country comes back empty). Otherwise the titles
    and descriptions of the channel's videos are searched instead.

    Returns a ``(country, details)`` tuple; ``('', {})`` when no location
    mention is found in either text.
    """
    own_text = channel['nom_chaine']+'\n'+channel['bio']
    own_locations = getNer(own_text)
    if len(own_locations) > 0:
        return findCountry(own_locations)

    # Nothing in the channel's own text: fall back to its videos' metadata.
    videos_text = getVidoesdata(channel['id_chaine'],videos)
    video_locations = getNer(videos_text)
    if len(video_locations) > 0:
        return findCountry(video_locations)

    return '',{}
   
def RefineAllChannels(videosPath,channelsUnKPath):
    """Detect a country for every channel of the unknown-location file.

    Each channel read from ``channelsUnKPath`` gets three fields:
    ``'localisation'`` (detected country, possibly ``''``),
    ``'localisation_details'`` (place name -> country code) and
    ``'localisationTime(s)'`` (seconds spent on the channel). The file at
    ``channelsUnKPath`` is rewritten in place, every 100 processed channels
    and once more at the end.

    Channels that already carry ``'localisation_details'`` are skipped, so a
    run interrupted after an intermediate save resumes where it stopped
    instead of starting over. Regenerate the file to force a full re-run.

    Args:
        videosPath: path of the JSON file holding the list of videos.
        channelsUnKPath: path of the JSON file holding the channels to refine.
    """
    channels_location_Unknown = openJson(channelsUnKPath)
    videos = openJson(videosPath)

    # Group the videos by channel once, so that each channel only scans its
    # own videos instead of the whole collection.
    videos_by_channel = {}
    for video in videos:
        videos_by_channel.setdefault(video.get('id_chaine'), []).append(video)

    processed_since_save = 0
    for channel in tqdm(channels_location_Unknown,"Channels-Locations Refining..."):
        if 'localisation_details' in channel:
            continue  # already handled by a previous (interrupted) run

        channel_videos = videos_by_channel.get(channel.get('id_chaine'), [])

        # perf_counter is monotonic, which suits measuring a duration.
        start = time.perf_counter()
        channelLocation,details = RefineChannel(channel,channel_videos)
        end = time.perf_counter()

        channel['localisation'] = channelLocation
        channel['localisation_details'] = details
        channel['localisationTime(s)'] = end-start

        # Intermediate saving, so a crash does not lose the work done so far.
        processed_since_save += 1
        if processed_since_save >= 100:
            saveJson(channelsUnKPath,channels_location_Unknown)
            processed_since_save = 0

    saveJson(channelsUnKPath,channels_location_Unknown)



### 3.Run All

In [None]:
# Detect a country for every channel of channels_location_Unknown.json; that file is updated in place.
RefineAllChannels("../../collecting/jsons/videos.json","./jsons/channels_location_Unknown.json")

### 4.Refine the videos

#### Create channelsR1.json

Insert the newly detected locations into a copy of the original channels file.

In [None]:
# Reload the original channels and the refined unknown-location channels,
# then print the size of each list.
channels = openJson("../../collecting/jsons/channels.json")
channels_location_Unknown = openJson("./jsons/channels_location_Unknown.json")
print(len(channels))
print(len(channels_location_Unknown))

In [None]:
# Channel id -> detected location, built once from the refined channels (the
# first occurrence of an id wins) so each lookup is constant-time.
detected_locations = {}
for unknown_channel in channels_location_Unknown:
    detected_locations.setdefault(unknown_channel['id_chaine'], unknown_channel['localisation'])

def returnLocation(channelId):
    """Return the location detected for ``channelId``.

    Returns ``''`` when the channel is not among the refined channels or has
    no detected location, so ``'localisation'`` always stays a string.
    """
    return detected_locations.get(channelId) or ""

# Fill in the location of every channel that had none in the original file.
for channel in tqdm(channels):
    if channel['localisation'] == "":
        channel['localisation'] = returnLocation(channel['id_chaine'])

saveJson("./jsons/channelsR1.json",channels)

In [None]:
# Reload the merged channels file; the trailing expression displays its size.
channelsR1 = openJson("./jsons/channelsR1.json")
len(channelsR1)

#### Create videosR3.json

In [None]:
# Same refinement as before, applied to videosR1.json but with the completed channels file (channelsR1.json); output goes to videosR3.json.
locationAdding('./jsons/channelsR1.json','./jsons/videosR1.json','./jsons/videosR3.json')

## Results 

In [None]:
# Load the refined videos; the trailing expression displays how many there are.
videosR3 = openJson("./jsons/videosR3.json")
len(videosR3)

In [None]:
# Count the videos carrying each 'langue' value and print the values from the
# most to the least frequent.
langue_counter = Counter(video['langue'] for video in videosR3)
print("Unique values for the language field with occurrences :")
for langue, count in langue_counter.most_common():
    print(f"- {langue} : {count}")


# Filter by Language

In [None]:
# Load the refined videos that the language filter below works on, and display their number.
videosR3 = openJson("./jsons/videosR3.json")
len(videosR3)

In [None]:
# Keep only the videos labelled exactly 'fr-FR', save them, then display how
# many remain.
videosF1 = [video for video in videosR3 if video['langue'] == 'fr-FR']
saveJson("./jsons/videosF1.json",videosF1)
len(videosF1)

# Filter TV Channels Videos

In [None]:
# Define a list of TV Channels

# Names of the TV/radio channels whose videos are filtered out below.
chainesTv = [
    "France 2", "France 3", "France 4","France 5","Franceinfo",
    "BFMTV", "C8", "CStar", "Gulli", "Cnews",
    "Canal+", "Planète+", "LCI", "Paris première",
    "6ter", "Arte", "M6", "W9",
    "TFX", "TMC", "NRJ12", "TF1","La Chaîne parlementaire",
    "Chérie 25", "RMC"
]
# Normalise every name (lower-case, spaces removed) so it can be compared with
# channel names normalised the same way.
chainesTv = [nomTV.lower().replace(" ", "") for nomTV in chainesTv]

print(len(chainesTv))
print(chainesTv)

In [None]:
# Load the fr-FR videos and the original channels (used to look up channel names).
videosF1 = openJson("./jsons/videosF1.json")
channels = openJson("../../collecting/jsons/channels.json")

In [None]:
def getChannelName(channelId):
    """Return the normalised name of the channel identified by ``channelId``.

    The name is lower-cased and stripped of spaces. When no channel of the
    module-level ``channels`` list has this id, ``''`` is returned (rather
    than ``None``) so callers can run substring tests on the result safely.
    """
    for channel in channels:
        if channel['id_chaine']==channelId:
            return channel['nom_chaine'].lower().replace(" ", "")
    return ""

In [None]:
chainesTVCounter = Counter() # Counts how many videos were published by each TV channel
videosChainesTV = []  # ids of the videos published by a TV channel (each id once)
for video in videosF1:
    # An unknown channel id may yield no name; treat it as an empty name so
    # the substring test below cannot fail on None.
    channelName = getChannelName(video['id_chaine']) or ""

    matched = False
    for name in chainesTv:
        # Substring match: a TV name only has to appear somewhere in the
        # normalised channel name.
        if name in channelName:
            chainesTVCounter[name]+=1
            matched = True

    # Record the video once, even when several TV names match its channel.
    if matched:
        videosChainesTV.append(video['id_video'])

In [None]:
chainesTVCounter

In [None]:
# Drop every video published by a TV channel and save the remaining ones.
# The ids are put in a set once so each membership test is constant-time.
tv_video_ids = set(videosChainesTV)
videosF2 = [video for video in videosF1 if video['id_video'] not in tv_video_ids]
saveJson("./jsons/videosF2.json",videosF2)

# Filter by Relevance index

- *R.I* = llmScore × α + numQueries × (1-α)
- *llmScore* : a score from 1 to 10 (float value) given by a prompted LLM (Gemini in this case).
- *numQueries* : is the number of search queries that returned the video in the collecting phase.


## Setup

In [None]:
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv

load_dotenv() 

## LLM

In [None]:
# Gemini client used to score the videos. The API key is read from the
# GEMINI_API_KEY environment variable; temperature is set to 0.
gemini_flash = GoogleGenerativeAI(
    model="gemini-2.0-flash", 
    google_api_key=os.getenv("GEMINI_API_KEY"),
    temperature = 0)

In [None]:
# System prompt (in French): defines self-sufficiency, the relevance criteria
# and the expected answer fields (decision, justification, score).
# NOTE(review): the JSON example at the end has no surrounding braces —
# presumably because the prompt template would read `{...}` as a variable and
# literal braces would need escaping — confirm.
sysprompt = """
# La définition d’autosuffisance
---

L'autosuffisance est la démarche visant à acquérir la capacité de subvenir par soi-même à ses besoins fondamentaux, 
en premier lieu alimentaires par l'autoconsommation – c'est-à-dire produire, récolter et conserver un maximum de sa propre nourriture, 
souvent en privilégiant le bio, le local et le saisonnier. 

L'autosuffisance Plus qu'une simple recherche d'autonomie matérielle, elle représente un engagement pour être moins dépendant du système économique et social extérieur, impliquant des choix concrets comme trouver un lieu propice et le concevoir judicieusement (par exemple en permaculture), 
ainsi qu'un changement dans la manière de valoriser son temps et de consommer, formant ainsi le fondement d'une vie plus autonome.

---

# Votre mission
---

- Décider si une vidéo concerne la thématique de l'autosuffisance en se basant sur ses métadonnées (titre, description, tags).  

- Attribuer un score (float) de 1 à 10 pour évaluer la pertinence de la vidéo par rapport à la thématique de l'autosuffisance.

# Les vidéos pertinentes (Répondre par oui)
---

- Les vlogs, les interviews et les vidéos de partage d'expériences en lien avec l'autosuffisance sont intéressants.  
- Les tutoriels et les vidéos de conseils sont également pertinents.  
- Les vidéos présentant une technique liée à l'autosuffisance ou y contribuant sont utiles.  
- Donner un score plus élevé pour les vidéos qui mentionnent explicitement des mots-clés liés à cette thématique.

# Les vidéos non pertinentes (Répondre par non)
---

- **Les vidéos provenant d'une chaîne TV ou radio** doivent être exclues, même si elles parlent d'autosuffisance.  
- **Les vidéos concernant un emplacement hors de la France** 
- **Les vidéos d'entreprises commercialisant des produits ou des vidéos publicitaires.**  
- **Les vidéos hors sujet**, telles que les webinaires, webconférences, ou présentations commerciales.

---

# Votre réponse  
--- 

- La réponse doit être au format JSON :  

    "decision": "oui ou non",
    "justification": "Justifiez votre décision avec des arguments",
    "score": Votre évaluation en format décimal pour que ce soit plus précis.
"""

# User message template: the {titre}, {description} and {tags} placeholders
# are filled with one video's metadata when the chain is invoked.
userprompt = """
Titre
---
{titre}
Description
---
{description}
tags
---
{tags}
"""

# Two-message chat template: the system instructions followed by the
# per-video metadata.
prompt = ChatPromptTemplate([
    ("system", sysprompt),
    ("user", userprompt)
])

def cleanAnswer(answer):
    """Strip a Markdown code fence from an LLM answer, leaving its content.

    Handles answers such as ```` ```json\\n{...}\\n``` ````, including when the
    fence is surrounded by whitespace or newlines. An answer without a fence
    is returned with surrounding whitespace removed.

    Args:
        answer: raw text returned by the model.

    Returns:
        The answer without the fence backticks, the leading ``json`` language
        tag and surrounding whitespace.
    """
    # Whitespace must go first: a trailing newline after the closing fence
    # would otherwise stop the backticks from being stripped.
    answer = answer.strip().strip("`")
    if answer.startswith("json"):
        answer = answer[4:]
    return answer.strip()

# Pipe the filled prompt into the Gemini model; chain.invoke takes the template variables.
chain = prompt | gemini_flash


In [None]:
# Test

# Smoke test of the chain on one hand-written video with no tags.
# The variable is not called `input`, so the builtin input() stays usable in
# the rest of the notebook.
sample_input = {"titre":"Reportage : Culture hors sol, solution pour l'autosuffisance alimentaire",
         "description":"Face aux défis croissants des produits maraîchers de qualités, certaines familles  font recours à ces pratiques pour assurer une alimentation saine et durable.  \n#information  #culture  #reportage",
         "tags":', '.join([])}
print(sample_input)
print(cleanAnswer(chain.invoke(sample_input)))

## Helper functions

In [62]:
def calculateRI(videosPath,savePath,alpha):
    """Score every video with the LLM and store its relevance index (R.I).

    For each video read from ``videosPath`` the chain is invoked with the
    title, description and tags. The fields of the JSON answer are merged
    into the video and ``video['RI']`` is set to
    ``alpha * score + (1 - alpha) * numQueries``, where ``numQueries`` is
    ``len(video['requete'])``. A video whose processing fails for any reason
    (missing field, request error, unparsable answer) is reported and gets
    ``RI = -1``.

    The whole list is written to ``savePath`` every 100 videos and once more
    at the end.

    Args:
        videosPath: path of the JSON file holding the list of videos.
        savePath: path of the JSON file to write.
        alpha: weight of the LLM score in the index (``1 - alpha`` goes to
            the number of queries).
    """
    videos = openJson(videosPath)
    requests_per_window = 13  # stay below the free tier's 15 requests/min
    apiCount = 0

    for index, video in enumerate(tqdm(videos,'Calculating R.I...'), start=1):
        try:
            llm_input = {"titre":video['titre_video'],
                "description":video['description'],
                # A video without tags is scored on its title/description only.
                "tags":', '.join(video.get('tags') or []),}

            # Counted before the call: a failed request still used the quota.
            apiCount += 1
            LLMResponse = cleanAnswer(chain.invoke(llm_input))
            LLMjson = json.loads(LLMResponse)

            numQueries = len(video['requete'])
            # float() accepts a score returned as a number or a numeric string.
            RI = alpha*float(LLMjson['score'])+(1-alpha)*numQueries

            video.update(LLMjson)
            video['RI'] = RI

        except Exception as e:
            video['RI'] = -1
            print("Error occurred:", e)

        # API delay to avoid being blocked; pointless after the last video.
        if apiCount >= requests_per_window and index < len(videos):
            print("sleep for 1 min")
            time.sleep(60)
            apiCount = 0

        # Safety saving
        if index % 100 == 0:
            saveJson(savePath,videos)

    saveJson(savePath,videos)
            
def filterByRI(videosPath,savePath,threshold):
    """Keep the videos whose relevance index exceeds ``threshold``.

    Reads the videos from ``videosPath``, retains those whose ``'RI'`` is
    strictly greater than ``threshold`` and writes them to ``savePath``.
    """
    candidates = openJson(videosPath)
    retained = [
        video
        for video in tqdm(candidates,'Filtering by R.I...')
        if video['RI'] > threshold
    ]
    saveJson(savePath,retained)

## Run on ALL

In [63]:
# Score the videos of videosF2.json with alpha = 0.7; the result is written to videosR4.json.
calculateRI('./jsons/videosF2.json','./jsons/videosR4.json',0.7)

Calculating R.I...:  43%|████▎     | 12/28 [00:14<00:19,  1.20s/it]

sleep for 1 min


Calculating R.I...:  89%|████████▉ | 25/28 [01:27<00:03,  1.25s/it]

sleep for 1 min


Calculating R.I...: 100%|██████████| 28/28 [02:30<00:00,  5.37s/it]


In [64]:
# Keep the videos whose R.I is strictly greater than 4; the result is written to videosF3.json.
filterByRI('./jsons/videosR4.json','./jsons/videosF3.json',4)

Filtering by R.I...: 100%|██████████| 28/28 [00:00<?, ?it/s]
