In [6]:
import psycopg
from tqdm import tqdm 
from colorama import Style,Fore
import os
import json
from dotenv import load_dotenv
import time
import requests
import csv

load_dotenv()

True

In [7]:
def openJson(path):
    """Read the UTF-8 encoded JSON file at `path` and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def saveJson(path,data):
    """Serialize `data` as pretty-printed UTF-8 JSON into `path`.

    The document is first written to a sibling ``<path>.tmp`` file and then
    moved over `path` with ``os.replace``. This function is used for periodic
    checkpoints, so a crash or serialization error in the middle of a dump
    must not truncate the previously saved file.

    Args:
        path: Destination file path (str or path-like).
        data: Any JSON-serializable object.

    Raises:
        OSError: If the file cannot be written or replaced.
        TypeError: If `data` contains objects json cannot serialize.
    """
    tmp_path = os.fspath(path) + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up a partial file left behind by a failed dump.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(Style.BRIGHT+Fore.GREEN+'\n json saved'+Style.RESET_ALL)

# Update DB with the new tables

In [None]:
# Create the three spatial-entity tables:
#   - entites_spatiales:          one row per place (id, label, latitude, longitude)
#   - entites_spatiales_videos:   link table between places and videos
#   - entites_spatiales_chaines:  link table between places and channels
# The link tables reference videos(id_video) and chaines(id_chaine), which are
# not created in this notebook and must already exist in the database.
#
# The psycopg 3 connection context manager commits on success, rolls back on
# error and closes the connection in both cases, so nothing leaks if one of
# the statements fails. IF NOT EXISTS makes the cell safe to re-run.
with psycopg.connect(
    dbname="youtubestay",
    user="postgres",
    password=os.getenv("POSTGRE_PASSWORD"),
    host="localhost",
    port="5432"
) as conn:
    with conn.cursor() as cur:
        cur.execute("""
            CREATE TABLE IF NOT EXISTS entites_spatiales (
                id_entite_spatiale TEXT PRIMARY KEY,
                label TEXT NOT NULL,
                latitude FLOAT NOT NULL,
                longitude FLOAT NOT NULL
            )
        """)

        cur.execute("""
            CREATE TABLE IF NOT EXISTS entites_spatiales_videos (
                id_entite_spatiale TEXT REFERENCES entites_spatiales(id_entite_spatiale) ON DELETE CASCADE,
                id_video TEXT REFERENCES videos(id_video) ON DELETE CASCADE,
                PRIMARY KEY (id_video, id_entite_spatiale)
            )
        """)

        cur.execute("""
            CREATE TABLE IF NOT EXISTS entites_spatiales_chaines (
                id_entite_spatiale TEXT REFERENCES entites_spatiales(id_entite_spatiale) ON DELETE CASCADE,
                id_chaine TEXT REFERENCES chaines(id_chaine) ON DELETE CASCADE,
                PRIMARY KEY (id_chaine, id_entite_spatiale)
            )
        """)


# Fill the entites_spatiales_videos table

## Prepare json

In [None]:
# Load the text fields of every video (id, title, description, tags) from the
# `videos` table. The context managers guarantee the cursor and connection are
# closed even if the query raises.
with psycopg.connect(
    dbname="youtubestay",
    user="postgres",
    password=os.getenv("POSTGRE_PASSWORD"),
    host="localhost",
    port="5432"
) as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT id_video,titre,description,tags FROM videos")
        rows = cur.fetchall()

# One dict per row, keyed by column name, so the list can be dumped to JSON.
videos = [
    {
        "id_video": id_video,
        "titre": titre,
        "description": description,
        "tags": tags
    }
    for id_video, titre, description, tags in rows
]

In [None]:
# Sanity check: number of video records fetched from the database.
len(videos)

In [None]:
# Persist the fetched records; the extraction step (runAll) reads this JSON file.
saveJson('./jsons/videosForSpacialAnalysis.json',videos)

## Process

In [8]:
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

# System prompt (in French): asks the model to extract only real French
# cities/communes from the given text, ignoring countries, people, YouTube
# channel / platform names and imaginary names, and to answer with a
# lowercase, de-duplicated Python list and no explanation.
# The wording is part of the model input; editing it changes extraction results.
system_template = """
Tu es un extracteur d'entités géographiques françaises.
À partir d’un texte donné, identifie uniquement les **villes**, **communes** situés en France.
Ne prends **pas** en compte :
- les noms de pays (ex: "France"),
- les noms de personnes,
- les noms de chaînes YouTube, de plateformes (ex: YouTube, Tipeee),
- les noms imaginaires ou poétiques.

Retourne une **liste Python**, en minuscules, sans doublons, contenant uniquement des noms de lieux réels en France.
Pas d'explication, donner la reponse en format string.
"""

# User message: the `contexte` placeholder is filled when the chain is invoked
# with {'contexte': ...}.
user_template = "Contexte : {contexte}"

system_message = SystemMessagePromptTemplate.from_template(system_template)
user_message = HumanMessagePromptTemplate.from_template(user_template)

# Two-message chat prompt (system + human) shared by the model chains.
chat_prompt = ChatPromptTemplate.from_messages([system_message, user_message])

In [9]:
from langchain_ollama import ChatOllama

# Ollama chat model (llama3.2:3b) piped after the shared chat prompt.
# NOTE(review): no visible cell invokes chain_ollama; the extraction functions
# build a Gemini chain instead — confirm whether this is still needed.
llm_ollama = ChatOllama(model="llama3.2:3b")
chain_ollama =  chat_prompt | llm_ollama


In [10]:
from langchain_google_genai import ChatGoogleGenerativeAI

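Expected shape of one record in the output JSON: the input fields, plus an
`output` list that is added only when at least one entity was geocoded.

{
    id_video : '',
    titre : '',
    description : '',
    tags : '',
    output : [
        { ent : Ent1, lat : ..., lon : ... },
        { ent : Ent2, lat : ..., lon : ... },
        ...
    ]
}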

In [11]:
def getContext(title,description,tags):
    """Build the text sent to the LLM for one video.

    The result is the title, a newline, the description and, when tags are
    present, another newline followed by the tags joined with ", ".

    Args:
        title: Video title; None is treated as an empty string.
        description: Video description; None is treated as an empty string
            (the values come from database columns and may be NULL).
        tags: Iterable of tag strings, a single already-formatted string, or
            None/empty when the video has no tags.

    Returns:
        The concatenated context string.
    """
    parts = [title or '', description or '']
    if tags:
        # A plain string is appended as-is: joining it would insert ", "
        # between every character.
        parts.append(tags if isinstance(tags, str) else ', '.join(tags))
    return '\n'.join(parts)

# Per-session cache: (csvfile, column) -> set of normalized values of that column.
# Avoids re-reading the whole CSV for every candidate entity.
_ENTITY_VALUES_CACHE = {}

def getEntityVerification(entity,csvfile,column):
    """Tell whether `entity` appears in `column` of the CSV file `csvfile`.

    Comparison is case-insensitive and ignores surrounding whitespace on both
    sides. The column is loaded once per (csvfile, column) pair and kept in
    memory, so changes made to the file during the session are not picked up.

    Args:
        entity: Candidate name to look up.
        csvfile: Path to a UTF-8 CSV file with a header row.
        column: Header name of the column to search.

    Returns:
        True if a row's value matches `entity`, False otherwise (including
        when `entity` is not a string).

    Raises:
        OSError: If the CSV file cannot be opened.
        KeyError: If `column` is not a header of the file.
    """
    if not isinstance(entity, str):
        return False

    cache_key = (str(csvfile), column)
    known_values = _ENTITY_VALUES_CACHE.get(cache_key)
    if known_values is None:
        with open(csvfile, newline='', encoding='utf-8') as f:
            known_values = {
                row[column].strip().lower()
                for row in csv.DictReader(f)
                # DictReader fills missing trailing fields with None.
                if row[column] is not None
            }
        _ENTITY_VALUES_CACHE[cache_key] = known_values

    return entity.strip().lower() in known_values

def getLLMresponse(context,suffix):
    """Send `context` through the shared chat prompt to Gemini and return the reply.

    A new Gemini client is built on every call, using the API key stored in
    the environment variable ``GEMINI_API_KEY_<suffix>``.

    Args:
        context: Text substituted for the prompt's ``contexte`` placeholder.
        suffix: Suffix selecting which API-key environment variable to read.

    Returns:
        The message object returned by the chain's ``invoke`` call.
    """
    gemini_key = os.getenv('GEMINI_API_KEY_'+suffix)
    gemini_model = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        temperature=0,
        api_key=gemini_key,
    )
    return (chat_prompt | gemini_model).invoke({'contexte':context})
    
def getSpacialEntities(context,suffix):
    """Extract verified French commune names from `context` using the LLM.

    The model's reply is expected to be a Python list literal of names. Each
    name is lowercased and stripped, then kept only if it is found in the
    ``NCCENR`` column of ``./csvs/v_commune_2025.csv``.

    Args:
        context: Text to analyse (title, description and tags of a video).
        suffix: API-key suffix forwarded to getLLMresponse.

    Returns:
        A list of unique, lowercase, verified names in the order the model
        returned them; an empty list when the reply cannot be parsed as a list.

    Raises:
        Whatever getLLMresponse raises, and OSError/KeyError from the CSV
        lookup. These are not swallowed: a missing reference file would
        otherwise silently produce empty results for every video.
    """
    import ast

    response = getLLMresponse(context,suffix)

    content = getattr(response, 'content', None)
    if not isinstance(content, str):
        return []

    raw = content.strip()
    if raw.startswith('```'):
        # Models frequently wrap the list in a Markdown code fence
        # (```python ... ```); keep only the fenced payload.
        fenced_lines = raw.splitlines()[1:]
        if fenced_lines and fenced_lines[-1].strip().startswith('```'):
            fenced_lines = fenced_lines[:-1]
        raw = '\n'.join(fenced_lines).strip()

    # SECURITY: the reply is derived from untrusted video text, so it must not
    # go through eval(). literal_eval only accepts Python literals; a reply
    # that is an expression rather than a literal is treated as unparseable.
    try:
        entities = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

    if not isinstance(entities, list):
        return []

    verified = []
    for e in entities:
        if not isinstance(e, str):
            continue  # ignore stray non-string items instead of dropping the whole list
        e_cleaned = e.lower().strip()
        if not e_cleaned or e_cleaned in verified:
            continue
        if getEntityVerification(e_cleaned,'./csvs/v_commune_2025.csv','NCCENR'):
            verified.append(e_cleaned)
    return verified

# Per-session cache: entity -> {'lat', 'lon'} dict, or None when the service
# returned no match. Failed requests are not cached so they can be retried.
_GEOCODE_CACHE = {}

def getGeocoding(entity, timeout=10):
    """Look up the coordinates of `entity` (searched as "<entity>, France") on Nominatim.

    Results are cached per entity for the lifetime of the kernel, so a place
    name that occurs in many videos triggers a single HTTP request.

    Args:
        entity: Place name to geocode.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block a long batch run indefinitely.

    Returns:
        A new dict ``{'lat': float, 'lon': float}`` taken from the first
        result, or None when there is no result or the request failed (the
        error is printed, not raised).
    """
    if entity in _GEOCODE_CACHE:
        cached = _GEOCODE_CACHE[entity]
        # Return a copy: callers add keys to the returned dict.
        return dict(cached) if cached else None

    url = "https://nominatim.openstreetmap.org/search"
    params = {
        "q": entity + ", France",
        "format": "json",
        "limit": 1
    }
    headers = {
        "User-Agent": "geo-entity-extractor/1.0"
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        if data:
            coords = {'lat': float(data[0]["lat"]),
                      'lon': float(data[0]["lon"])}
            _GEOCODE_CACHE[entity] = coords
            return dict(coords)
        _GEOCODE_CACHE[entity] = None
    except Exception as e:
        # Best-effort: a failed lookup must not abort the batch.
        print(f"Erreur pour l'entité '{entity}': {e}")

    return None

def runAll(jsonfile, outputfile="./jsons/output.json", saveEvery=100,
           callsPerKey=13, cooldown=60,
           apiSuffixes=('MONO','NOUR','NOUR2008','TEXTRA','ZEG')):
    """Extract and geocode place names for every video in `jsonfile`.

    For each video the LLM extracts commune names, each name is geocoded, and
    the results are stored under the video's ``output`` key as a list of
    ``{'lat', 'lon', 'ent'}`` dicts (the key is omitted when nothing was
    found). The whole list is written to `outputfile` every `saveEvery`
    videos, once more at the end, and also when the run is interrupted by an
    exception or Ctrl-C, so work since the last checkpoint is not lost.

    API keys are rotated every `callsPerKey` videos; after the last key the
    run pauses for `cooldown` seconds before starting again with the first.

    Args:
        jsonfile: Path to the JSON list of video records
            (keys ``titre``, ``description``, ``tags``).
        outputfile: Path of the JSON file receiving the enriched records.
        saveEvery: Number of videos between two checkpoints.
        callsPerKey: Number of videos processed with one API key before switching.
        cooldown: Pause in seconds after a full cycle through all keys.
        apiSuffixes: Suffixes of the ``GEMINI_API_KEY_<suffix>`` variables to rotate through.

    Raises:
        ValueError: If `apiSuffixes` is empty.
    """
    MyAPIsuffix = list(apiSuffixes)
    if not MyAPIsuffix:
        raise ValueError("apiSuffixes must contain at least one API key suffix")

    videos = openJson(jsonfile)
    index = 0

    try:
        for processed, video in enumerate(tqdm(videos), start=1):
            videoContext = getContext(video['titre'],video['description'],video['tags'])

            videoSpacialEntities = getSpacialEntities(videoContext,MyAPIsuffix[index])

            output = []
            for ent in videoSpacialEntities:
                geocoding = getGeocoding(ent)
                if geocoding:
                    geocoding['ent'] = ent
                    output.append(geocoding)
            if output:
                video['output'] = output

            # Periodic checkpoint
            if processed % saveEvery == 0:
                saveJson(outputfile,videos)

            # API key rotation
            if processed % callsPerKey == 0:
                index = (index + 1) % len(MyAPIsuffix)
                if index == 0:
                    # Wrapped around: every key has been used, let the quotas recover.
                    print(Style.BRIGHT+Fore.BLUE+f'\n sleep for {cooldown}s'+Style.RESET_ALL)
                    time.sleep(cooldown)
                print(Style.BRIGHT+Fore.YELLOW+f'\n API KEY switched to {MyAPIsuffix[index]}'+Style.RESET_ALL)
    finally:
        # Final save; also runs when the loop is aborted so partial results survive.
        saveJson(outputfile,videos)

- Test

In [12]:
# Sample video metadata (title, description, tags) used to try out getContext.
title = "Autonomie alimentaire.  Être autosuffisant sur petite surface !"
description = """
Découvrez le Pirate de la Permaculture et son autonomie alimentaire sur toute petite surface. Picro arrive à être autosuffisant sur une surface d'à peine 800m2... de quoi rêver.

Adhérez à cette chaîne pour obtenir des avantages :
https://www.youtube.com/channel/UC9Q8WeyCb3yxySC3P3mGpBw/join
Pour me soutenir, suivez ce lien : https://fr.tipeee.com/le-jardin-d-emerveille

Au sommaire :
0:00 - Présentations
0:53 - Quelles productions
4:24 - Comment calculer son autosuffisance !
5:00 - Réduire sa dépendance énergétique
6:20 - Visite du lieu
8:08 - Surface et organisation de la production
10:58 - Gestion de l'eau
11:51 - Jardin forêt ?
13:24 - Le pirate Picro et sa chaîne YouTube.

La chaîne YouTube de Picro : https://www.youtube.com/user/piiicro

Pour me soutenir, suivez ce lien : https://fr.tipeee.com/le-jardin-d-emerveille

Merci à vous tous les permapotes d'avoir regardé cette vidéo. :)
Cliquez sur ce lien pour vous abonner : 
https://www.youtube.com/channel/UC9Q8WeyCb3yxySC3P3mGpBw
"""

tags = ["Autonomie alimentaire. Être autosuffisant sur petite surface !","permaculture","plantes","jardin","biodiversité","agroécologie","potager","des merveilles","Autonomie alimentaire","Produire sa nourriture","comment créer son potager bio","comment démarrer son potager","comment démarrer un potager","comment faire un potager bio","comment préparer son potager","créer son jardin","créer son potager","faire un potager","etre autosuffisant","Autosuffisant sur petite surface"]

# Build the LLM context string for the sample video defined above.
videoTestContexte = getContext(title, description, tags)

#print(videoTestContexte)


In [13]:
# Example text containing place names
texte_contenu = """
Lors de mon voyage en Provence, j’ai visité Marseille, le quartier du Panier, Aix-en-Provence 
et un petit village appelé Eygalières. Ensuite, nous sommes allés à Nice et dans le Vieux-Nice.
"""
# Run the extraction on the example text using the 'MONO' API key; only names
# that pass the commune CSV check are returned.
getSpacialEntities(texte_contenu,'MONO')

['marseille', 'aix-en-provence', 'eygalières', 'nice']

In [None]:
# Spot-check: look up a known commune name in the NCCENR column of the reference CSV.
getEntityVerification('aix-en-provence','./csvs/v_commune_2025.csv','NCCENR')

In [None]:
# Spot-check: geocode the same name through Nominatim.
getGeocoding('aix-en-provence')

- Run on All

In [None]:
# Full run over the exported videos. Long-running: the recorded progress bar
# below lists 42842 items. Results are written to ./jsons/output.json.
runAll("./jsons/videosForSpacialAnalysis.json")

  0%|          | 13/42842 [00:11<9:02:53,  1.31it/s] 

[1m[33m
 API KEY switched to NOUR[0m


  0%|          | 26/42842 [00:20<8:24:47,  1.41it/s] 

[1m[33m
 API KEY switched to NOUR2008[0m


  0%|          | 39/42842 [00:29<6:03:29,  1.96it/s] 

[1m[33m
 API KEY switched to TEXTRA[0m


  0%|          | 52/42842 [00:37<7:25:55,  1.60it/s]

[1m[33m
 API KEY switched to ZEG[0m


  0%|          | 64/42842 [00:50<10:10:00,  1.17it/s]

[1m[34m
 sleep for 60s[0m


  0%|          | 65/42842 [01:51<223:20:43, 18.80s/it]

[1m[33m
 API KEY switched to MONO[0m


  0%|          | 78/42842 [02:00<9:09:21,  1.30it/s]  

[1m[33m
 API KEY switched to NOUR[0m


  0%|          | 91/42842 [02:08<6:39:15,  1.78it/s] 

[1m[33m
 API KEY switched to NOUR2008[0m


  0%|          | 100/42842 [02:15<13:22:10,  1.13s/it]

[1m[32m
 json saved[0m


  0%|          | 104/42842 [02:17<7:15:11,  1.64it/s] 

[1m[33m
 API KEY switched to TEXTRA[0m


  0%|          | 117/42842 [02:24<6:09:33,  1.93it/s]

[1m[33m
 API KEY switched to ZEG[0m


  0%|          | 129/42842 [02:31<5:37:24,  2.11it/s]

[1m[34m
 sleep for 60s[0m


  0%|          | 130/42842 [03:31<218:47:01, 18.44s/it]

[1m[33m
 API KEY switched to MONO[0m


  0%|          | 143/42842 [03:39<10:32:52,  1.12it/s] 

[1m[33m
 API KEY switched to NOUR[0m


  0%|          | 156/42842 [03:47<7:50:36,  1.51it/s] 

[1m[33m
 API KEY switched to NOUR2008[0m


  0%|          | 169/42842 [04:11<14:18:15,  1.21s/it]

[1m[33m
 API KEY switched to TEXTRA[0m


  0%|          | 182/42842 [04:20<7:56:01,  1.49it/s] 

[1m[33m
 API KEY switched to ZEG[0m


  0%|          | 194/42842 [04:27<7:36:59,  1.56it/s]

[1m[34m
 sleep for 60s[0m


  0%|          | 195/42842 [05:28<219:47:16, 18.55s/it]

[1m[33m
 API KEY switched to MONO[0m


  0%|          | 200/42842 [05:35<58:51:58,  4.97s/it] 

[1m[32m
 json saved[0m


  0%|          | 208/42842 [05:40<9:34:50,  1.24it/s] 

[1m[33m
 API KEY switched to NOUR[0m


  1%|          | 221/42842 [05:47<6:01:52,  1.96it/s]

[1m[33m
 API KEY switched to NOUR2008[0m


  1%|          | 223/42842 [05:48<7:05:13,  1.67it/s]

### Plot coordinates

In [None]:
import folium

# One geocoded entity, in the same {'lat', 'lon', 'ent'} shape produced by runAll.
location_data = {"lat": 47.9556915, "lon": -1.495366, "ent": "janzé"}

marker_position = [location_data["lat"], location_data["lon"]]
marker_label = location_data["ent"]

# Map centred on the entity, with a single marker labelled by its name.
map_obj = folium.Map(location=marker_position, zoom_start=13)
folium.Marker(marker_position, popup=marker_label, tooltip=marker_label).add_to(map_obj)

# Write the interactive map to a standalone HTML file.
map_obj.save("map_janze.html")
