In [None]:
import requests
from bs4 import BeautifulSoup
import spacy
import numpy as np

def fetch_historic_facts_v2(url):
    """Extract historic sentences from the "Histoire" section of a Wikipedia page.

    The page is downloaded, the paragraphs located between the "Histoire"
    ``h2`` heading and the next ``h2`` are concatenated, and the text is run
    through the spaCy ``fr_core_news_sm`` pipeline. A sentence is kept when
    one of its entities is labelled ``DATE``/``EVENT`` or when the entity
    text contains a street-type keyword.

    Args:
        url: Address of the page to download.

    Returns:
        A list of unique sentences (str) in order of first appearance. An
        empty list is returned when the HTTP status is not 200 or when the
        "Histoire" section cannot be located.
    """
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Erreur lors de la récupération de la page. Code d'erreur : {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # Retrieve and display the page title; a page without the heading is
    # tolerated instead of crashing on `None.text`.
    title_tag = soup.find("h1", {"id": "firstHeading"})
    if title_tag is not None:
        print(f"Titre de la page : {title_tag.text}")

    # Find the "Histoire" section. Without it there is nothing to analyse,
    # so bail out early rather than reading an unbound `content` variable.
    histoire_header = soup.find("span", {"id": "Histoire"})
    histoire_section = histoire_header.find_parent("h2") if histoire_header is not None else None
    if histoire_section is None:
        print("La section de contenu n'a pas été trouvée sur la page.")
        return []

    # Gather the text of every paragraph up to the next top-level heading.
    text = ""
    content = histoire_section.find_next_sibling()
    while content is not None and content.name != "h2":
        if content.name == "p":
            text += content.text.strip() + " "
        content = content.find_next_sibling()

    nlp = spacy.load("fr_core_news_sm")
    doc = nlp(text)

    # Street-type keywords searched as plain substrings of the entity text
    # (the duplicated "voie" entry of the original list is listed once).
    street_keywords = (
        "rue", "avenue", "boulevard", "place", "quai", "allée", "voie",
        "cours", "impasse", "passage", "route", "square", "chemin",
        "rond-point", "pont", "cité", "esplanade", "promenade",
    )

    historic_facts = []
    # NOTE(review): the French small model is documented with LOC/MISC/ORG/PER
    # labels — confirm that DATE/EVENT can actually be produced here.
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
        is_dated = ent.label_ in ("DATE", "EVENT")
        mentions_street = any(keyword in ent.text for keyword in street_keywords)
        if is_dated or mentions_street:
            sentence = ent.sent.text.strip()
            if sentence not in historic_facts:
                historic_facts.append(sentence)

    # Return only after every entity has been inspected.
    return historic_facts

# Run the scraper on one Wikipedia page; the returned list is kept in the
# notebook-level name `historic_facts`.
url = "https://fr.wikipedia.org/wiki/La_Garenne-Colombes"
historic_facts = fetch_historic_facts_v2(url)



In [None]:
import spacy
from spacy.matcher import Matcher
import regex as re


# Street-type keywords searched as plain, case-sensitive substrings of a sentence.
_STREET_KEYWORDS = (
    "rue", "avenue", "boulevard", "place", "quai", "allée", "voie", "cours",
    "impasse", "passage", "route", "square", "chemin", "rond-point", "pont",
    "cité", "esplanade", "promenade",
)

# Maps the entity labels that are kept to the fact key they are stored under.
_ENTITY_BUCKETS = {"LOC": "entities_loc", "PER": "entities_per", "ORG": "entities_org"}


def _build_fact(date_span, sent, city_name, coords):
    """Build one fact dict for `date_span` from its sentence, geocoding through the `coords` cache."""
    fact = {"date": date_span.text, "entities_loc": [], "entities_per": [], "entities_org": [], "description": sent.text}
    for ent in sent.ents:
        bucket = _ENTITY_BUCKETS.get(ent.label_)
        if bucket is not None:
            fact[bucket].append(ent.text)

    if len(city_name) > 0 and len(fact["entities_loc"]) > 0:
        # The longest location string of the sentence is the one geocoded.
        address = max(fact["entities_loc"], key=len) + ", " + city_name
        if address not in coords:
            coords[address] = get_coordinates(address)
        fact["coordinates"] = coords[address]
    else:
        fact["coordinates"] = (None, None)
    return fact


def detect_historical_facts_v3(text, city_name="", only_street_facts=False):
    """Detect dated sentences in `text` and describe each one as a fact.

    Dates are first located with a regular expression on the raw text, then
    confirmed with a spaCy ``Matcher`` pattern; a matcher hit is kept only
    when it starts on a token where a regex date also starts.

    Args:
        text: Text to analyse.
        city_name: When non-empty, the longest ``LOC`` entity of the sentence
            is combined with it and passed to ``get_coordinates``.
        only_street_facts: When true, keep only sentences containing at least
            one street-type keyword.

    Returns:
        A list of dicts with the keys ``date``, ``entities_loc``,
        ``entities_per``, ``entities_org``, ``description`` and
        ``coordinates`` (``(None, None)`` when nothing was geocoded).
    """
    historical_facts = []
    nlp = spacy.load("fr_core_news_sm")
    date_pattern = r"\b(?:\d{1,2}\s)?(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|\d{1,2}(?:er)?\s(?:siècle)|X{1,3}V{0,1}I{0,3}e\s(?:siècle)|\d{4})\b"

    doc = nlp(text)

    # Token indices on which a regex-detected date starts. `char_span` yields
    # None when the character range does not line up with token boundaries.
    date_token_ids = set()
    for match in re.finditer(date_pattern, text, re.IGNORECASE):
        char_span = doc.char_span(*match.span())
        if char_span is not None:
            date_token_ids.add(char_span.start)

    matcher = Matcher(nlp.vocab)
    pattern = [{"POS": "NUM", "OP": "?"}, {"TEXT": {"REGEX": date_pattern}}, {"POS": "ADP", "OP": "?"}, {"POS": "NOUN", "OP": "?"}]
    matcher.add("DATE_PATTERN", [pattern])

    # Geocoding cache shared by every fact of this call.
    coords = {}
    # NOTE(review): optional tokens in the pattern may produce several matches
    # with the same start, hence several facts for one date — confirm intended.
    for _match_id, start, end in matcher(doc):
        if start not in date_token_ids:
            continue
        date_span = doc[start:end]
        sent = date_span.sent
        # Every keyword is tested; the sentence is recorded once per match as
        # soon as any of them is present.
        if only_street_facts and not any(keyword in sent.text for keyword in _STREET_KEYWORDS):
            continue
        historical_facts.append(_build_fact(date_span, sent, city_name, coords))
    return historical_facts


In [None]:
import re, requests
from bs4 import BeautifulSoup
# Page downloaded by the script at the end of this cell (the URL path reads
# "Histoire et patrimoine").
url = "https://www.lagarennecolombes.fr/Histoire%20et%20patrimoine/6042/6164"
def extract_historic_facts_regex(text):
    """Return the sentences of `text` that mention a date or a street-like place.

    The text is cut on ``.``, ``!`` and ``?``; each piece is tested
    (case-insensitively) against a date pattern and a location pattern, and
    the matching pieces are returned stripped of surrounding whitespace.

    Args:
        text: Text to scan.

    Returns:
        A list of matching sentence fragments, in their original order.
    """
    # A number or a Roman numeral, optionally followed by a whitespace
    # character, then "siècle", "année" or "époque".
    date_pattern = r"(\b(?:\d{1,4}|X{1,3}I{1,3}|X{1,3}V?I{0,3}|V?I{1,3})\s?(?:siècle|année|époque)\b)"

    # A street-type word (rue, place, avenue, boulevard) and what follows it
    # up to a line break or punctuation mark.
    location_pattern = r"(\b(?:rue|place|avenue|boulevard)[^\n,.!?]*\b)"

    # Either pattern is enough for a sentence to qualify.
    fact_matcher = re.compile(f"({date_pattern}|{location_pattern})", re.IGNORECASE)

    historic_facts = []
    for fragment in re.split(r"[.!?]", text):
        if fact_matcher.search(fragment) is not None:
            historic_facts.append(fragment.strip())
    return historic_facts

# Download the page, pull the text of its `div.detail` element and print the
# sentences selected by `extract_historic_facts_regex`.
response = requests.get(url)

# Start from an empty text so the extraction below always has an input, even
# when the download fails or the expected container is missing.
text = ""
if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")
    content_section = soup.find("div", {"class": "detail"})
    if content_section is not None:
        text = content_section.get_text()
    else:
        print("La section de contenu n'a pas été trouvée sur la page.")
else:
    print(f"Erreur lors de la récupération de la page. Code d'erreur : {response.status_code}")

historic_facts = extract_historic_facts_regex(text)

for fact in historic_facts:
    print(fact)


In [None]:
import json
# Load the list of cities; later cells read the "name", "id" and
# "wikipediaUrl" keys of each entry. The context manager closes the file
# handle as soon as the JSON is parsed.
with open("citiesv2.json", "r", encoding="utf-8") as cities_file:
    cities = json.load(cities_file)

In [None]:
from googlesearch import search
# Print the first search result of "histoire de <city>" for at most ten
# cities, leaving Marseille out.
count = 0

for city in cities:
    # Guard clause: stop issuing queries once ten cities were searched, and
    # never query Marseille.
    if count >= 10 or city["name"] == "Marseille":
        continue

    query = f"histoire de {city['name']}"
    print(query)
    for j in search(query, num_results=1):
        print(j)

    count += 1

In [None]:
import requests
from bs4 import BeautifulSoup
import re

def get_section_contents(url, regex_list):
    """Download a page and return the text of the sections whose title matches.

    For each pattern of `regex_list`, every ``h2`` whose text matches the
    pattern from its first character (case-insensitively) is selected. The
    text of the elements following it, up to the next ``h2``, is appended to
    the result; ``h3``-``h6`` headings are skipped and bracketed numeric
    markers such as ``[12]`` are removed.

    Args:
        url: Address of the page to download.
        regex_list: Regular expressions applied to the ``h2`` titles.

    Returns:
        A single string; each kept element contributes its stripped text
        followed by a newline.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    sub_headings = ("h3", "h4", "h5", "h6")
    pieces = []

    for raw_pattern in regex_list:
        title_re = re.compile(raw_pattern, re.IGNORECASE)

        def is_wanted_heading(tag, title_re=title_re):
            # Only top-level section headings are candidates.
            return tag.name == "h2" and title_re.match(tag.text)

        for header in soup.find_all(is_wanted_heading):
            for sibling in header.find_next_siblings():
                if sibling.name == "h2":
                    # Next top-level section: this one is over.
                    break
                if sibling.name in sub_headings:
                    continue
                without_markers = re.sub(r'\[\d+\]', '', sibling.text)
                pieces.append(without_markers.strip() + "\n")

    return "".join(pieces)

In [None]:
# Fetch the text of the sections whose title starts with "hist", "cultur" or
# "patrimo". The inline (?i) flag repeats the re.IGNORECASE that
# `get_section_contents` already applies when compiling each pattern.
url = "https://fr.wikipedia.org/wiki/La_Garenne-Colombes"
regex_list = [r'(?i)\bhist', r'(?i)\bcultur', r'(?i)\bpatrimo']
sections = get_section_contents(url, regex_list)

In [None]:
# `get_section_contents` returns one string (not a dict of sections), so the
# whole text is analysed directly.
historic_facts = detect_historical_facts_v3(sections)

In [None]:
# get coordinates
import requests
def get_coordinates(address):
    """Look up `address` with the Nominatim search endpoint.

    Best-effort helper: any ordinary failure (network error, timeout, invalid
    JSON, empty result list, missing key) yields ``(None, None)`` instead of
    an exception. Interpreter-level signals such as ``KeyboardInterrupt`` are
    not swallowed, so a long batch can still be interrupted.

    Args:
        address: Free-form address sent as the ``q`` query parameter.

    Returns:
        A ``(latitude, longitude)`` tuple holding the ``lat`` and ``lon``
        values of the first result exactly as found in the JSON payload, or
        ``(None, None)`` on failure.
    """
    params = {
        "q": address,
        "format": "jsonv2",
    }

    try:
        # A timeout keeps one unresponsive request from blocking the caller forever.
        # NOTE(review): the Nominatim usage policy asks clients to send an
        # identifying User-Agent — consider adding one for this project.
        response = requests.get("https://nominatim.openstreetmap.org/search", params=params, timeout=10)
        result = response.json()[0]
        return result["lat"], result["lon"]
    except Exception:
        return None, None


In [None]:
# Manual check of the geocoder on one address; the cell displays the returned
# (latitude, longitude) tuple.
get_coordinates("place de la liberté, la garenne-colombes")

In [None]:
# equal string
from fuzzywuzzy import fuzz
def equal_string(s1, s2, threshold=90):
    """Tell whether two strings are similar enough to be treated as equal.

    Args:
        s1: First string.
        s2: Second string.
        threshold: Score that ``fuzz.token_set_ratio(s1, s2)`` must strictly
            exceed for the strings to be considered equal.

    Returns:
        True when the similarity score is greater than `threshold`.
    """
    return fuzz.token_set_ratio(s1, s2) > threshold

# Manual check on two names that differ only by their first word; the cell
# displays the boolean returned by `equal_string`.
s1 = "boulevard de verdun"
s2 = "avenue de verdun"
equal_string(s1, s2)


In [None]:
def find_facts_wikipedia(url, city_name="", only_streets=False):
    """Collect the historical facts found in selected sections of a page.

    The text of the sections whose title starts with "hist", "cultur" or
    "patrimo" is fetched with ``get_section_contents`` and handed to
    ``detect_historical_facts_v3``.

    Args:
        url: Address of the page to analyse.
        city_name: Forwarded to ``detect_historical_facts_v3``.
        only_streets: Forwarded to ``detect_historical_facts_v3`` as its
            third positional argument.

    Returns:
        Whatever ``detect_historical_facts_v3`` returns for the section text.
    """
    section_title_patterns = [r'(?i)\bhist', r'(?i)\bcultur', r'(?i)\bpatrimo']
    section_text = get_section_contents(url, section_title_patterns)
    return detect_historical_facts_v3(section_text, city_name, only_streets)

In [None]:
# Collect the street-related facts of the first ten cities, keyed by city id,
# printing the number of facts found for each city.
facts = {}
last_len = 0
count = 0
for city in cities[:10]:
    city_facts = find_facts_wikipedia(city["wikipediaUrl"], city["name"], True)
    facts[city["id"]] = city_facts
    print(city["name"], len(city_facts))
    count += 1
    # Safety cap; the [:10] slice above already limits the loop to ten cities.
    if count > 10:
        break

In [None]:
# Total number of facts collected so far. Summing over `facts` itself avoids
# looking up cities that were never processed (facts only holds the ids of
# the cities handled by the collection loop).
count = sum(len(city_facts) for city_facts in facts.values())
count

In [None]:
# Save the collected facts as indented UTF-8 JSON, keeping accented
# characters unescaped.
with open('facts.json', 'w', encoding='utf-8') as facts_file:
    json.dump(facts, facts_file, ensure_ascii=False, indent=4)