In [None]:
import spacy
from spacy.matcher import Matcher
import regex as re


# Keywords whose presence (as a substring of the lower-cased sentence) marks a
# sentence as street/landmark related when only_street_facts=True.
_STREET_KEYWORDS = ("rue", "avenue", "boulevard", "place", "quai", "allée", "voie", "cours", "impasse", "passage", "route", "square", "chemin", "rond-point", "pont", "cité", "esplanade", "promenade", "cathédrale", "église", "île", "chapelle")

# Maps the entity labels kept by this function to the fact key they are stored under.
_ENTITY_BUCKETS = {"LOC": "entities_loc", "PER": "entities_per", "ORG": "entities_org"}

# spaCy pipelines already loaded in this kernel, keyed by model name.
_NLP_CACHE = {}


def _get_nlp(model_name="fr_core_news_sm"):
    """Return the spaCy pipeline ``model_name``, loading it at most once per kernel."""
    if model_name not in _NLP_CACHE:
        _NLP_CACHE[model_name] = spacy.load(model_name)
    return _NLP_CACHE[model_name]


def detect_historical_facts_v3(text, city_name="", only_street_facts=False):
    """Extract dated sentences ("historical facts") from French text.

    A regex scan locates date expressions in ``text``; a spaCy ``Matcher``
    then yields token spans, and only spans starting on a token where a regex
    date begins are kept. For each kept span, the enclosing sentence becomes
    one fact.

    Args:
        text: Raw text to analyse.
        city_name: Currently unused; kept so existing callers keep working
            (the geocoding step that consumed it is disabled).
        only_street_facts: When True, keep only sentences whose lower-cased
            text contains one of ``_STREET_KEYWORDS`` as a substring.

    Returns:
        A list of dicts with keys ``date`` (text of the matched span),
        ``entities_loc`` / ``entities_per`` / ``entities_org`` (texts of the
        sentence's LOC / PER / ORG entities) and ``description`` (sentence
        text). Because the matcher pattern has optional trailing tokens, one
        date can produce several spans, so the same sentence may appear in
        more than one fact.
    """
    # Reuse the cached pipeline: loading the model on every call is by far the
    # most expensive step when this function runs in a loop over many pages.
    nlp = _get_nlp()
    date_pattern = r"\b(?:\d{1,2}\s)?(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre|\d{1,2}(?:er)?\s(?:siècle)|X{1,3}V{0,1}I{0,3}e\s(?:siècle)|\d{4})\b"

    doc = nlp(text)

    # Token indices on which a regex date match starts. char_span returns None
    # when the character offsets do not align with token boundaries; those
    # matches are ignored.
    date_token_ids = set()
    for match in re.finditer(date_pattern, text, re.IGNORECASE):
        char_span = doc.char_span(*match.span())
        if char_span is not None:
            date_token_ids.add(char_span.start)

    # Optional number, a token matching the date regex, then an optional
    # adposition and noun.
    # NOTE(review): the REGEX predicate is given no IGNORECASE flag, unlike the
    # finditer scan above - confirm this asymmetry is intended.
    pattern = [{"POS": "NUM", "OP": "?"}, {"TEXT": {"REGEX": date_pattern}}, {
        "POS": "ADP", "OP": "?"}, {"POS": "NOUN", "OP": "?"}]
    matcher = Matcher(nlp.vocab)
    matcher.add("DATE_PATTERN", [pattern])

    historical_facts = []
    for _match_id, start, end in matcher(doc):
        if start not in date_token_ids:
            continue

        span = doc[start:end]
        sent = span.sent

        if only_street_facts:
            sentence_lower = sent.text.lower()
            if not any(keyword in sentence_lower for keyword in _STREET_KEYWORDS):
                continue

        fact = {"date": span.text, "entities_loc": [], "entities_per": [], "entities_org": [], "description": sent.text}
        for ent in sent.ents:
            bucket = _ENTITY_BUCKETS.get(ent.label_)
            if bucket is not None:
                fact[bucket].append(ent.text)
        historical_facts.append(fact)

    return historical_facts


In [None]:
import re, requests
from bs4 import BeautifulSoup
# Page fetched by the requests.get(url) call at the end of this cell.
url = "https://www.lagarennecolombes.fr/Histoire%20et%20patrimoine/6042/6164"
def extract_historic_facts_regex(text):
    """Return the sentences of ``text`` that mention a period or a street-like place.

    The text is split on ``.``, ``!`` and ``?``; a fragment is kept (stripped
    of surrounding whitespace) when it contains either

    * a period expression: a 1-4 digit number or a Roman numeral (I to XXXIX),
      optionally followed by a French ordinal suffix (``er``, ``ère``, ``e``,
      ``ème``, ``eme``), then ``siècle``, ``année`` or ``époque``
      (e.g. "XIXe siècle", "19e siècle", "1er siècle"); or
    * one of the whole words ``rue``, ``place``, ``avenue``, ``boulevard``
      (singular or plural).

    Matching is case-insensitive.

    Args:
        text: Text to scan.

    Returns:
        List of matching sentence fragments, in order of appearance.
    """
    # Number or Roman numeral + optional ordinal suffix + period noun. The
    # lookahead forces at least one Roman digit, since every part of the
    # numeral sub-pattern is optional on its own. Subtractive forms (IV, IX)
    # and the ordinal suffix are required for the usual French notation of
    # centuries such as "XIXe siècle".
    date_pattern = r"\b(?:\d{1,4}|(?=[XVI])X{0,3}(?:IX|IV|V?I{0,3}))(?:er|ère|ème|eme|e)?\s?(?:siècle|année|époque)\b"

    # Whole-word match so that words merely starting with a keyword
    # (e.g. "placement") are not mistaken for places.
    location_pattern = r"\b(?:rue|place|avenue|boulevard)s?\b"

    regex = re.compile("|".join((date_pattern, location_pattern)), re.IGNORECASE)

    # Naive sentence split on terminal punctuation.
    sentences = re.split(r"[.!?]", text)
    return [sentence.strip() for sentence in sentences if regex.search(sentence)]

# Fetch the page; the timeout keeps a stalled server from hanging the kernel.
response = requests.get(url, timeout=10)

# Start from empty text so the extraction below is always well-defined, even
# when the request fails or the expected <div class="detail"> is missing.
text = ""
if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")
    content_section = soup.find("div", {"class": "detail"})
    if content_section:
        text = content_section.get_text()
historic_facts = extract_historic_facts_regex(text)

for fact in historic_facts:
    print(fact)


In [None]:
import json
# Load the city records; the context manager closes the file handle deterministically.
with open("citiesv3.json", "r", encoding="utf-8") as f:
    cities = json.load(f)

In [None]:
from googlesearch import search
# Print the top web search result for "histoire de <city>" for the first
# ten cities that are not Marseille.
count = 0

for city in cities:
    # Skip once ten cities have been searched, and always skip Marseille.
    if count >= 10 or city["name"] == "Marseille":
        continue

    query = f"histoire de {city['name']}"
    print(query)
    for result_url in search(query, num_results=1):
        print(result_url)

    count += 1

In [None]:
import requests
from bs4 import BeautifulSoup
import re

def get_section_contents(url, regex_list):
    """Download ``url`` and return the text found under matching ``<h2>`` sections.

    For each regex in ``regex_list`` (compiled case-insensitively), every
    ``<h2>`` whose text matches at its start is located. The text of the
    siblings following that header is collected until the next ``<h2>``;
    ``<h3>``-``<h6>`` headings are skipped, bracketed numeric markers such as
    ``[12]`` are removed, and each sibling contributes one stripped line.

    Args:
        url: Address of the HTML page to fetch.
        regex_list: Iterable of regex strings tested against ``<h2>`` text.

    Returns:
        One string: all collected lines, each terminated by a newline.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")

    skipped_headings = {"h3", "h4", "h5", "h6"}
    collected_lines = []

    for regex in regex_list:
        pattern = re.compile(regex, re.IGNORECASE)

        def is_matching_h2(tag):
            return tag.name == "h2" and pattern.match(tag.text)

        for header in soup.find_all(is_matching_h2):
            for sibling in header.find_next_siblings():
                if sibling.name == "h2":
                    # Next top-level section starts here.
                    break
                if sibling.name in skipped_headings:
                    continue
                without_refs = re.sub(r'\[\d+\]', '', sibling.text)
                collected_lines.append(without_refs.strip() + "\n")

    return "".join(collected_lines)

In [None]:
# Sample Wikipedia page URL (Auvergne-Rhône-Alpes); rebinds the notebook-level `url`.
# NOTE(review): no later visible cell reads this variable - confirm it is still needed.
url = "https://fr.wikipedia.org/wiki/Auvergne-Rh%C3%B4ne-Alpes"


In [None]:
# Section-title prefixes (history, culture, heritage) looked up on each page.
regex_list = [r'(?i)\bhist', r'(?i)\bcultur', r'(?i)\bpatrimo']
count, last_len = 0, 0
facts = {}
for city in cities[:10]:
    sections = get_section_contents(city["wikipediaUrl"], regex_list)
    city_facts = detect_historical_facts_v3(sections, only_street_facts=False)
    facts[city["id"]] = city_facts
    print(city["name"], len(city_facts))
    # Trial run: stop after the first city.
    break

In [None]:
# get coordinates
import requests
def get_coordinates(address):
    """Geocode ``address`` with the Nominatim search endpoint.

    Args:
        address: Free-form query string sent as the ``q`` parameter.

    Returns:
        ``(latitude, longitude)`` taken unchanged from the ``lat`` / ``lon``
        fields of the first result, or ``(None, None)`` when the request
        fails, the response is not valid JSON, or there is no result.
    """
    params = {
        "q": address,
        "format": "jsonv2"
    }
    # NOTE(review): this User-Agent imitates a browser; check it against the
    # geocoding service's usage policy.
    header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0"
    }

    try:
        # The timeout keeps a stalled connection from blocking the notebook.
        response = requests.get("https://nominatim.openstreetmap.org/search", params=params, headers=header, timeout=10)
        result = response.json()[0]
        return result["lat"], result["lon"]
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Network failure, invalid JSON, empty result list or unexpected
        # payload shape. KeyboardInterrupt / SystemExit are deliberately not
        # caught so a long geocoding loop can still be interrupted.
        return None, None


In [None]:
# Smoke test of get_coordinates on a sample query string; the cell displays the returned pair.
get_coordinates("Saint-Denis Saint-Denis")

In [None]:
# equal string
from fuzzywuzzy import fuzz
def equal_string(s1, s2, threshold=90):
    """Tell whether two strings are close enough to be considered equal.

    Args:
        s1: First string.
        s2: Second string.
        threshold: Score that ``fuzz.token_set_ratio(s1, s2)`` must strictly
            exceed for the strings to be considered equal (default 90).

    Returns:
        True when the token-set similarity score is greater than ``threshold``.
    """
    # Compare against the parameter: a hard-coded 90 here made `threshold` a no-op.
    return fuzz.token_set_ratio(s1, s2) > threshold

# Demo: two street names that share "de verdun" but differ in street type.
s1 = "boulevard de verdun"
s2 = "avenue de verdun"
# Last expression of the cell: its value is displayed as the cell output.
equal_string(s1, s2)


In [None]:
def find_facts_wikipedia(url, city_name="", only_streets=False):
    """Extract historical facts from the history/culture/heritage sections of a page.

    Args:
        url: Page to download (passed to ``get_section_contents``).
        city_name: Name of the place the page is about; forwarded to
            ``detect_historical_facts_v3``.
        only_streets: Forwarded as ``only_street_facts``.

    Returns:
        The list of fact dicts produced by ``detect_historical_facts_v3``.
    """
    # Section titles starting with "hist...", "cultur..." or "patrimo...".
    regex_list = [r'(?i)\bhist', r'(?i)\bcultur', r'(?i)\bpatrimo']
    histoire = get_section_contents(url, regex_list)
    # Forward city_name instead of silently dropping it, so the argument takes
    # effect as soon as the detector makes use of it.
    historic_facts = detect_historical_facts_v3(histoire, city_name=city_name, only_street_facts=only_streets)
    return historic_facts

In [None]:
# Collect Wikipedia facts per city id; limited to a single city for now.
facts = {}
count = last_len = 0
for city in cities[:10]:
    city_facts = find_facts_wikipedia(city["wikipediaUrl"], city["name"], False)
    facts[city["id"]] = city_facts
    print(city["name"], len(city_facts))
    count += 1
    # Trial run: stop as soon as one city has been processed.
    if count >= 1:
        break

In [None]:
import json
# Load previously saved street facts; rebinds the notebook-level `facts`.
with open('street_facts_v2.json','r', encoding='utf-8') as f:
    facts = json.load(f)

In [None]:
# Place-type keywords searched (followed by " " or ".") in lower-cased fact descriptions.
# " rue" carries a leading space, so it differs from the "rue" key of the `size` table.
streetNames = [" rue", "avenue", "boulevard", "place", "quai", "allée", "impasse", "square", "chemin", "rond-point", "pont",  "esplanade", "promenade", "voie", "cathédrale", "église", "île", "chapelle", "collège", "lycée","mairie", "école", "cimetière", "hôpital", "musée", "théâtre", "cinéma", "gare", "piscine", "parc", "jardin"]


In [None]:
# Extent associated with each place-type keyword; later cells copy the value
# into fact["area"].
# NOTE(review): units are not stated - presumably degrees of latitude/longitude; confirm.
# The key "rue" has no leading space, unlike the " rue" entry of streetNames.
size = {
    "rue" : 0.0005,
    "avenue" : 0.001,
    "boulevard" : 0.002,
    "place" : 0.0008,
    "allée" : 0.0002,
    "impasse" : 0.0001,
    "chemin" : 0.0003,
    "cours" : 0.0004,
    "quai" : 0.0006,
    "passage" : 0.0007,
    "square" : 0.0009,
    "route" : 0.0011,
    "rond-point" : 0.001,
    "voie" : 0.0005,
    "promenade" : 0.0002,
    "parc" : 0.001,
    "cité" : 0.0008,
    "esplanade" : 0.001,
    "cathédrale" : 0.0001,
    "église" : 0.0001,
    "île" : 0.001,
    "chapelle" : 0.0001,
    "collège" : 0.00001,
    "lycée" : 0.00001,
    "mairie" : 0.00001,
    "école" : 0.00001,
    "cimetière" : 0.00001,
    "hôpital" : 0.00001,
    "musée" : 0.00001,
    "théâtre" : 0.00001,
    "cinéma" : 0.00001,
    "gare" : 0.00001,
    "piscine" : 0.00001,
    "jardin" : 0.00001
}

In [None]:
# Write the current `facts` mapping to street_facts_v3.json
# (non-ASCII characters kept as-is, 4-space indentation).
with open('street_facts_v3.json','w', encoding='utf-8') as f:
    json.dump(facts, f, ensure_ascii=False, indent=4)

In [None]:
# (Re)load the city records into the notebook-level `cities`.
with open("citiesv3.json", "r", encoding="utf-8") as f:
    cities = json.load(f)

In [None]:
# Tag each fact that mentions a street-like place with an area and coordinates.
count = 0
known_places = {}  # geocoding cache: "<place> <city name>" -> (lat, lon)
street_fact = {}
i = 0
for city in facts.keys():
    street_fact[city] = []
    for fact in facts[city]:
        description = fact["description"].lower()
        for street in streetNames:
            if street + " " in description or street + "." in description:
                # strip(): streetNames stores " rue" with a leading space while
                # `size` is keyed by "rue"; without it streets get area None.
                fact["area"] = size.get(street.strip())
                # Reset for every fact so the print below never shows a key
                # left over from a previous fact (or fails on an unset name).
                key_place = None
                if len(fact["entities_loc"]) != 0:
                    # NOTE(review): max() without key=len picks the
                    # lexicographically greatest entity - confirm intent.
                    # NOTE(review): cities[city] assumes `cities` is a mapping
                    # keyed like `facts`, whereas earlier cells slice it as a
                    # list - confirm the structure of citiesv3.json.
                    key_place = max(fact["entities_loc"]) + " " + cities[city]["name"]
                    if key_place not in known_places:
                        known_places[key_place] = get_coordinates(key_place)
                    fact["coord"] = known_places[key_place]
                    fact["street_loc"] = True
                    if fact["coord"] == (None, None):
                        # Geocoding failed: fall back to the city's own coordinates.
                        fact["coord"] = cities[city]["coordinates"]
                        fact["street_loc"] = False
                else:
                    fact["coord"] = cities[city]["coordinates"]
                    fact["street_loc"] = False
                print(fact["coord"], key_place)
                street_fact[city].append(fact)
                # Only the first matching keyword is used for a fact.
                break
    i += 1
    print(i, end="-")
    # Trial run: stop after the first city.
    if i > 0:
        break
    

In [None]:
# Write `facts` to street_facts_v3.json (non-ASCII kept as-is, 4-space indentation).
# NOTE(review): the preceding cell builds `street_fact`, yet `facts` is what gets
# dumped here - confirm which mapping is meant to be saved.
with open('street_facts_v3.json','w', encoding='utf-8') as f:
    json.dump(facts, f, ensure_ascii=False, indent=4)

In [None]:
# Reset the shared counter and load the facts file named facts_dep_reg.json into `facts`.
count = 0
with open("./facts_dep_reg.json", "r", encoding="utf-8") as f:
    facts = json.load(f)

In [None]:
# Load the contents of regions.json and departements.json.
with open("./regions.json", "r", encoding="utf-8") as f:
    regions = json.load(f)
with open("./departements.json", "r", encoding="utf-8") as f:
    departements = json.load(f)

# Single lookup table over both; on a key collision the departements entry wins.
dep_regs = {**regions, **departements}

In [None]:
# Split facts into street-level facts (geocoded when possible) and the rest,
# which inherit the coordinates and area of their `dep_regs` entry.
known_places = {}  # geocoding cache: "<place> <area name>" -> (lat, lon)
street_fact = {}
non_street_fact = {}
i = 0
for city in facts.keys():
    street_fact[city] = []
    non_street_fact[city] = []
    for fact in facts[city]:
        description = fact["description"].lower()
        for street in streetNames:
            if any(street + delim in description for delim in [" ", "."]):
                if len(fact["entities_loc"]) != 0:
                    # NOTE(review): max() without key=len picks the
                    # lexicographically greatest entity - confirm intent.
                    key_place = max(fact["entities_loc"]) + " " + dep_regs[city]["name"]
                    if key_place not in known_places:
                        known_places[key_place] = get_coordinates(key_place)
                    fact["coordinates"] = known_places[key_place]
                    fact["street_loc"] = True
                    if fact["coordinates"] == (None, None):
                        # Geocoding failed: fall back to the area's own coordinates.
                        fact["coordinates"] = dep_regs[city]["coordinates"]
                        fact["street_loc"] = False
                else:
                    fact["coordinates"] = dep_regs[city]["coordinates"]
                    fact["street_loc"] = False
                # strip(): streetNames stores " rue" with a leading space while
                # `size` is keyed by "rue"; without it streets get area None.
                fact["area"] = size.get(street.strip())
                street_fact[city].append(fact)
                # Only the first matching keyword is used for a fact.
                break
        else:
            # No keyword matched: attach the fact to the whole area.
            fact["coordinates"] = dep_regs[city]["coordinates"]
            fact["area"] = dep_regs[city]["area"]
            non_street_fact[city].append(fact)


In [None]:
# Number of street facts, then the running total including non-street facts.
count = sum(len(city_facts) for city_facts in street_fact.values())
print(count)

count += sum(len(city_facts) for city_facts in non_street_fact.values())
print(count)

In [None]:
# Total number of facts across all keys of `facts`.
count = sum(len(facts[key]) for key in facts)
print(count)

In [None]:
# Save the street-level facts (overwrites street_facts_v2.json).
with open("street_facts_v2.json", "w", encoding="utf-8") as f:
    json.dump(street_fact, f, ensure_ascii=False, indent=4)

In [None]:
# Save the facts that matched no street keyword.
with open("non_street_facts_v2.json", "w", encoding="utf-8") as f:
    json.dump(non_street_fact, f, ensure_ascii=False, indent=4)

In [None]:
import json
# Reload the facts file; the context manager closes the file handle deterministically.
with open("./facts_dep_reg.json", "r", encoding="utf-8") as f:
    facts = json.load(f)

In [None]:
# Merge consecutive facts whose description is contained in the previous
# fact's description: their dates and entities are folded into the fact
# already kept. The kept dict is the same object as in `facts`, so `facts`
# is modified in place as well.
no_duplicates = {}
count = 0
for city in facts.keys():
    merged_facts = []
    no_duplicates[city] = merged_facts
    last_fact = ""
    for fact in facts[city]:
        # `merged_facts and ...`: an empty first description is a substring of
        # the initial "" and would otherwise index into an empty list.
        if merged_facts and fact["description"] in last_fact:
            previous = merged_facts[-1]
            # Append the date only if none of its space-separated parts is
            # already present in the kept date string.
            if not any(date in previous["date"] for date in fact["date"].split(" ")):
                previous["date"] += " " + fact["date"]
            for key in ("entities_loc", "entities_per", "entities_org"):
                previous[key] += [entity for entity in fact[key] if entity not in previous[key]]
        else:
            merged_facts.append(fact)
            count += 1
        last_fact = fact["description"]
            

In [None]:
# Load the saved street facts into `facts` for the cleaning step.
with open ("./street_facts_v2.json", "r", encoding="utf-8") as f:
    facts = json.load(f)

In [None]:
# Load the contents of regions.json and departements.json.
with open("./regions.json", "r", encoding="utf-8") as f:
    regions = json.load(f)
with open("./departements.json", "r", encoding="utf-8") as f:
    departements = json.load(f)

In [None]:
# Keep only facts whose description reads as a self-contained sentence.
count = 0
clean_facts = {}
# Leading words (lower-cased) that refer back to a previous sentence.
vague_terms = ["cela", "ceci", "ça","il","elle","ils","elles","on","se","sa","son","ses","leur","leurs"]
sentence_endings = (".", "!", "?", ";")
for city, facts_list in facts.items():
    clean_facts[city] = []
    for fact in facts_list:
        # The stripped description is written back, so `facts` is normalised in place.
        fact["description"] = fact["description"].strip(" \n")
        description = fact["description"]
        if not description:
            # An empty description cannot be indexed below; drop it.
            continue
        first_word = description.split(" ")[0].lower()
        if (
            first_word in vague_terms
            or not description.endswith(sentence_endings)
            or description[0].islower()
        ):
            continue
        clean_facts[city].append(fact)
        count += 1

In [None]:
# Total number of facts that survived the cleaning step.
count = sum(len(kept) for kept in clean_facts.values())
print(count)

In [None]:
from pymongo import MongoClient
import json




In [None]:
from pymongo import MongoClient

# NOTE(review): DATABASE_URL is not defined in any visible cell; it must be
# set (e.g. from an environment variable) before this cell runs.
client = MongoClient(DATABASE_URL)

# Replace 'your_database_name' with the name of your database
db = client.test

# Replace 'your_collection_name' with the name of the collection you want to use
facts_collection = db.facts

# Insert many



In [None]:
# cities facts
import json
# Load the two cleaned fact files.
with open("facts_dep_reg_clean.json", "r", encoding="utf-8") as f:
    facts_dep_reg = json.load(f)
with open("facts_city_clean.json", "r", encoding="utf-8") as f:
    facts_cities = json.load(f)

# Merge into a single mapping; on a key collision the facts_cities entry wins.
facts = {**facts_dep_reg, **facts_cities}

In [None]:
# Load the three location files (cities, regions, departements).
with open("citiesv3.json", "r", encoding="utf-8") as f:
    cities = json.load(f)
with open("./regions.json", "r", encoding="utf-8") as f:
    regions = json.load(f)
with open ("./departements.json", "r", encoding="utf-8") as f:
    departements = json.load(f)

In [None]:
# Merge all locations into one mapping; later sources win on key collisions
# (cities over departements over regions).
# NOTE(review): `**cities` requires a mapping, whereas earlier cells slice
# `cities` like a list - confirm the structure of citiesv3.json.
locations_json = {**regions, **departements, **cities}

In [None]:
# Handles to the collections of `db` used by the following cells.
historical_person_collection = db.historicalPeople
location_collection = db.locations
fact_history_collection = db.factHistoricalPeople
fact_location_collection = db.factLocations


In [None]:
# Build one database document per fact.
# NOTE(review): `locations_db` is not defined in any visible cell.
facts_db = []
for city in facts.keys():
    for fact in facts[city]:
        facts_db.append({
            "title": "",
            "content": fact["description"],
            # The same date string is used for both ends of the period.
            "from": fact["date"],
            "until": fact["date"],
            "locationId": locations_db[city]["_id"],
            "personsInvolved": [],
        })

In [None]:
import regex as re
# Index the person entities found in the facts by name. A name is kept when
# it has at least 5 characters, is not indexed yet and contains no newline.
historical_ppl = {}
for loc in facts.keys():
    for fact in facts[loc]:
        for person in fact["entities_per"]:
            if len(person) >= 5 and person not in historical_ppl and "\n" not in person:
                historical_ppl[person] = {"name": person, "FactHistoricalPerson": []}


In [None]:
# Insert the prepared fact documents (facts_db) into the facts collection.
# list() makes a shallow copy, so the inserted dicts are the same objects as in facts_db.
# NOTE(review): a later cell reads facts_db[i]["_id"], presumably relying on
# insert_many adding "_id" to each document in place - confirm.
results = facts_collection.insert_many(list(facts_db))

In [None]:
# Build one link document per (fact, known person) pair. `i` is the position
# of the current fact in facts_db, which must have been built by walking
# `facts` in this same order.
facthistoperson_db = []
i = 0
for loc in facts:
    for fact in facts[loc]:
        facthistoperson_db.extend(
            {"factId": facts_db[i]["_id"] ,"historicalPersonId": historical_ppl[person]["_id"]}
            for person in fact["entities_per"]
            if person in historical_ppl
        )
        i += 1


In [None]:
# Sanity check: for every link, fetch both documents and print the person's
# name next to the first 50 characters of the fact.
for link in facthistoperson_db:
    person_doc = historical_person_collection.find_one({"_id": link["historicalPersonId"]})
    fact_doc = facts_collection.find_one({"_id": link["factId"]})
    print(person_doc["name"] + "//", fact_doc["content"][:50])

                

In [None]:
# Display the link documents built above.
facthistoperson_db

In [None]:
# Insert the fact/person link documents into the factHistoricalPeople collection.
fact_history_collection.insert_many(list(facthistoperson_db))

In [None]:
# Query the locations whose "type" field equals "region"; the result is bound to `res`.
res = location_collection.find({"type":"region"})

In [None]:
# Display the fact documents.
facts_db

In [None]:
from bson import json_util

# Save the historical people mapping to a JSON file. The json_util.dumps /
# json.loads round trip is presumably there to turn BSON-specific values
# (e.g. ObjectId) into plain JSON-compatible data first.
with open("historical_people.json", "w", encoding="utf-8") as f:
    json.dump(json.loads(json_util.dumps(historical_ppl)), f, ensure_ascii=False, indent=4)

In [None]:
# Display locations_db. NOTE(review): this name is not defined in any visible cell.
locations_db

In [None]:
# List every person entity of at least 5 characters next to the first 50
# characters of the fact that mentions it.
for loc in facts:
    for fact in facts[loc]:
        for person in fact["entities_per"]:
            if len(person) < 5:
                continue
            print(person + "//", fact["description"][:50])


In [None]:
# Delete every fact whose content has unbalanced parentheses: an opening "("
# that is never closed before the end of the text, or a ")" that appears
# before any "(".
res = facts_collection.delete_many({"content": {"$regex":r"[^()]*\([^()]*$|^[^()]*\)|^[^()]*\([^()]*[^()]*$"}})



In [None]:
# Display the deleted_count reported by the delete_many call above.
res.deleted_count

In [None]:
# Count the documents of the facts collection by walking a cursor over all of them.
res = facts_collection.find({})
count = sum(1 for _ in res)
print(count)


Critères qu'un fait doit respecter :
- commence par une majuscule et finit par un point
- commence par une lettre
- ne contient pas de \n

In [None]:
# Flatten the per-location fact lists into one list, keeping their order.
facts_list = [fact for loc in facts.keys() for fact in facts[loc]]

In [None]:
# Matches a description that starts with a connective (e.g. "Mais", "Cependant",
# "Par exemple") or with a demonstrative / pronoun / possessive (e.g. "Ce", "il",
# "Leurs"), followed by a space. The connectives are matched capitalised only;
# the pronoun-like words accept either case for their first letter.
pattern = r"^(Mais|Cependant|Toutefois|Alors|Où|Et|Donc|Or|Ni|Car|Parce que|Ainsi|En outre|De plus|Par ailleurs|En revanche|Néanmoins|Pourtant|De même|En effet|Par conséquent|Par exemple|C'est-à-dire|D'autre part|En somme|Quant à|A contrario|De surcroît|D'ailleurs|Puisque|Bien que|[Cc]e|[Cc]ette|[Cc]et|[Ii]l|[Ee]lle|[Ii]ls|[Ee]lles|[Cc]ela|[Cc]eci|[Çç]a|[Oo]n|[Ss]e|[Ss]a|[Ss]on|[Ss]es|[Ll]eur|[Ll]eurs) "


In [None]:
import regex as re
# Print and count the descriptions that do not read as a stand-alone sentence:
# first character not a letter, embedded newline, or a leading word matched by `pattern`.
count = 0
for fact in facts_list:
    description = fact["description"]
    is_suspicious = (
        not description[0].isalpha()
        or "\n" in description
        or re.search(pattern, description) is not None
    )
    if is_suspicious:
        print(description)
        print("--"*20)
        count += 1

In [2]:
from prisma import Client
from datetime import datetime

# Instantiate the Prisma client. The recorded output of this cell is a
# RuntimeError asking to run `prisma generate` before the client can be used.
prisma = Client()


RuntimeError: The Client hasn't been generated yet, you must run `prisma generate` before you can use the client.
See https://prisma-client-py.readthedocs.io/en/stable/reference/troubleshooting/#client-has-not-been-generated-yet