In [1]:
from flask import Flask, request, jsonify
import json
from sqlalchemy import create_engine, MetaData, Table
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
def load_stop_words(path):
    with open(path, "r") as file:
        return json.load(file)
    
def process_table(engine, metadata, table_name):
    table = Table(table_name, metadata, autoload_with=engine)
    with engine.connect() as connection:
        result_set = connection.execute(table.select()).fetchall()
    return result_set

def concatenate_row_values(row):
    return ''.join(str(value) for value in row if isinstance(value, str))    

def get_document_id(doc):
    return doc[0]

def search(query, table_name= 'Toutes'):
    if table_name != "Toutes":
        documents = data[table_name]
        vectorizer = vectorizers[table_name]
        tfidf_matrix = vectorizer.transform([concatenate_row_values(row) for row in documents])
        query_vec = vectorizer.transform([query])
        scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
        ranked_scores = sorted([(score, row) for score, row in zip(scores, documents)], reverse=True, key=lambda x: x[0])
        return ranked_scores[:10]  # 10 meilleurs résultats
    else:
        all_scores = []
        for table in tables:
            documents = data[table]
            vectorizer = vectorizers[table]
            tfidf_matrix = vectorizer.transform([concatenate_row_values(row) for row in documents])
            query_vec = vectorizer.transform([query])
            scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
            all_scores.extend([(score, row, table) for score, row in zip(scores, documents)])
        return sorted(all_scores, key=lambda x: x[0], reverse=True)[:10]

In [3]:
# Initialisation de la connexion à la base de données et du metadata
engine = create_engine("mysql+pymysql://root:root@9mois.ownedge.fr:3306/9mois")
metadata = MetaData()

In [4]:

# Chargement des stop words
stop_words = load_stop_words("stop_words_french.json")

In [5]:
# Traitement des tables et consolidation des données
tables = ['articles', 'food', 'questions', 'recipes']
data = {table: process_table(engine, metadata, table) for table in tables}

# Préparation du vectorisateur TF-IDF
vectorizers = {table: TfidfVectorizer(stop_words=stop_words) for table in tables}
for table in tables:
    docs = [concatenate_row_values(row) for row in data[table]]
    vectorizers[table].fit_transform(docs)




In [5]:
data

NameError: name 'data' is not defined

In [6]:
search('kiwi')

[(0.5577068863160419,
  (8, '# Le kiwi, atout vitaminé de votre grossesse', "## ***🥝Le kiwi,\xa0une bombe de vitamines et de fibres\xa0!***\n\nLe **kiwi** est un fruit star du mois de janvier. Avec son goût acidulé, il\xa0chan ... (1148 characters truncated) ... our l'instant, peu d'études\xa0sur le sujet, ce serait certains polyphénols (des petites molécules anti-oxydantes) qui produiraient un effet laxatif.", datetime.datetime(2023, 6, 10, 15, 31, 38), datetime.datetime(2023, 6, 13, 10, 22, 11), 0, datetime.date(2023, 6, 13), '/images/articles/le-kiwi-atout-vitamine-de-votre-grossesse.jpeg', 1),
  'articles'),
 (0.3939051836558003,
  (28, 'Brochettes de cabillaud sauce kiwi', 30, 'Facile', 'Coût Moyen', '/images/recipes/brochette_cabillaud_sauce_kiwi.png', '> ***L’avis 9 mois à croquer :***\n> \n> - Une recette qui contribue à la **consommation hebdomadaire de poisson blanc** et qui apporte **des vitamin ... (307 characters truncated) ... aud** peut être remplacé par d’autres poisson

In [10]:
query = 'kiwi'
table_choices = 'articles'

if not query:
   print(({'error': 'Aucune requête fournie.'}), 400)

try:
    if table_choices != 'Toutes':
        table_choices = table_choices.split(',')  # Sépare les noms de tables si plusieurs sont fournis
        all_scores = []
        for table_choice in table_choices:
            if table_choice in tables:
                table_results = search(query, table_choice)
                for score, row in table_results:
                    doc_id = get_document_id(row, table_choice)
                    all_scores.append((score, row, table_choice, doc_id))
        all_scores = sorted(all_scores, key=lambda x: x[0], reverse=True)[:10]
    else:
        all_scores = search(query, table_choices)
    
    
    formatted_results = [{
        'score': score,
        'document_id': doc_id,
        'document': concatenate_row_values(row),
        'table': table
    } for score, row, table, doc_id in all_scores]

    print(({
        'query': query,
        'table': table_choices,
        'results': formatted_results
    }))
except Exception as e:
    print(({'error': str(e)}), 500)

{'error': 'get_document_id() takes 1 positional argument but 2 were given'} 500


In [8]:
res = search('kiwi')

In [9]:
type(res[0])

tuple

In [11]:
import nltk

from nltk.stem.snowball import SnowballStemmer



In [13]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
stemmer = SnowballStemmer('french')
stemmer.stem("j'ai des nausées et je manque de fer")

"j'ai des nausées et je manque de f"

In [None]:
['jai', 'des' , 'nausées', ]

In [15]:
vect_test = TfidfVectorizer()

analyzer_test = vect_test.build_analyzer()
preprocess_test = vect_test.build_preprocessor()
token_test = vect_test.build_tokenizer()

In [27]:
string_test = "j'ai des nausées et je manque de fer"

# step_1 = analyzer_test(string_test)
# print(step_1)
step_2 = preprocess_test(string_test)
print(step_2)
step_3 = token_test(step_2)
print(step_3)
step_4 = []
for words in step_3:
    stemmed_word = stemmer.stem(words)
    step_4.append(stemmed_word)
print(step_4)
' '.join(step_4)

j'ai des nausées et je manque de fer
['ai', 'des', 'nausées', 'et', 'je', 'manque', 'de', 'fer']
['ai', 'de', 'naus', 'et', 'je', 'manqu', 'de', 'fer']


'ai de naus et je manqu de fer'

In [19]:
analyzer_test(string_test)

['ai', 'des', 'nausées', 'et', 'je', 'manque', 'de', 'fer']

In [4]:
mock_list = ['ai', 'des', 'nausées', 'et', 'je', 'manque', 'de', 'fer']
concatenate_row_values(mock_list)

'aidesnauséesetjemanquedefer'

In [23]:
def analyzer_maison(text):
    vect_test = TfidfVectorizer()
    token_test = vect_test.build_tokenizer()
    tokenized_text = token_test(text)
    stemmer = SnowballStemmer('french')
    output = [stemmer.stem(word) for word in tokenized_text]
    return output

In [24]:
test_vect_maison = TfidfVectorizer(analyzer=analyzer_maison)

In [26]:
test_vect_maison.fit_transform(string_test)

ValueError: Iterable over raw text documents expected, string object received.

In [None]:
def preprocess_text(text):
    stemmer = SnowballStemmer("french")
    tokens = word_tokenize(text, language="french")
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(stemmed_tokens)

# Fonction pour traiter une table
def process_table(engine, metadata, table_name):
    table = Table(table_name, metadata, autoload_with=engine)
    with engine.connect() as connection:
        result_set = connection.execute(table.select()).fetchall()
    return result_set

# Préparation du vectorisateur TF-IDF avec la fonction de prétraitement
vectorizers = {table: TfidfVectorizer(stop_words=stop_words, preprocessor=preprocess_text, tokenizer=word_tokenize) for table in tables}
for table in tables:
    docs = [' '.join(str(value) for value in row) for row in data[table]]
    vectorizers[table].fit_transform(docs)