In [None]:
import pandas as pd
import spacy


# Load the "en_core_web_sm" spaCy pipeline; `nlp` is the callable used below
# to turn raw text into parsed documents.
nlp = spacy.load("en_core_web_sm")


In [None]:
# Read the raw recipes table; the path is relative to the working directory
# the notebook is run from.
data_recipe = pd.read_csv("data/raw/RAW_recipes.csv")

In [None]:
# Preview the raw table and the two text columns used for NLP.
# A bare `.head()` is only rendered when it is the last expression of a cell,
# so the previews are printed explicitly.
print(data_recipe.head())
# Explicit copy: later cells add columns to `data_text`, which must not be a
# (possibly warning-raising) slice of `data_recipe`.
data_text = data_recipe[['name', 'description']].copy()
print(data_text.head())

# Parse the first description. It is selected by position so the lookup does
# not depend on the index still containing the label 0.
doc = nlp(data_text['description'].iloc[0])

print(doc)

# Per-token annotations: surface form, lemma, coarse and fine POS tags,
# dependency label and syntactic head.
print("\nTokens")
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.head.text)

# Sentence segmentation.
print("\n Sentences")
for sent in doc.sents:
    print(sent.text)

# Noun chunks together with their root token.
print("\n Chunks")
for chunk in doc.noun_chunks:
    print(chunk.text, "→", chunk.root.text)

# Text rendering of the dependency arcs (token ← label – head).
# For a graphical view: spacy.displacy.render(doc, style="dep", jupyter=True)
print("\n Dependencies")
for token in doc:
    print(f"{token.text:<10} ←{token.dep_:<10}– {token.head.text}")

# Components of the loaded pipeline, in the order `nlp.pipeline` lists them.
print("\n Pipeline")
for name, component in nlp.pipeline:
    print(name, type(component))


In [None]:
import spacy
import pandas as pd

# Load the model without the NER component
nlp = spacy.load("en_core_web_sm", disable=["ner"])
# Stop-word collection shipped with the pipeline's language defaults.
stopwords = nlp.Defaults.stop_words

# DataFrame holding the text columns; rows with a missing name or description are dropped
data_text = data_recipe[['name', 'description']].dropna()

def extract_features(doc):
    """Build the feature list of one parsed document.

    The result contains, in this order:
      1. the lower-cased lemma of every alphabetic token whose lower-cased
         text is not in the module-level ``stopwords`` collection;
      2. the lower-cased text of every noun chunk of the document.

    Parameters
    ----------
    doc : parsed spaCy document (iterable of tokens exposing ``noun_chunks``).

    Returns
    -------
    list of str
    """
    features = []
    for token in doc:
        if not token.is_alpha:
            continue
        if token.text.lower() in stopwords:
            continue
        features.append(token.lemma_.lower())
    features.extend(chunk.text.lower() for chunk in doc.noun_chunks)
    return features

# Batch processing: stream every description through the pipeline and turn
# each parsed document into its feature list.
descriptions = data_text["description"].tolist()
tokens_list = [
    extract_features(parsed)
    for parsed in nlp.pipe(descriptions, batch_size=50, n_process=4)
]

# Add the tokens column to the original DataFrame; only the rows that are
# present in `data_text` receive a value.
tokens_by_row = pd.Series(tokens_list, index=data_text.index)
data_recipe.loc[data_text.index, "tokens"] = tokens_by_row

# Save under a new file name
output_path = "data_recipe_with_tokens.csv"
data_recipe.to_csv(output_path, index=False)
print("✅ Fichier 'data_recipe_with_tokens.csv' sauvegardé avec succès !")


In [None]:
# Re-read the raw recipes CSV, rebinding `data_recipe` to a fresh frame
# (columns added to the previous `data_recipe` object are not carried over).
data_recipe = pd.read_csv("data/raw/RAW_recipes.csv")

In [None]:
# Preview the raw table and the two text columns used for NLP.
# A bare `.head()` is only rendered when it is the last expression of a cell,
# so the previews are printed explicitly.
print(data_recipe.head())
# Explicit copy: later cells add columns to `data_text`, which must not be a
# (possibly warning-raising) slice of `data_recipe`.
data_text = data_recipe[["name", "description"]].copy()
print(data_text.head())

# Parse the first description. It is selected by position so the lookup does
# not depend on the index still containing the label 0.
doc = nlp(data_text["description"].iloc[0])

print(doc)

# Per-token annotations: surface form, lemma, coarse and fine POS tags,
# dependency label and syntactic head.
print("\nTokens")
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.head.text)

# Sentence segmentation.
print("\n Sentences")
for sent in doc.sents:
    print(sent.text)

# Noun chunks together with their root token.
print("\n Chunks")
for chunk in doc.noun_chunks:
    print(chunk.text, "→", chunk.root.text)

# Text rendering of the dependency arcs (token ← label – head).
# For a graphical view: spacy.displacy.render(doc, style="dep", jupyter=True)
print("\n Dependencies")
for token in doc:
    print(f"{token.text:<10} ←{token.dep_:<10}– {token.head.text}")

# Components of the loaded pipeline, in the order `nlp.pipeline` lists them.
print("\n Pipeline")
for name, component in nlp.pipeline:
    print(name, type(component))

In [None]:
# Small example: parse only the first few descriptions with spaCy.
N_EXAMPLES = 5

# Work on an explicit copy and write with `.at`: the chained form
# `frame[col][i] = value` raises SettingWithCopyWarning and silently does
# nothing when pandas Copy-on-Write is active.
data_text = data_text.copy()
# Object dtype so a cell can hold a parsed document instead of a string.
data_text["NLP_Desc"] = data_text["description"].astype(object)
for i in range(N_EXAMPLES):
    data_text.at[i, "NLP_Desc"] = nlp(data_text.at[i, "description"])

# Independent frame for the TF-IDF steps so later column additions do not
# write into `data_text`.
data_tfidf = data_text.iloc[:N_EXAMPLES].copy()

print(data_tfidf)

In [None]:
# For every parsed description keep only the lemmas of nouns and adjectives.
# `assign` builds the whole column at once and returns a new frame, which
# avoids the chained assignment `frame[col][i] = value` (SettingWithCopyWarning,
# and a no-op under pandas Copy-on-Write) and the hard-coded row labels 0..4.
data_tfidf = data_tfidf.assign(
    NLP_Selection=data_tfidf["NLP_Desc"].apply(
        lambda parsed: [
            token.lemma_ for token in parsed if token.pos_ in ("NOUN", "ADJ")
        ]
    )
)

data_tfidf.head(5)

In [None]:
import numpy as np

# Documents with at least one selected lemma; row k of `counts` corresponds
# to texts[k] (not necessarily to row k of `data_tfidf` if a document was
# dropped here).
texts = [t for t in data_tfidf["NLP_Selection"].tolist() if len(t) > 0]
n_samples = len(texts)

# The vocabulary is not known in advance, so collect every distinct word.
words = set()
for t in texts:
    words.update(t)
n_features = len(words)
# Sort before numbering: set iteration order for strings changes between
# kernel runs, which would otherwise reshuffle the matrix columns each time.
vocabulary = {word: column for column, word in enumerate(sorted(words))}

# Bag-of-words matrix: one row per document, one column per vocabulary word.
counts = np.zeros((n_samples, n_features))

# Fill the matrix by counting every word occurrence of every document.
for k, t in enumerate(texts):
    for w in t:
        counts[k, vocabulary[w]] += 1.0

In [None]:
searched_text = ["spicy", "sausage"]
searched_vector = np.zeros(len(vocabulary), dtype=int)

# Add 1 in the column of every query word; words that are not part of the
# vocabulary are skipped.
for term in searched_text:
    column = vocabulary.get(term)
    if column is not None:
        searched_vector[column] += 1

print(searched_vector)

In [None]:
# Inspect the intermediate bag-of-words objects, one per line of output.
for inspected in (texts, vocabulary, counts, counts.shape):
    print(inspected)

In [None]:
from sklearn.preprocessing import normalize

# IDF

# d: number of documents (rows of the count matrix).
d = counts.shape[0]
# in_doc: for every column, the number of strictly positive entries, i.e. the
# number of documents in which the word appears at least once.
in_doc = np.count_nonzero(counts > 0, axis=0)
# idf: element-wise log of d / in_doc.
idf = np.log(d / in_doc)

# TF

# sum_vec: sum of each row, i.e. the number of counted words per document.
# The small constant keeps the division below away from zero.
sum_vec = counts.sum(axis=1, keepdims=True) + 10**-5

# tf: every count divided by the total number of words of its document.
tf = np.divide(counts, sum_vec)

# tf_idf: product of tf and idf, normalised row-wise for the later steps.
tf_idf = tf * idf
# `normalize` returns a new array; the result has to be rebound, otherwise
# the normalisation is silently discarded.
tf_idf = normalize(tf_idf)

print(tf_idf)

In [None]:
# Weight the query like a document row: term frequency (count divided by the
# total count) multiplied element-wise by the corpus idf.
query_total = searched_vector.sum()
if query_total == 0:
    # 0 / 0 would fill the query with NaN and only fail later, inside the
    # nearest-neighbour search, with an unrelated-looking message.
    raise ValueError(
        "None of the searched words is in the vocabulary; "
        "cannot build a query vector."
    )
normalized_searched_vector = searched_vector / query_total * idf

print(normalized_searched_vector.shape)
print(normalized_searched_vector)

In [None]:
def euclidean(u, v):
    """Return the norm of ``u - v`` as computed by ``np.linalg.norm``.

    For 1-D arrays this is the Euclidean (L2) distance between ``u`` and ``v``.
    """
    difference = u - v
    return np.linalg.norm(difference)

def length_norm(u):
    """Scale the 1-D array ``u`` to unit Euclidean length.

    Parameters
    ----------
    u : numpy.ndarray
        1-D vector.

    Returns
    -------
    numpy.ndarray
        ``u`` divided by its L2 norm. An all-zero vector has no direction and
        is returned as float zeros of the same shape instead of NaN.
    """
    norm = np.sqrt(u.dot(u))
    if norm == 0:
        # 0 / 0 would yield NaN and poison every distance computed from it.
        return np.zeros_like(u, dtype=float)
    return u / norm

def cosine(u, v):
    """Return the cosine distance between ``u`` and ``v``.

    Both vectors are passed through ``length_norm``; the distance is one minus
    the dot product of the two results.
    """
    unit_u = length_norm(u)
    unit_v = length_norm(v)
    similarity = unit_u.dot(unit_v)
    return 1.0 - similarity

In [None]:
from sklearn.neighbors import NearestNeighbors

# scikit-learn expects a 2-D array: one row per query.
X_input = normalized_searched_vector.reshape(1, -1)
print(X_input)

# `kneighbors` raises a ValueError when asked for more neighbours than there
# are fitted samples, which happens as soon as fewer than 5 documents remain
# in `tf_idf`; cap the request accordingly.
n_neighbors = min(5, tf_idf.shape[0])
nn = NearestNeighbors(n_neighbors=n_neighbors, metric=cosine)
nn.fit(tf_idf)
distances, indices = nn.kneighbors(X_input)

# `indices` are row positions in `tf_idf`, with the matching distances
# returned in `distances`.
print(indices)
print(distances)