In [None]:
import pandas as pd
import spacy


# Load spaCy's small English pipeline once; the cells below reuse it as `nlp`.
nlp = spacy.load(name="en_core_web_sm")


In [None]:
# Read the raw recipes table from disk (path is relative to the working directory).
data_recipe = pd.read_csv(filepath_or_buffer="data/raw/RAW_recipes.csv")

In [None]:
# Preview the raw table. Only the last expression of a cell is rendered
# automatically, so mid-cell previews must go through display() (provided by
# the IPython kernel) or they are silently discarded.
display(data_recipe.head())

# Keep only the free-text columns for the NLP exploration.
data_text = data_recipe[['name', 'description']]
display(data_text.head())

# Parse the first available description. Taking the first non-null value by
# position avoids depending on index label 0 existing and on that particular
# row holding a string rather than NaN.
sample_description = data_text['description'].dropna().iloc[0]
doc = nlp(sample_description)

print(doc)

# Per-token attributes: surface form, lemma, coarse and fine POS tags,
# dependency label and the text of the syntactic head.
print("\nTokens")
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.head.text)

print("\n Sentences")
for sent in doc.sents:
    print(sent.text)

# Noun chunks together with the root token of each chunk.
print("\n Chunks")
for chunk in doc.noun_chunks:
    print(chunk.text, "→", chunk.root.text)

# Text rendering of the dependency arcs (token ← label – head). For a
# graphical tree, spacy.displacy.render(doc, style="dep", jupyter=True) can be
# used instead.
print("\n Dependencies")
for token in doc:
    print(f"{token.text:<10} ←{token.dep_:<10}– {token.head.text}")

# List the components of the loaded pipeline with their types.
print("\n Pipeline")
for name, component in nlp.pipeline:
    print(name, type(component))


In [None]:
import spacy
import pandas as pd

# Reload the English pipeline with the named-entity recognizer disabled.
nlp = spacy.load(name="en_core_web_sm", disable=["ner"])
# Stop-word collection taken from the pipeline's language defaults.
stopwords = nlp.Defaults.stop_words

# Text columns only, keeping the rows where neither name nor description is missing.
data_text = data_recipe.loc[:, ['name', 'description']].dropna(how="any")

def extract_features(doc):
    """Build the feature list for one parsed document.

    The result contains, in order:
      1. the lower-cased lemma of every token that is alphabetic and whose
         lower-cased text is not in the module-level ``stopwords`` collection;
      2. the lower-cased text of every noun chunk of ``doc``.

    Parameters
    ----------
    doc
        An iterable of tokens exposing ``is_alpha``, ``text`` and ``lemma_``,
        with a ``noun_chunks`` iterable whose items expose ``text``.

    Returns
    -------
    list of str
        Lemmas followed by noun-chunk texts.
    """
    features = []
    for token in doc:
        # Skip anything that is not purely alphabetic (numbers, punctuation...).
        if not token.is_alpha:
            continue
        # The stop-word test is made on the surface form, not on the lemma.
        if token.text.lower() in stopwords:
            continue
        features.append(token.lemma_.lower())

    features.extend(chunk.text.lower() for chunk in doc.noun_chunks)
    return features

# Batch processing: stream every description through the pipeline and build
# one feature list per description. The comprehension keeps its loop variable
# local, so the `doc` inspected in the exploration cell is not overwritten.
tokens_list = [
    extract_features(parsed_doc)
    for parsed_doc in nlp.pipe(data_text["description"].tolist(), batch_size=50, n_process=4)
]

# The assignment below pairs feature lists with rows by position; fail with a
# clear message if the counts ever differ.
assert len(tokens_list) == len(data_text), "expected one feature list per description"

# Add the tokens column to the original DataFrame, aligned on data_text's
# index; only the rows present in data_text are written.
data_recipe.loc[data_text.index, "tokens"] = pd.Series(tokens_list, index=data_text.index)

# Save under a new name.
# NOTE(review): CSV has no list type, so each token list is written as its
# string representation and will come back as a string when the file is read.
data_recipe.to_csv("data_recipe_with_tokens.csv", index=False)
print("✅ Fichier 'data_recipe_with_tokens.csv' sauvegardé avec succès !")
