In [5]:
import pandas as pd
import spacy


# Load spaCy's small English pipeline; it is used below to parse recipe descriptions.
nlp = spacy.load(name="en_core_web_sm")


In [6]:
# Read the raw recipes table; the path is relative to the current working directory.
data_recipe = pd.read_csv(filepath_or_buffer="data/raw/RAW_recipes.csv")

In [7]:
# Preview the raw table and the two text columns. By default a notebook only
# renders the LAST expression of a cell, so mid-cell previews must go through
# display() or their output is silently discarded.
display(data_recipe.head())
data_text = data_recipe[['name', 'description']]
display(data_text.head())

# Parse the first description. `.iloc[0]` selects by position, so it keeps
# working even when the frame's index does not start at label 0.
doc = nlp(data_text['description'].iloc[0])

print(doc)

# Per-token annotations: text, lemma, coarse POS, fine-grained tag,
# dependency label and the text of the syntactic head.
print("\nTokens")
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.head.text)

# Sentence segmentation.
print("\n Sentences")
for sent in doc.sents:
    print(sent.text)

# Noun chunks, each shown with its root token.
print("\n Chunks")
for chunk in doc.noun_chunks:
    print(chunk.text, "→", chunk.root.text)

# Dependency arcs as aligned text rows (token ← label – head).
# Graphical alternative: spacy.displacy.render(doc, style="dep", jupyter=True)
print("\n Dependencies")
for token in doc:
    print(f"{token.text:<10} ←{token.dep_:<10}– {token.head.text}")

# Components of the loaded pipeline, in the order nlp.pipeline lists them.
print("\n Pipeline")
for name, component in nlp.pipeline:
    print(name, type(component))


autumn is my favorite time of year to cook! this recipe 
can be prepared either spicy or sweet, your choice!
two of my posted mexican-inspired seasoning mix recipes are offered as suggestions.

Tokens
autumn autumn PROPN NNP nsubj is
is be AUX VBZ ROOT is
my my PRON PRP$ poss time
favorite favorite ADJ JJ amod time
time time NOUN NN attr is
of of ADP IN prep time
year year NOUN NN pobj of
to to PART TO aux cook
cook cook VERB VB relcl time
! ! PUNCT . punct is
this this DET DT det recipe
recipe recipe NOUN NN nsubjpass prepared

 
 SPACE _SP dep recipe
can can AUX MD aux prepared
be be AUX VB auxpass prepared
prepared prepare VERB VBN ROOT prepared
either either CCONJ CC preconj spicy
spicy spicy ADJ JJ dep prepared
or or CCONJ CC cc spicy
sweet sweet ADJ JJ conj spicy
, , PUNCT , punct choice
your your PRON PRP$ poss choice
choice choice NOUN NN npadvmod prepared
! ! PUNCT . punct prepared

 
 SPACE _SP dep !
two two NUM CD nsubjpass offered
of of ADP IN prep two
my my PRON PRP$ poss 

In [None]:

import spacy
import pandas as pd

# Load the small English pipeline with the "ner" component disabled.
nlp = spacy.load(name="en_core_web_sm", disable=["ner"])
# Stop-word collection exposed by the pipeline's language defaults.
stopwords = nlp.Defaults.stop_words

# Text frame: keep the name/description columns and drop every row that has a
# missing value in either of them. `data_recipe` must already be loaded.
data_text = data_recipe.loc[:, ['name', 'description']].dropna(axis=0, how='any')

# Feature extraction on an already-parsed document (no parsing happens here).
def extract_features(doc):
    """Summarize a parsed document as lemmas, noun chunks and POS counts.

    Args:
        doc: A document produced by the spaCy pipeline. It must support token
            iteration, ``noun_chunks``, ``count_by`` and ``vocab`` lookups.

    Returns:
        dict with three keys:
            "lemmas": lower-cased lemmas of the alphabetic tokens whose
                lower-cased text is not in the module-level ``stopwords``.
            "noun_chunks": lower-cased text of every noun chunk.
            "pos_counts": mapping from the vocab text of each counted POS
                attribute id to the number of tokens carrying it.
    """
    kept_lemmas = []
    for tok in doc:
        if not tok.is_alpha:
            continue
        if tok.text.lower() in stopwords:
            continue
        kept_lemmas.append(tok.lemma_.lower())

    chunk_texts = [span.text.lower() for span in doc.noun_chunks]

    # count_by returns {attribute id: count}; translate the ids to their text.
    pos_by_name = {}
    for attr_id, n_tokens in doc.count_by(spacy.attrs.POS).items():
        pos_by_name[doc.vocab[attr_id].text] = n_tokens

    return {
        "lemmas": kept_lemmas,
        "noun_chunks": chunk_texts,
        "pos_counts": pos_by_name,
    }

# Batch processing: nlp.pipe consumes the texts in batches of 50 using
# 4 worker processes, and each resulting doc is reduced to its feature dict.
texts = data_text["description"].to_list()
features = [
    extract_features(parsed)
    for parsed in nlp.pipe(texts, batch_size=50, n_process=4)  # n_process = number of CPUs
]

# Assembly: attach the feature dicts, then a flat token list per row made of
# the lemmas followed by the noun chunks.
data_text["features"] = features
data_text["tokens"] = [
    row_features["lemmas"] + row_features["noun_chunks"]
    for row_features in features
]

# Export: write only the name, description and tokens columns, without the index.
output_df = data_text.loc[:, ['name', 'description', 'tokens']]
output_df.to_csv("data_recipe_features.csv", index=False)
print("✅ Fichier 'data_recipe_features.csv' sauvegardé avec succès !")


KeyboardInterrupt: 