# Projet Big Data : Modélisation

## 1. Préparation des données

In [4]:
import pandas as pd
import numpy as np

# Load the preprocessed corpus; the following steps read its 'clean_text' column.
df = pd.read_csv(filepath_or_buffer="../data/processed_text.csv")

def detect_label(text,
                 formula_keywords=('formula', 'bottle'),
                 breastfeeding_keywords=('breastfeeding', 'breast milk', 'nursing')):
    """Assign a weak label to a text from the keywords it contains.

    Matching is a case-insensitive *substring* search, so a keyword also
    matches inside longer words (e.g. 'formula' matches 'formulas').
    Formula keywords are checked first: a text that contains keywords from
    both groups is labelled 1.

    Parameters
    ----------
    text : object
        The document to label. Anything that is not a ``str`` (for example
        the NaN pandas uses for a missing value) is left unlabelled.
    formula_keywords : iterable of str, optional
        Keywords that mark a text as formula/bottle feeding (label 1).
        The default omits 'infant formula' because 'formula' already
        matches every text containing it.
    breastfeeding_keywords : iterable of str, optional
        Keywords that mark a text as breastfeeding (label 0).

    Returns
    -------
    int or float
        1 for a formula text, 0 for a breastfeeding text, ``np.nan`` when no
        keyword matches or ``text`` is not a string.
    """
    if not isinstance(text, str):
        return np.nan

    lowered = text.lower()
    # Lower-case the keywords too so caller-supplied lists need not be normalised.
    if any(keyword.lower() in lowered for keyword in formula_keywords):
        return 1
    if any(keyword.lower() in lowered for keyword in breastfeeding_keywords):
        return 0
    return np.nan

# Label every document from its keywords, discard the ones detect_label left
# unlabelled (NaN) so that only clearly labelled texts remain, then store the
# surviving labels as integers.
df = (
    df.assign(label=df['clean_text'].apply(detect_label))
    .dropna(subset=['label'])
    .astype({'label': int})
)

## 2. Modélisation

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Features are the cleaned texts; the target is the keyword-derived label.
# NOTE(review): the label is computed from keywords found in 'clean_text'
# itself, and the same text is the only feature, so the classifier can see
# the very tokens that produced the label. Scores from this setup measure how
# well the keyword rule is recovered -- confirm this is the intended evaluation.
X = df['clean_text']
y = df['label']

# Hold out 20% of the rows for evaluation. The recorded run shows a strong
# class imbalance (32928 vs 1205 test rows), so stratify on y to keep the same
# class ratio in the train and test splits. Stratifying needs at least two
# rows of every class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF over unigrams and bigrams (vocabulary capped at 1000 features),
# followed by a logistic-regression classifier.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000, ngram_range=(1, 2))),
    ('clf', LogisticRegression()),
])

# Fit the vectorizer and the classifier on the training split only.
pipeline.fit(X_train, y_train)

# Predict on the held-out split and print per-class precision/recall/F1.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     32928
           1       1.00      0.98      0.99      1205

    accuracy                           1.00     34133
   macro avg       1.00      0.99      0.99     34133
weighted avg       1.00      1.00      1.00     34133

