In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Text classification script: TF-IDF features (unigrams + bigrams) feed a
# logistic regression, which is scored on a hold-out split and then used to
# write predictions for a separate test file.

# Input/output locations, gathered here so they can be changed in one place.
# NOTE(review): 'path' looks like a placeholder for the training CSV — confirm.
TRAIN_CSV_PATH = 'path'
TEST_CSV_PATH = 'C:/Users/toto0/Downloads/tweet/data/test.csv'
SUBMISSION_CSV_PATH = "submission.csv"

# === 🔹 1. Load the dataset
df = pd.read_csv(TRAIN_CSV_PATH)
# TfidfVectorizer raises on NaN documents, so missing texts become empty
# strings (rows are kept, which keeps X aligned with y).
X = df['text'].fillna('').astype(str)
y = df['target']

# === 🔹 2. Train / validation split
# 20% of the rows are held out; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# === 🔹 3. TF-IDF vectorisation with bigrams
# The vocabulary is learned from the training split only and then reused
# as-is for the validation and test texts.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# === 🔹 4. Model: logistic regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)
y_pred = model.predict(X_val_vec)

# === 🔹 5. Evaluation on the validation split
print("🔹 Modèle final : Texte brut + LogisticRegression + ngram (1,2)")
print(classification_report(y_val, y_pred))

# === 🔹 6. Predictions on test.csv
test_df = pd.read_csv(TEST_CSV_PATH)
# Same missing-text handling as for training; every test row is kept so the
# submission has one prediction per id.
X_test = test_df['text'].fillna('').astype(str)
X_test_vec = vectorizer.transform(X_test)
# The model used here is the one fitted on the training split above.
test_preds = model.predict(X_test_vec)

# === 🔹 7. Build the submission file
submission = pd.DataFrame({
    "id": test_df["id"],
    "target": test_preds
})
submission.to_csv(SUBMISSION_CSV_PATH, index=False)
print("✅ Fichier 'submission.csv' généré avec succès")


🔹 Modèle final : Texte brut + LogisticRegression + ngram (1,2)
              precision    recall  f1-score   support

           0       0.79      0.88      0.83       874
           1       0.81      0.69      0.75       649

    accuracy                           0.80      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.80      0.80      1523

✅ Fichier 'submission.csv' généré avec succès
