<a href="https://colab.research.google.com/github/KlausKUEKOU/KlausKUEKOU/blob/main/Textclassification_test_tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import os

# Make sure the required NLTK resources are available.
# 'punkt_tab' is fetched in addition to 'punkt' because newer NLTK releases
# look up the tokenizer tables under that name.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.corpus import stopwords

# English stop words used by the tokenization helpers
stop_words = set(stopwords.words('english'))

# Data handling
class DataHandler:
    """Load the reviews CSV and expose it as a ``text`` / ``label`` dataset."""

    def __init__(self, data_path):
        """Remember the CSV location.

        Args:
            data_path: Path of the CSV file to read.

        Raises:
            FileNotFoundError: If nothing exists at ``data_path``.
        """
        if not os.path.exists(data_path):
            raise FileNotFoundError(f"Le fichier {data_path} est introuvable. Veuillez le télécharger.")
        self.data_path = data_path

    def load_data(self):
        """Read the CSV and return a DataFrame with ``text`` and ``label`` columns.

        Only the ``Text`` and ``Score`` columns are kept. Rows missing either
        value are dropped, then each score is mapped to a class id with
        ``map_score_to_category``.

        Returns:
            A DataFrame with columns ``text`` and ``label`` and a fresh
            0..n-1 index.
        """
        data = pd.read_csv(self.data_path)
        data = data[['Text', 'Score']]  # Keep only the relevant columns
        data = data.rename(columns={'Text': 'text', 'Score': 'label'})
        # A missing score is NaN, which fails both comparisons in
        # map_score_to_category and would be labelled 2 ("satisfied");
        # a missing text cannot be tokenized. Drop such rows up front.
        data = data.dropna(subset=['text', 'label']).reset_index(drop=True)
        data['label'] = data['label'].apply(self.map_score_to_category)
        return data

    @staticmethod
    def map_score_to_category(score):
        """Map a review score to a class id.

        Returns:
            0 (dissatisfied) for scores <= 2, 1 (neutral) for a score of 3,
            2 (satisfied) otherwise.
        """
        if score <= 2:
            return 0  # Dissatisfied
        elif score == 3:
            return 1  # Neutral
        else:
            return 2  # Satisfied

# Load the dataset through DataHandler
data_path = "./Reviews.csv"
data_handler = DataHandler(data_path=data_path)
data = data_handler.load_data()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data.loc[:, 'text'],
    data.loc[:, 'label'],
    random_state=42,
    test_size=0.2,
)

# Simple PyTorch model
class TextClassifier(nn.Module):
    """Feed-forward classifier with one hidden layer: Linear -> ReLU -> Linear.

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it probabilities would apply
    softmax twice and flatten the gradients. Argmax predictions are the same
    for logits and probabilities.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Build the layers.

        Args:
            input_dim: Number of input features per sample.
            hidden_dim: Width of the hidden layer.
            output_dim: Number of classes.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # Kept for callers that need probabilities: model.softmax(model(x)).
        # It is deliberately not applied inside forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the logits for a batch ``x`` of rows with ``input_dim`` features."""
        x = self.fc1(x)
        x = self.relu(x)
        return self.fc2(x)

# Fonction de tokenization

def tokenize_words(texts):
    """Split each text with ``nltk.word_tokenize`` and drop stop words.

    A token is dropped when its lowercase form is in ``stop_words``.
    Returns one list of kept tokens per input text.
    """
    tokenized = []
    for text in texts:
        kept = [token for token in nltk.word_tokenize(text) if token.lower() not in stop_words]
        tokenized.append(kept)
    return tokenized

def tokenize_subwords(texts, tokenizer):
    """Split each text with ``tokenizer.tokenize`` and drop stop words.

    Args:
        texts: Iterable of strings.
        tokenizer: Object exposing ``tokenize(text)`` that returns string pieces.

    Returns:
        One list of kept pieces per input text; a piece is dropped when its
        lowercase form is in ``stop_words``.
    """
    tokenized = []
    for text in texts:
        pieces = tokenizer.tokenize(text)
        tokenized.append([piece for piece in pieces if piece.lower() not in stop_words])
    return tokenized

def tokenize_characters(texts):
    return [[char for char in text if char.lower() not in stop_words] for text in texts]

def tokenize_sentences(texts):
    """Split each text into sentences and keep those free of stop words.

    A sentence is kept only when none of its ``nltk.word_tokenize`` tokens
    is a stop word (compared in lowercase).
    NOTE(review): this drops every sentence that contains at least one stop
    word - confirm that this is the intended filter.
    """
    def _has_stop_word(sentence):
        return any(word.lower() in stop_words for word in nltk.word_tokenize(sentence))

    kept_per_text = []
    for text in texts:
        sentences = nltk.sent_tokenize(text)
        kept_per_text.append([s for s in sentences if not _has_stop_word(s)])
    return kept_per_text

def tokenize_spaces(texts):
    """Split each text on whitespace and drop stop words.

    A chunk is dropped when its lowercase form is in ``stop_words``.
    Returns one list of kept chunks per input text.
    """
    tokenized = []
    for text in texts:
        chunks = text.split()
        tokenized.append([chunk for chunk in chunks if chunk.lower() not in stop_words])
    return tokenized

# Part 1: tokenization and vectorization
def tokenize_and_vectorize(train_texts, test_texts, tokenize_func):
    """Tokenize both splits with ``tokenize_func`` and build count features.

    Args:
        train_texts: Training texts; the vocabulary is fitted on these only.
        test_texts: Test texts, transformed with the fitted vocabulary.
        tokenize_func: Callable taking the texts and returning one list of
            string tokens per text.

    Returns:
        ``(train_features, test_features, vectorizer)``: the two count
        matrices produced by the vectorizer and the fitted ``CountVectorizer``.
    """
    train_tokens = [" ".join(tokens) for tokens in tokenize_func(train_texts)]
    test_tokens = [" ".join(tokens) for tokens in tokenize_func(test_texts)]

    # Split the re-joined documents on whitespace only. The default pattern
    # keeps only runs of two or more word characters, which throws away every
    # single-character token (the character-level method ends up with an empty
    # vocabulary) and strips punctuation and markers such as the "##" prefix
    # of word pieces, hiding the differences between the tokenizers.
    vectorizer = CountVectorizer(token_pattern=r"(?u)\S+")
    train_features = vectorizer.fit_transform(train_tokens)
    test_features = vectorizer.transform(test_tokens)

    return train_features, test_features, vectorizer

# Part 2: conversion to tensors
def convert_to_tensors(train_features, test_features, train_labels, test_labels):
    """Turn feature matrices and label series into PyTorch tensors.

    Args:
        train_features, test_features: Matrices exposing ``toarray()``;
            they are densified and stored as float32.
        train_labels, test_labels: Objects exposing ``.values``; stored as
            int64 (``torch.long``).

    Returns:
        ``(train_features, test_features, train_labels, test_labels)`` as tensors.
    """
    def _dense_float(matrix):
        return torch.tensor(matrix.toarray(), dtype=torch.float32)

    def _long_labels(labels):
        return torch.tensor(labels.values, dtype=torch.long)

    return (
        _dense_float(train_features),
        _dense_float(test_features),
        _long_labels(train_labels),
        _long_labels(test_labels),
    )

# Part 3: model training
def train_model(train_features, train_labels, input_dim, hidden_dim, output_dim):
    """Train a fresh ``TextClassifier`` and return it.

    Runs 10 epochs of Adam (lr=0.001) with cross-entropy loss over shuffled
    mini-batches of 32 samples, printing the summed batch loss after each
    epoch.

    Args:
        train_features: Feature tensor, one row per sample.
        train_labels: Class-id tensor aligned with ``train_features``.
        input_dim, hidden_dim, output_dim: Layer sizes passed to ``TextClassifier``.

    Returns:
        The trained model.
    """
    loader = DataLoader(
        TensorDataset(train_features, train_labels),
        batch_size=32,
        shuffle=True,
    )

    model = TextClassifier(input_dim, hidden_dim, output_dim)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch_index in range(10):
        model.train()
        epoch_loss = 0
        for features, labels in loader:
            optimizer.zero_grad()
            batch_loss = loss_fn(model(features), labels)
            batch_loss.backward()
            optimizer.step()
            epoch_loss += batch_loss.item()
        print(f"Epoch {epoch_index + 1}, Loss: {epoch_loss:.4f}")

    return model

# Part 4: model evaluation
def evaluate_model(model, test_features, test_labels):
    """Score ``model`` on the test split.

    Predictions are the argmax over dimension 1 of the model output.

    Returns:
        A dict with keys ``accuracy``, ``precision``, ``recall`` and
        ``f1_score``; the last three use weighted averaging.
    """
    model.eval()
    with torch.no_grad():
        predicted = model(test_features).argmax(dim=1)

    report = {"accuracy": accuracy_score(test_labels, predicted)}
    weighted_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    for key, scorer in weighted_scorers:
        report[key] = scorer(test_labels, predicted, average='weighted')
    return report

# Tokenization methods under test, keyed by the display name used in the reports
_bert_tokenizer = None


def _tokenize_bert_subwords(texts):
    """Tokenize ``texts`` into BERT word pieces, loading the tokenizer once.

    The tokenizer is created on first use and then reused, instead of being
    reloaded for every call (once for the train split, once for the test split).
    """
    global _bert_tokenizer
    if _bert_tokenizer is None:
        _bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    return tokenize_subwords(texts, _bert_tokenizer)


methods = {
    "Mot par mot": tokenize_words,
    "Sous-mots (BERT)": _tokenize_bert_subwords,
    "Caractères": tokenize_characters,
    "Phrases": tokenize_sentences,
    "Espaces": tokenize_spaces,
}

# Performance comparison: run the whole pipeline once per tokenization method
results = {}
for method, func in methods.items():
    print(f"\n--- Méthode : {method} ---")

    # Tokenization and vectorization
    train_features, test_features, vectorizer = tokenize_and_vectorize(train_texts, test_texts, func)
    print("Nombre de caractéristiques vectorisées :", len(vectorizer.get_feature_names_out()))

    # Conversion to tensors
    train_features, test_features, train_labels_tensor, test_labels_tensor = convert_to_tensors(
        train_features, test_features, train_labels, test_labels
    )

    # Model training
    input_dim = train_features.shape[1]
    hidden_dim = 100
    # Size the output layer from the largest class id rather than the number
    # of distinct labels: if a lower class id is absent from the training
    # split, the count is too small and the loss rejects the highest label.
    output_dim = int(train_labels.max()) + 1
    model = train_model(train_features, train_labels_tensor, input_dim, hidden_dim, output_dim)

    # Evaluation
    report = evaluate_model(model, test_features, test_labels_tensor)
    print("Rapport de classification :", report)

    results[method] = report

# Save the results to a text file.
# The encoding is set explicitly because the method names contain accented
# characters; the platform default encoding may not be able to write them.
with open("results.txt", "w", encoding="utf-8") as f:
    for method, metrics in results.items():
        f.write(f"\n--- Méthode : {method} ---\n")
        for metric, value in metrics.items():
            f.write(f"{metric}: {value:.4f}\n")

In [1]:
import nltk
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import os


In [2]:
# Make sure the required NLTK resources are available
for _resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_resource)
from nltk.corpus import stopwords

# English stop words used by the tokenization helpers
stop_words = {*stopwords.words('english')}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# Data handling
class DataHandler:
    """Load the reviews CSV and expose it as a ``text`` / ``label`` dataset."""

    def __init__(self, data_path):
        """Remember the CSV location.

        Args:
            data_path: Path of the CSV file to read.

        Raises:
            FileNotFoundError: If nothing exists at ``data_path``.
        """
        if not os.path.exists(data_path):
            raise FileNotFoundError(f"Le fichier {data_path} est introuvable. Veuillez le télécharger.")
        self.data_path = data_path

    def load_data(self):
        """Read the CSV and return a DataFrame with ``text`` and ``label`` columns.

        Only the ``Text`` and ``Score`` columns are kept. Rows missing either
        value are dropped, then each score is mapped to a class id with
        ``map_score_to_category``.

        Returns:
            A DataFrame with columns ``text`` and ``label`` and a fresh
            0..n-1 index.
        """
        data = pd.read_csv(self.data_path)
        data = data[['Text', 'Score']]  # Keep only the relevant columns
        data = data.rename(columns={'Text': 'text', 'Score': 'label'})
        # A missing score is NaN, which fails both comparisons in
        # map_score_to_category and would be labelled 2 ("satisfied");
        # a missing text cannot be tokenized. Drop such rows up front.
        data = data.dropna(subset=['text', 'label']).reset_index(drop=True)
        data['label'] = data['label'].apply(self.map_score_to_category)
        return data

    @staticmethod
    def map_score_to_category(score):
        """Map a review score to a class id.

        Returns:
            0 (dissatisfied) for scores <= 2, 1 (neutral) for a score of 3,
            2 (satisfied) otherwise.
        """
        if score <= 2:
            return 0  # Dissatisfied
        elif score == 3:
            return 1  # Neutral
        else:
            return 2  # Satisfied

In [4]:
# Load the dataset through DataHandler
data_path = "/content/drive/MyDrive/Reviews.csv"
data_handler = DataHandler(data_path=data_path)
data = data_handler.load_data()


In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data.loc[:, 'text'],
    data.loc[:, 'label'],
    random_state=42,
    test_size=0.2,
)


In [5]:
# Simple PyTorch model
class TextClassifier(nn.Module):
    """Feed-forward classifier with one hidden layer: Linear -> ReLU -> Linear.

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it probabilities would apply
    softmax twice and flatten the gradients. Argmax predictions are the same
    for logits and probabilities.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Build the layers.

        Args:
            input_dim: Number of input features per sample.
            hidden_dim: Width of the hidden layer.
            output_dim: Number of classes.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # Kept for callers that need probabilities: model.softmax(model(x)).
        # It is deliberately not applied inside forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the logits for a batch ``x`` of rows with ``input_dim`` features."""
        x = self.fc1(x)
        x = self.relu(x)
        return self.fc2(x)

In [17]:
# Fonction de tokenization

def tokenize_words(texts):
    """Split each text with ``nltk.word_tokenize`` and drop stop words.

    A token is dropped when its lowercase form is in ``stop_words``.
    Returns one list of kept tokens per input text.
    """
    tokenized = []
    for text in texts:
        kept = [token for token in nltk.word_tokenize(text) if token.lower() not in stop_words]
        tokenized.append(kept)
    return tokenized

def tokenize_subwords(texts, tokenizer):
    """Split each text with ``tokenizer.tokenize`` and drop stop words.

    Args:
        texts: Iterable of strings.
        tokenizer: Object exposing ``tokenize(text)`` that returns string pieces.

    Returns:
        One list of kept pieces per input text; a piece is dropped when its
        lowercase form is in ``stop_words``.
    """
    tokenized = []
    for text in texts:
        pieces = tokenizer.tokenize(text)
        tokenized.append([piece for piece in pieces if piece.lower() not in stop_words])
    return tokenized

def tokenize_characters(texts):
    return [[char for char in text ] for text in texts]

def tokenize_sentences(texts):
    """Split each text into sentences and keep those free of stop words.

    A sentence is kept only when none of its ``nltk.word_tokenize`` tokens
    is a stop word (compared in lowercase).
    NOTE(review): this drops every sentence that contains at least one stop
    word - confirm that this is the intended filter.
    """
    def _has_stop_word(sentence):
        return any(word.lower() in stop_words for word in nltk.word_tokenize(sentence))

    kept_per_text = []
    for text in texts:
        sentences = nltk.sent_tokenize(text)
        kept_per_text.append([s for s in sentences if not _has_stop_word(s)])
    return kept_per_text

def tokenize_spaces(texts):
    """Split each text on whitespace and drop stop words.

    A chunk is dropped when its lowercase form is in ``stop_words``.
    Returns one list of kept chunks per input text.
    """
    tokenized = []
    for text in texts:
        chunks = text.split()
        tokenized.append([chunk for chunk in chunks if chunk.lower() not in stop_words])
    return tokenized


In [7]:
# Part 1: tokenization and vectorization
def tokenize_and_vectorize(train_texts, test_texts, tokenize_func):
    """Tokenize both splits with ``tokenize_func`` and build count features.

    Args:
        train_texts: Training texts; the vocabulary is fitted on these only.
        test_texts: Test texts, transformed with the fitted vocabulary.
        tokenize_func: Callable taking the texts and returning one list of
            string tokens per text.

    Returns:
        ``(train_features, test_features, vectorizer)``: the two count
        matrices produced by the vectorizer and the fitted ``CountVectorizer``.
    """
    train_tokens = [" ".join(tokens) for tokens in tokenize_func(train_texts)]
    test_tokens = [" ".join(tokens) for tokens in tokenize_func(test_texts)]

    # Split the re-joined documents on whitespace only. The default pattern
    # keeps only runs of two or more word characters, which throws away every
    # single-character token (the character-level method ends up with an empty
    # vocabulary) and strips punctuation and markers such as the "##" prefix
    # of word pieces, hiding the differences between the tokenizers.
    vectorizer = CountVectorizer(token_pattern=r"(?u)\S+")
    train_features = vectorizer.fit_transform(train_tokens)
    test_features = vectorizer.transform(test_tokens)

    return train_features, test_features, vectorizer

In [8]:
# Part 2: conversion to tensors
def convert_to_tensors(train_features, test_features, train_labels, test_labels):
    """Turn feature matrices and label series into PyTorch tensors.

    Args:
        train_features, test_features: Matrices exposing ``toarray()``;
            they are densified and stored as float32.
        train_labels, test_labels: Objects exposing ``.values``; stored as
            int64 (``torch.long``).

    Returns:
        ``(train_features, test_features, train_labels, test_labels)`` as tensors.
    """
    def _dense_float(matrix):
        return torch.tensor(matrix.toarray(), dtype=torch.float32)

    def _long_labels(labels):
        return torch.tensor(labels.values, dtype=torch.long)

    return (
        _dense_float(train_features),
        _dense_float(test_features),
        _long_labels(train_labels),
        _long_labels(test_labels),
    )


In [9]:
# Part 3: model training
def train_model(train_features, train_labels, input_dim, hidden_dim, output_dim):
    """Train a fresh ``TextClassifier`` and return it.

    Runs 10 epochs of Adam (lr=0.001) with cross-entropy loss over shuffled
    mini-batches of 32 samples, printing the summed batch loss after each
    epoch.

    Args:
        train_features: Feature tensor, one row per sample.
        train_labels: Class-id tensor aligned with ``train_features``.
        input_dim, hidden_dim, output_dim: Layer sizes passed to ``TextClassifier``.

    Returns:
        The trained model.
    """
    loader = DataLoader(
        TensorDataset(train_features, train_labels),
        batch_size=32,
        shuffle=True,
    )

    model = TextClassifier(input_dim, hidden_dim, output_dim)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch_index in range(10):
        model.train()
        epoch_loss = 0
        for features, labels in loader:
            optimizer.zero_grad()
            batch_loss = loss_fn(model(features), labels)
            batch_loss.backward()
            optimizer.step()
            epoch_loss += batch_loss.item()
        print(f"Epoch {epoch_index + 1}, Loss: {epoch_loss:.4f}")

    return model


In [10]:
# Part 4: model evaluation
def evaluate_model(model, test_features, test_labels):
    """Score ``model`` on the test split.

    Predictions are the argmax over dimension 1 of the model output.

    Returns:
        A dict with keys ``accuracy``, ``precision``, ``recall`` and
        ``f1_score``; the last three use weighted averaging.
    """
    model.eval()
    with torch.no_grad():
        predicted = model(test_features).argmax(dim=1)

    report = {"accuracy": accuracy_score(test_labels, predicted)}
    weighted_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    for key, scorer in weighted_scorers:
        report[key] = scorer(test_labels, predicted, average='weighted')
    return report


In [11]:
# Tokenization methods under test, keyed by the display name used in the reports
_bert_tokenizer = None


def _tokenize_bert_subwords(texts):
    """Tokenize ``texts`` into BERT word pieces, loading the tokenizer once.

    The tokenizer is created on first use and then reused, instead of being
    reloaded for every call (once for the train split, once for the test split).
    """
    global _bert_tokenizer
    if _bert_tokenizer is None:
        _bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    return tokenize_subwords(texts, _bert_tokenizer)


methods = {
    "Mot par mot": tokenize_words,
    "Sous-mots (BERT)": _tokenize_bert_subwords,
    "Caractères": tokenize_characters,
    "Phrases": tokenize_sentences,
    "Espaces": tokenize_spaces,
}


In [None]:
# Performance comparison: run the whole pipeline once per tokenization method.
# Vectorization, tensor conversion, training and evaluation must share one
# loop iteration. When each step loops over the methods in its own cell, only
# the last method's features survive the first loop, the conversion step is
# then applied to tensors that were already converted, and the same report is
# stored under every method name.
results = {}
for method, func in methods.items():
    print(f"\n--- Méthode : {method} ---")

    # Tokenization and vectorization
    train_features, test_features, vectorizer = tokenize_and_vectorize(train_texts, test_texts, func)
    print("Nombre de caractéristiques vectorisées :", len(vectorizer.get_feature_names_out()))

    # Conversion to tensors
    train_features, test_features, train_labels_tensor, test_labels_tensor = convert_to_tensors(
        train_features, test_features, train_labels, test_labels
    )

    # Model training
    input_dim = train_features.shape[1]
    hidden_dim = 100
    # Size the output layer from the largest class id rather than the number
    # of distinct labels: if a lower class id is absent from the training
    # split, the count is too small and the loss rejects the highest label.
    output_dim = int(train_labels.max()) + 1
    model = train_model(train_features, train_labels_tensor, input_dim, hidden_dim, output_dim)

    # Evaluation
    report = evaluate_model(model, test_features, test_labels_tensor)
    print("Rapport de classification :", report)

    results[method] = report

In [None]:

# Save the results to a text file.
# The encoding is set explicitly because the method names contain accented
# characters; the platform default encoding may not be able to write them.
with open("results.txt", "w", encoding="utf-8") as f:
    for method, metrics in results.items():
        f.write(f"\n--- Méthode : {method} ---\n")
        for metric, value in metrics.items():
            f.write(f"{metric}: {value:.4f}\n")