# Data

## Data Loading

In [None]:
import pandas as pd
from google.colab import drive
# Mount Google Drive so the files stored under "My Drive" can be read.
drive.mount('/content/drive')

# Column layout shared by the three splits; passing `names` makes pandas read
# the files as header-less.
column_names = ['Id', 'sentiment', 'text']

# Locations of the train / dev / test files.
file_pathTr = "/content/drive/My Drive/TP1-NLP/twitter-2013train-A.txt"
file_pathDv = "/content/drive/My Drive/TP1-NLP/twitter-2013dev-A.txt"
file_pathTst = "/content/drive/My Drive/TP1-NLP/twitter-2013test-A.txt"

# Read the tab-separated splits in the order train, dev, test.
dataTr, dataDv, dataTst = (
    pd.read_csv(split_path, sep='\t', names=column_names)
    for split_path in (file_pathTr, file_pathDv, file_pathTst)
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Normalisation & vocab

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem.snowball import EnglishStemmer
import re
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: tokenizer models, the stop-word list and
# WordNet (for the lemmatizer). Recent NLTK releases load the tokenizer models
# from 'punkt_tab' instead of the pickled 'punkt' package, so both are
# requested to keep the cell runnable on old and new NLTK versions alike.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)


# Lazily-filled cache for the resources shared by every preprocess_text call.
# It is populated on first use, i.e. after the NLTK downloads have run.
_PREPROCESS_CACHE = {}


def _get_preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first call."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalise one tweet into a space-separated string of lemmas.

    Steps, in order: drop @mentions, URLs starting with "http" and #hashtags,
    drop every character that is not an ASCII letter or whitespace, lowercase,
    tokenise with NLTK, remove English stop words and lemmatise the remaining
    tokens with WordNet.

    Args:
        text: raw tweet. Non-string values (e.g. the NaN pandas produces for a
            missing field) are treated as an empty tweet.

    Returns:
        The kept lemmas joined by single spaces ('' when nothing is left).
    """
    # A missing text field is not a str; re.sub would raise TypeError on it.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)

    # Lemmatise with WordNetLemmatizer after filtering NLTK's English stop
    # words. Both resources are loaded once and reused across calls instead of
    # being rebuilt for every tweet.
    stop_words, lemmatizer = _get_preprocess_resources()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(lemmatized_tokens)

# Normalise the text of every split (the 'text' column is overwritten).
dataTr['text'] = dataTr['text'].apply(preprocess_text)
dataDv['text'] = dataDv['text'].apply(preprocess_text)
dataTst['text'] = dataTst['text'].apply(preprocess_text)


# The vocabulary is built from the training split only.
corpus = ' '.join(dataTr['text'].astype(str).tolist())
# Split on whitespace: this is the same tokenisation TwitterDataset applies
# when it encodes a tweet (text.split()), so every training token is
# guaranteed to have a vocabulary entry.
words = corpus.split()

word_counts = Counter(words)
# Word ids start at 1; id 0 is kept free because it serves as the padding
# value and as the fallback id for out-of-vocabulary words.
vocab = {word: i + 1 for i, word in enumerate(word_counts)}

vocab_size = len(vocab) + 1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Préparation des Datasets et Dataloaders

In [None]:
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter


class TwitterDataset(Dataset):
    """Dataset exposing each DataFrame row as a (token ids, label id) tensor pair."""

    # Sentiment label -> integer class id used as the target.
    _LABEL_IDS = {'positive': 0, 'negative': 1, 'neutral': 2}

    def __init__(self, dataframe, vocab):
        """Store the source frame and the word -> id mapping.

        Args:
            dataframe: frame providing a 'text' and a 'sentiment' column.
            vocab: dict mapping a word to its integer id.
        """
        self.vocab = vocab
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Encode the row at position ``idx``.

        Returns:
            A pair of ``torch.long`` tensors: the ids of the whitespace-split
            words of 'text' (words absent from the vocabulary get id 0) and
            the scalar class id of 'sentiment'.

        Raises:
            KeyError: if the sentiment is not positive/negative/neutral.
        """
        row = self.dataframe.iloc[idx]
        raw_text = row['text']
        label_id = self._LABEL_IDS[row['sentiment']]

        token_ids = []
        for word in raw_text.split():
            # Unknown words fall back to id 0.
            token_ids.append(self.vocab.get(word, 0))

        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(label_id, dtype=torch.long),
        )

# Build the datasets: one TwitterDataset per split, all sharing the same vocabulary.
train_dataset, dev_dataset, test_dataset = (
    TwitterDataset(split_frame, vocab)
    for split_frame in (dataTr, dataDv, dataTst)
)

from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    """Assemble (sequence, label) samples into one padded batch.

    Args:
        batch: iterable of (1-D id tensor, label) pairs.

    Returns:
        A pair ``(data, labels)``: the sequences right-padded with 0 and
        stacked batch-first, and the labels as a ``torch.long`` tensor.
    """
    sequences, targets = map(list, zip(*batch))
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.tensor(targets, dtype=torch.long)



# Build the dataloaders.
batch_size = 64
# Options common to the three loaders; only the training loader shuffles.
loader_options = dict(batch_size=batch_size, collate_fn=collate_batch)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
dev_loader = DataLoader(dev_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)



## Construction du modèle

In [None]:
class SentimentDNN(nn.Module):
    """Bag-of-embeddings sentiment classifier.

    Token ids are embedded, averaged over the non-padding positions of each
    sequence, passed through one hidden ReLU layer and projected to
    ``output_dim`` class scores.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim=3):
        """
        Args:
            vocab_size: number of rows of the embedding table (largest id + 1).
            embedding_dim: size of each word embedding.
            hidden_dim: width of the hidden layer.
            output_dim: number of classes.
        """
        super(SentimentDNN, self).__init__()
        # Id 0 is the value batches are padded with: its embedding is pinned to
        # zero and receives no gradient, so padding carries no information.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # Kept for callers that want probabilities: model.softmax(model(x)).
        # It is deliberately not applied inside forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Compute class scores for a batch of padded id sequences.

        Args:
            x: integer tensor of shape (batch, seq_len), padded with id 0.

        Returns:
            Float tensor of shape (batch, output_dim) holding raw,
            unnormalised scores (logits). nn.CrossEntropyLoss applies
            log-softmax itself, so feeding it softmax outputs would squash the
            gradients; argmax over logits equals argmax over probabilities.
        """
        # (batch, seq_len, 1) mask that is False on padding positions.
        keep = (x != self.embedding.padding_idx).unsqueeze(-1)
        embedded = self.embedding(x) * keep
        # Average over real tokens only, so a tweet's representation does not
        # depend on how much padding its batch required. clamp avoids 0/0 for
        # sequences made only of padding.
        token_counts = keep.sum(dim=1).clamp(min=1)
        pooled = embedded.sum(dim=1) / token_counts
        hidden = self.relu(self.fc1(pooled))
        return self.fc2(hidden)

# Build the DNN model.
# Seed PyTorch's global RNG before any weight is drawn so that the parameter
# initialisation is repeatable from one "Restart & Run All" to the next.
torch.manual_seed(42)

vocab_size = len(vocab) + 1  # word ids start at 1, id 0 is reserved
embedding_dim = 120
hidden_dim = 136
output_dim = 3

model = SentimentDNN(vocab_size, embedding_dim, hidden_dim, output_dim)


## Entraînement et Évaluation du modèle

In [None]:
# NOTE: nn.CrossEntropyLoss applies log-softmax internally and expects raw,
# unnormalised class scores as its first argument.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
num_epochs = 25

# Train the model.
# Enter training mode explicitly: this cell ends with model.eval(), so
# re-running it would otherwise train a model still in evaluation mode.
model.train()
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        # Batches are already padded by the loader's collate function, so no
        # additional padding is applied here.
        optimizer.zero_grad()
        output = model(inputs)
        loss = loss_function(output, labels)
        loss.backward()
        optimizer.step()

model.eval()
def predict(model, data_loader):
    """Return the predicted class id of every sample served by ``data_loader``.

    The model is switched to evaluation mode and gradients are disabled.

    Args:
        model: module mapping a (batch, seq_len) id tensor to per-class scores
            of shape (batch, n_classes).
        data_loader: iterable of (inputs, labels) batches; labels are ignored.

    Returns:
        A flat list of ints, one per sample, in iteration order.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for inputs, _ in data_loader:
            # Collated batches are already a padded tensor; padding is only
            # needed when a loader yields a list of variable-length sequences.
            if not isinstance(inputs, torch.Tensor):
                inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True)
            outputs = model(inputs)
            predictions.extend(outputs.argmax(dim=1).tolist())
    return predictions

# Gold labels of the dev and test splits.
Y_dev = dataDv['sentiment']
Y_test = dataTst['sentiment']

# Predicted class ids for both splits.
Y_dev_pred = predict(model, dev_loader)
Y_test_pred = predict(model, test_loader)

# Map class ids back to label strings (inverse of the encoding used by the dataset).
idx_to_sentiment = {0: 'positive', 1: 'negative', 2: 'neutral'}
Y_dev_pred_labels = [idx_to_sentiment[idx] for idx in Y_dev_pred]
Y_test_pred_labels = [idx_to_sentiment[idx] for idx in Y_test_pred]

from sklearn.metrics import classification_report

# Row order of the report. This must be passed as `labels`, not `target_names`:
# `target_names` only renames the rows positionally, and with string labels the
# rows are in sorted order (negative, neutral, positive), so passing this list
# as `target_names` attaches each name to another class's metrics.
report_labels = ['positive', 'negative', 'neutral']

print("Rapport de classification pour les données de développement:")
print(classification_report(Y_dev, Y_dev_pred_labels, labels=report_labels))

print("Rapport de classification pour les données de test:")
print(classification_report(Y_test, Y_test_pred_labels, labels=report_labels))


Rapport de classification pour les données de développement:
              precision    recall  f1-score   support

    positive       0.58      0.32      0.41       340
    negative       0.61      0.72      0.66       739
     neutral       0.59      0.61      0.60       575

    accuracy                           0.60      1654
   macro avg       0.60      0.55      0.56      1654
weighted avg       0.60      0.60      0.59      1654

Rapport de classification pour les données de test:
              precision    recall  f1-score   support

    positive       0.51      0.27      0.35       559
    negative       0.57      0.80      0.67      1513
     neutral       0.71      0.55      0.62      1475

    accuracy                           0.61      3547
   macro avg       0.60      0.54      0.55      3547
weighted avg       0.62      0.61      0.60      3547

