In [45]:
import pandas as pd

#Data

##Chargement

In [46]:
from google.colab import drive
drive.mount('/content/drive')

# The three split files are tab-separated and are read with the same explicit
# column names.
column_names = ['Id', 'sentiment', 'text']


def _read_split(path):
    """Read one tab-separated split file into a DataFrame labelled with `column_names`."""
    return pd.read_csv(path, sep='\t', names=column_names)


# Training split
file_pathTr = "/content/drive/My Drive/TP1-NLP/twitter-2013train-A.txt"
dataTr = _read_split(file_pathTr)

# Development split
file_pathDv = "/content/drive/My Drive/TP1-NLP/twitter-2013dev-A.txt"
dataDv = _read_split(file_pathDv)

# Test split
file_pathTst = "/content/drive/My Drive/TP1-NLP/twitter-2013test-A.txt"
dataTst = _read_split(file_pathTst)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##Extraction

In [47]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.stem.snowball import EnglishStemmer
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource this notebook relies on so that it runs on a fresh
# runtime:
#   - 'wordnet'   -> WordNetLemmatizer
#   - 'stopwords' -> stopwords.words('english')
#   - 'punkt' / 'punkt_tab' -> word_tokenize; older NLTK releases load the
#     tokenizer models from 'punkt', recent ones from 'punkt_tab', so both are
#     requested (an identifier unknown to the installed version is reported
#     by nltk.download without raising).
for nltk_resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# NLTK resources shared by every preprocess_text call; built lazily so that
# nothing is loaded until the first tweet is actually processed.
_STOP_WORDS = None
_LEMMATIZER = None


def _text_resources():
    """Return the (English stop-word set, WordNet lemmatizer) pair, building it on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def preprocess_text(text):
    """Clean one tweet and return its lemmatized tokens joined by single spaces.

    Steps, in order: remove @mentions, remove links starting with "http",
    remove #hashtags, drop every character that is neither an ASCII letter
    nor whitespace, lowercase, tokenize with NLTK, discard English stop
    words, and lemmatize the remaining tokens with WordNetLemmatizer.

    Parameters
    ----------
    text : str
        Raw tweet text. None or a float NaN (a missing cell) yields '';
        any other non-string value is converted with str() first.

    Returns
    -------
    str
        The cleaned tokens separated by spaces ('' when nothing is left).
    """
    # A missing text cell would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Remove mentions, links and hashtags, then anything that is not a letter
    # or whitespace, and lowercase the result.
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)

    # Stop-word filtering and lemmatization reuse the shared resources instead
    # of rebuilding the stop-word set and the lemmatizer for every row.
    stop_words, lemmatizer = _text_resources()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(lemmatized_tokens)

# Run the cleaning step over the 'text' column of each split, in the order
# train, dev, test; every frame's column is overwritten with the cleaned text.
for split_frame in (dataTr, dataDv, dataTst):
    split_frame['text'] = split_frame['text'].apply(preprocess_text)


# Concatenate the cleaned training texts into one string, tokenize it again
# and count how often each token occurs.
corpus = ' '.join(dataTr['text'].astype(str))
words = word_tokenize(corpus)
word_counts = Counter(words)

# Every key of a Counter built from the tokens has a count of at least 1, so
# the "at least once" vocabulary is simply its keys, in first-seen order.
words_at_least_once = list(word_counts)

print("Mots qui apparaissent au moins une fois dans le fichier :")
print(words_at_least_once)



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Mots qui apparaissent au moins une fois dans le fichier :


##Assigner

In [48]:
# Give every vocabulary word an integer id starting at 1, following the order
# of words_at_least_once.
word_to_num_dict = dict(zip(words_at_least_once, range(1, len(words_at_least_once) + 1)))
print(word_to_num_dict)



##Décompter et Convertir

In [49]:
from sklearn.feature_extraction.text import CountVectorizer
# word_to_num_dict is 1-based, whereas CountVectorizer expects 0-based column
# indices: shift every id down by one.
adjusted_vocabulary = dict(
    zip(word_to_num_dict.keys(), (idx - 1 for idx in word_to_num_dict.values()))
)
vectorizer = CountVectorizer(vocabulary=adjusted_vocabulary)
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more characters, so one-letter vocabulary entries presumably get
# all-zero columns -- confirm whether that is intended.

# Count matrices (X_*) and sentiment labels (Y_*) for each split. The
# vocabulary is supplied up front, so the same word-to-column mapping is used
# for train, dev and test.
X_train, Y_train = vectorizer.fit_transform(dataTr['text']), dataTr['sentiment']

X_dev, Y_dev = vectorizer.transform(dataDv['text']), dataDv['sentiment']

X_test, Y_test = vectorizer.transform(dataTst['text']), dataTst['sentiment']

##Appliquer

In [50]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Fit a support-vector classifier, constructed with no arguments, on the
# training counts.
svm_classifier = SVC().fit(X_train, Y_train)

# Predict labels for the development and test splits.
Y_dev_pred = svm_classifier.predict(X_dev)
Y_test_pred = svm_classifier.predict(X_test)

# Print one classification report per split, development first, then test.
for heading, y_true, y_pred in (
    ("Evaluation du jeu de développement:\n", Y_dev, Y_dev_pred),
    ("Evaluation du jeu de test:\n", Y_test, Y_test_pred),
):
    print(heading, classification_report(y_true, y_pred))

Evaluation du jeu de développement:
               precision    recall  f1-score   support

    negative       0.70      0.11      0.19       340
     neutral       0.57      0.89      0.70       739
    positive       0.69      0.55      0.61       575

    accuracy                           0.61      1654
   macro avg       0.65      0.52      0.50      1654
weighted avg       0.64      0.61      0.56      1654

Evaluation du jeu de test:
               precision    recall  f1-score   support

    negative       0.77      0.12      0.21       559
     neutral       0.55      0.94      0.69      1513
    positive       0.83      0.48      0.61      1475

    accuracy                           0.62      3547
   macro avg       0.72      0.52      0.50      3547
weighted avg       0.70      0.62      0.58      3547

