In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
import re
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np


# [Problème 1] Implémentation Scratch de BoW

In [None]:
# Toy corpus used to exercise the scratch Bag-of-Words implementation.
sentences = [
  "This movie is SOOOO funny!!!",
  "What a movie! I never",
  "best movie ever!!!!! this movie",
]

def tokenize(sentence):
  """Split a sentence into lower-case word tokens.

  Every character that is neither a word character nor whitespace is
  deleted (not replaced by a space) before splitting on whitespace.

  Args:
    sentence: Raw text.

  Returns:
    List of lower-cased tokens; empty list for blank input.
  """
  without_punctuation = re.sub(r'[^\w\s]', '', sentence)
  return without_punctuation.lower().split()


def generate_ngrams(tokens, n):
  """Build the contiguous n-grams of a token list.

  Args:
    tokens: Sequence of string tokens.
    n: Number of tokens per n-gram.

  Returns:
    List of n-grams in order of appearance, each one the space-joined
    tokens of a sliding window of width ``n``. Empty when the token list
    is shorter than ``n``.
  """
  ngrams = []
  window_count = len(tokens) - n + 1
  for start in range(window_count):
    ngrams.append(' '.join(tokens[start:start + n]))
  return ngrams


def build_vocabulary(sentences, n):
  """Collect the distinct n-grams found across all sentences.

  Args:
    sentences: Iterable of raw sentences.
    n: Number of tokens per n-gram.

  Returns:
    Alphabetically sorted list of the unique n-grams.
  """
  unique_ngrams = {
    ngram
    for sentence in sentences
    for ngram in generate_ngrams(tokenize(sentence), n)
  }
  return sorted(unique_ngrams)


def sentence_to_bow(sentence, vocabulary, n):
  """Encode one sentence as a count vector aligned with ``vocabulary``.

  Args:
    sentence: Raw sentence.
    vocabulary: Ordered list of n-grams; fixes the order of the output.
    n: Number of tokens per n-gram.

  Returns:
    List of ints, one per vocabulary entry, giving how many times that
    n-gram occurs in the sentence (0 when absent).
  """
  counts = Counter(generate_ngrams(tokenize(sentence), n))
  vector = []
  for term in vocabulary:
    # Counter returns 0 for n-grams the sentence does not contain.
    vector.append(counts[term])
  return vector


def bow_representation(sentences, n):
  """Compute Bag-of-Words count vectors for a small corpus.

  The vocabulary is built from the same sentences and printed as a side
  effect.

  Args:
    sentences: List of raw sentences.
    n: Number of tokens per n-gram.

  Returns:
    List of count vectors, one per sentence, in input order.
  """
  vocabulary = build_vocabulary(sentences, n)
  print(f"Vocabulary ({n}-gram): {vocabulary}")
  return [sentence_to_bow(sentence, vocabulary, n) for sentence in sentences]

# Encode the toy corpus with unigram and bigram vocabularies.
bow_1gram = bow_representation(sentences, 1)
bow_2gram = bow_representation(sentences, 2)

print("\nBoW 1-gram Representation:")
for position, vector in enumerate(bow_1gram, start=1):
  print(f"Sentence {position}: {vector}")

print("\nBoW 2-gram Representation:")
for position, vector in enumerate(bow_2gram, start=1):
  print(f"Sentence {position}: {vector}")


# [Problème 2] Calcul TF-IDF

In [None]:
from sklearn.datasets import load_files

# load_files turns every sub-directory into a class. The stock aclImdb
# archive ships an extra `train/unsup` folder of unlabelled reviews, which
# would otherwise be loaded as a third class, so restrict both splits to
# the two labelled sentiment folders.
sentiment_categories = ['neg', 'pos']

train_review = load_files(
  './aclImdb/train/',
  categories=sentiment_categories,
  encoding='utf-8',
)
x_train, y_train = train_review.data, train_review.target

test_review = load_files(
  './aclImdb/test/',
  categories=sentiment_categories,
  encoding='utf-8',
)
x_test, y_test = test_review.data, test_review.target

# Sanity check: the printed class names should be ['neg', 'pos'].
print(train_review.target_names)

In [None]:
# Make the NLTK stop-word corpus available before reading it.
nltk.download('stopwords')
stop_words = list(stopwords.words('english'))

reviews = x_train

# TF-IDF over the training reviews, vocabulary capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(reviews)

# NOTE(review): this materialises the full documents x features matrix in
# dense form, which can be very large for the whole training set —
# consider inspecting a slice of the sparse matrix instead.
tfidf_dense = tfidf_matrix.todense()

print(tfidf_dense)
print(tfidf_vectorizer.get_feature_names_out())


# [Problème 3] Apprendre avec TF-IDF

In [None]:
# Make the NLTK stop-word corpus available before reading it.
nltk.download('stopwords')
stop_words = list(stopwords.words('english'))

# Unigram + bigram TF-IDF features, vocabulary capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(
  ngram_range=(1, 2),
  stop_words=stop_words,
  max_features=5000,
)

# Fit on the training reviews only; the test reviews are transformed with
# the already-fitted vectorizer.
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

model = LogisticRegression(max_iter=200)
model.fit(x_train_tfidf, y_train)

# Score the classifier on the held-out test reviews.
y_pred = model.predict(x_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Précision : {accuracy:.4f}")

print(tfidf_vectorizer.get_feature_names_out())


# [Problème 4] Implémentation Scratch de TF-IDF

In [None]:
import math

# Toy corpus used to exercise the scratch TF-IDF implementation.
documents = [
  "This movie is SOOOO funny!!!",
  "What a movie! I never",
  "best movie ever!!!!! this movie",
]


def tokenize(doc):
  """Split a document into lower-case word tokens, dropping punctuation.

  Punctuation is removed before splitting so that "movie!" and "movie"
  count as the same term; otherwise term and document frequencies are
  wrong. The behaviour matches the Bag-of-Words tokenizer defined earlier
  in the notebook, so this later definition no longer changes results when
  earlier cells are re-run.

  Args:
    doc: Raw document text.

  Returns:
    List of lower-cased tokens; empty list for blank input.
  """
  return re.sub(r'[^\w\s]', '', doc).lower().split()


def compute_tf(doc):
  """Compute relative term frequencies for one document.

  Args:
    doc: Raw document text.

  Returns:
    Dict mapping each term to (occurrences of the term) divided by
    (total number of tokens in the document). Empty dict when the
    document has no tokens.
  """
  tokens = tokenize(doc)
  doc_len = len(tokens)
  frequencies = {}
  for term, count in Counter(tokens).items():
    frequencies[term] = count / doc_len
  return frequencies


def compute_idf_standard(docs):
  """Compute the textbook inverse document frequency of every term.

  idf(t) = ln(N / df(t)), where N is the number of documents and df(t)
  the number of documents containing t.

  Document frequencies are gathered in a single pass over the corpus
  instead of re-tokenizing every document for every token occurrence.

  Args:
    docs: List of raw documents.

  Returns:
    Dict mapping each term to its IDF, keyed in order of first
    appearance in the corpus.
  """
  N = len(docs)
  # dict.fromkeys drops repeats inside a document while keeping token
  # order, so each document contributes at most 1 to a term's count.
  document_frequency = Counter(
    term for doc in docs for term in dict.fromkeys(tokenize(doc))
  )
  return {term: math.log(N / df) for term, df in document_frequency.items()}

def compute_idf_sklearn(docs):
  """Compute IDF with scikit-learn's default smoothing.

  idf(t) = ln((1 + N) / (1 + df(t))) + 1, where N is the number of
  documents and df(t) the number of documents containing t. Both the
  numerator and the denominator are incremented, as if one extra document
  containing every term had been seen; the previous implementation only
  incremented the denominator, which does not match scikit-learn.

  Args:
    docs: List of raw documents.

  Returns:
    Dict mapping each term to its smoothed IDF, keyed in order of first
    appearance in the corpus.
  """
  N = len(docs)
  # dict.fromkeys drops repeats inside a document while keeping token
  # order, so each document contributes at most 1 to a term's count.
  document_frequency = Counter(
    term for doc in docs for term in dict.fromkeys(tokenize(doc))
  )
  return {
    term: math.log((1 + N) / (1 + df)) + 1
    for term, df in document_frequency.items()
  }


def compute_tf_idf(doc, docs, idf):
  """Weight one document's term frequencies by precomputed IDF values.

  Args:
    doc: Raw document text.
    docs: Full corpus; not read by this function.
    idf: Dict mapping term -> IDF; must contain every term of ``doc``.

  Returns:
    Dict mapping each term of the document to tf * idf.

  Raises:
    KeyError: If a term of ``doc`` is missing from ``idf``.
  """
  weights = {}
  for term, tf_val in compute_tf(doc).items():
    weights[term] = tf_val * idf[term]
  return weights


def compute_tf_idf_all(docs, idf_function):
  """Compute TF-IDF weights for every document in a corpus.

  Args:
    docs: List of raw documents.
    idf_function: Callable taking the corpus and returning a dict
      mapping term -> IDF.

  Returns:
    List of dicts (term -> tf-idf weight), one per document, in input
    order.
  """
  # The IDF table is computed once and shared by all documents.
  idf = idf_function(docs)
  return [compute_tf_idf(doc, docs, idf) for doc in docs]


# Run the scratch TF-IDF on the toy corpus with both IDF variants.
tf_idf_standard = compute_tf_idf_all(documents, compute_idf_standard)
tf_idf_sklearn = compute_tf_idf_all(documents, compute_idf_sklearn)

print("TF-IDF (Standard):")
for position, weights in enumerate(tf_idf_standard, start=1):
  print(f"Document {position}: {weights}")

print("\nTF-IDF (scikit-learn):")
for position, weights in enumerate(tf_idf_sklearn, start=1):
  print(f"Document {position}: {weights}")


# [Problème 5] Prétraitement du corpus

In [None]:
# NOTE(review): nothing else in this cell reads the stop-word list; this
# download looks like a leftover from earlier cells — confirm before removing.
nltk.download('stopwords')

def preprocess_text(text):
  """Normalise a raw review into a list of lower-case alphabetic tokens.

  Steps, in order:
    1. Lower-case the text.
    2. Replace HTML line breaks (``<br>``, ``<br/>``, ``<br />``) with a
       space. The raw IMDB reviews contain these; without this step the
       tag's letters survive as a spurious ``br`` token and get glued to
       the neighbouring word (e.g. ``moviebr``).
    3. Remove URLs (anything starting with ``http`` up to the next
       whitespace).
    4. Delete every character that is not ``a-z`` or whitespace.
    5. Split on whitespace.

  Args:
    text: Raw review text.

  Returns:
    List of tokens; empty list when nothing alphabetic remains.
  """
  text = text.lower()
  # Must run before the URL rule, which would otherwise swallow a tag
  # that directly follows a URL.
  text = re.sub(r'<br\s*/?>', ' ', text)
  text = re.sub(r'http\S+', '', text)
  text = re.sub(r'[^a-z\s]', '', text)
  return text.split()

# Tokenise every training review; each entry is that review's token list.
x_train_processed = list(map(preprocess_text, x_train))


# [Problème 6] Apprentissage de Word2Vec

In [16]:
from gensim.models import Word2Vec

# Train the embeddings once. Passing `sentences` to the constructor already
# builds the vocabulary and trains the model, so the epoch count is given
# here rather than through a second `train()` call, which would train the
# already-trained model again with a restarted learning-rate schedule.
word2vec_model = Word2Vec(
  sentences=x_train_processed,
  vector_size=100,
  window=5,
  min_count=2,
  sg=0,
  workers=4,
  epochs=10,
)

# Save the model
word2vec_model.save("word2vec_imdb.model")

# [Problème 7] (Problème avancé) Visualisation vectorielle

In [18]:
from sklearn.manifold import TSNE


# Embedding matrix and vocabulary list; the annotation loop below relies on
# words[i] being the label of row i.
word_vectors = word2vec_model.wv.vectors
words = word2vec_model.wv.index_to_key

# Project the embeddings to 2-D; the fixed random_state is kept from the
# original cell.
tsne = TSNE(n_components=2, random_state=42)
word_vectors_2d = tsne.fit_transform(word_vectors)

plt.figure(figsize=(10, 8))
plt.scatter(x=word_vectors_2d[:, 0], y=word_vectors_2d[:, 1])

# Label only a handful of words of interest.
selected_words = ['good', 'bad', 'movie', 'love', 'happy', 'sad']
for row, token in enumerate(words):
  if token not in selected_words:
    continue
  plt.annotate(token, xy=(word_vectors_2d[row, 0], word_vectors_2d[row, 1]))


In [None]:
# The five vocabulary entries the model ranks as most similar to "movie".
similar_words = word2vec_model.wv.most_similar(positive=['movie'], topn=5)
print(similar_words)


# [Problème 8] (Problème avancé) Classification des revues de films à l'aide de Word2Vec

In [None]:
def get_average_vector(tokens, model):
  """Represent a token list by the mean of its known word vectors.

  Args:
    tokens: Iterable of string tokens.
    model: Object exposing ``wv`` (supports ``in`` and ``[]`` by token)
      and ``vector_size``.

  Returns:
    Element-wise mean of the vectors of the tokens found in ``model.wv``,
    or a zero vector of length ``model.vector_size`` when none are found.
  """
  known_vectors = [model.wv[tok] for tok in tokens if tok in model.wv]
  if not known_vectors:
    return np.zeros(model.vector_size)
  return np.mean(known_vectors, axis=0)

# One averaged embedding per review; test reviews are tokenised on the fly
# with the same preprocessing as the training reviews.
x_train_vectors = np.array([
  get_average_vector(tokens, word2vec_model)
  for tokens in x_train_processed
])
x_test_vectors = np.array([
  get_average_vector(tokens, word2vec_model)
  for tokens in map(preprocess_text, x_test)
])

# Fit the label mapping on the training targets and reuse it for the test
# targets.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

classifier = LogisticRegression(max_iter=200)
classifier.fit(x_train_vectors, y_train_encoded)

# Score the classifier on the held-out test reviews.
y_pred = classifier.predict(x_test_vectors)
accuracy = accuracy_score(y_test_encoded, y_pred)
print(f"Précision : {accuracy:.4f}")
