# <font color="green">  1. Imports  </font>

In [None]:
import pandas as pd
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# <font color="blue">  2. Data  </font>

In [None]:
# Load the emotion dataset from a CSV file (path is relative to the working directory).
df = pd.read_csv('data/Emotion_final_bronze.csv')

# <font color="red">  3. Analyse </font>

In [None]:
# Show the loaded DataFrame as the cell's output.
df

In [None]:
# List the distinct labels found in the 'Emotion' column.
df.Emotion.unique()

In [None]:
# Number of rows for each emotion label.
counts = df['Emotion'].value_counts()
print(counts)



In [None]:
# Recompute the per-emotion frequencies so the plot does not rely on an
# earlier cell having been run.
counts = df.Emotion.value_counts()

# One bar per emotion label; bar height is the number of texts with that label.
plt.bar(x=counts.index, height=counts.values)
plt.title('Répartition des textes par émotions')
plt.xlabel('Émotion')
plt.ylabel('Nombre de textes')
plt.show()


# <font color="red">  4. Stopwords  </font>

In [None]:
# Load spaCy's small English pipeline; `nlp` is reused by later cells.
nlp = spacy.load("en_core_web_sm")
# Stop-word set exposed by the pipeline's language defaults.
stopwords = nlp.Defaults.stop_words
# Size of that default stop-word set.
len(stopwords)

In [None]:


# (Re)load the English pipeline so that this cell is self-contained.
nlp = spacy.load("en_core_web_sm")

# Stop-word set shipped with the pipeline's language defaults.
old_stopwords = nlp.Defaults.stop_words

# Same words with every ASCII apostrophe removed, so apostrophe-free
# spellings are filtered as well.
new_stopwords = set()
for stopword in old_stopwords:
    new_stopwords.add(stopword.replace("'", ""))

# Extra hand-picked tokens to filter out in addition to the stop words.
autre = {',', 't', 'feel', 'feeling', '`', 'little', 'bit'}

# Final filter set: default stop words + apostrophe-free variants + extras.
all_stopwords = old_stopwords.union(new_stopwords, autre)

print(all_stopwords)


# <font color="green">  5. Mots & sentiments  </font>

In [None]:
# Group the rows by emotion label; iterating yields (label, sub-DataFrame) pairs.
sentiment_groups = df.groupby('Emotion')

In [None]:
# Maps each emotion label to the flat list of lower-cased tokens (stop words
# removed) found in all texts carrying that label. Despite the name, the
# values are token lists, not counts; counting happens in a later cell.
word_counts = {}

for sentiment, group in sentiment_groups:
    sentiment_words = []

    # nlp.pipe streams the texts through the pipeline in batches, which is
    # much faster than calling nlp(text) once per row and yields the same docs.
    for doc in nlp.pipe(group['Text']):
        for token in doc:
            # Lower-case once and reuse it for both the test and the output.
            lowered = token.text.lower()
            if lowered not in all_stopwords:
                sentiment_words.append(lowered)

    word_counts[sentiment] = sentiment_words


In [None]:
from collections import Counter

# How many of the most frequent words to keep for each emotion.
top_n = 30

# Emotion label -> its `top_n` most common words, most frequent first
# (the counts themselves are discarded).
top_words_per_sentiment = {
    sentiment: [word for word, _ in Counter(words).most_common(top_n)]
    for sentiment, words in word_counts.items()
}


In [None]:
# Show the top words kept for each emotion.
top_words_per_sentiment

 # <font color="green">  5. Corrélations Sentiments  </font>

In [None]:
# Emotion labels in dictionary order; used below to build the corpus and to
# label the heatmap axes.
sentiment_labels = list(top_words_per_sentiment.keys())

In [None]:

# Number of emotion classes (side length of the square similarity matrix).
# Not used below; retained in case other cells read it.
num_sentiments = len(top_words_per_sentiment)

# One pseudo-document per emotion: its top words joined by spaces.
corpus = [' '.join(top_words_per_sentiment[sentiment]) for sentiment in sentiment_labels]

# Word-count vectors of the pseudo-documents over their shared vocabulary.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Pairwise cosine similarity between emotions. cosine_similarity returns the
# full matrix itself, so no zero-filled placeholder is allocated beforehand.
similarity_matrix = cosine_similarity(X)

# Render the similarity matrix as an annotated heatmap.
plt.figure(figsize=(8, 6))
sns.heatmap(similarity_matrix, annot=True, xticklabels=sentiment_labels, yticklabels=sentiment_labels, cmap='coolwarm')
plt.xlabel('Sentiments')
plt.ylabel('Sentiments')
plt.title('Similarity Matrix of Sentiments')
plt.show()


  # <font color="green">  6. Bag of Words & TF-IDF  </font>

In [None]:
# Preprocessing helper
def preprocess_text(text):
    """Return *text* as a single string of space-separated lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``.
    Punctuation and whitespace tokens are dropped; every remaining token
    is replaced by its lemma.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_punct or token.is_space)
    ]
    return ' '.join(lemmas)

# Lemmatise every text once and store the result in a new column.
df['preprocessed_text'] = df['Text'].apply(preprocess_text)

# Stop words as one flat list: a plain concatenation of the three sets, so
# words present in several sets appear more than once.
stopwords_list = [*old_stopwords, *new_stopwords, *autre]

# The documents fed to both vectorizers.
preprocessed_docs = df['preprocessed_text'].to_list()

# Bag-of-words representation (raw term counts).
vectorizer = CountVectorizer(stop_words=stopwords_list)
bow_representation = vectorizer.fit_transform(preprocessed_docs)

# TF-IDF representation.
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords_list)
tfidf_representation = tfidf_vectorizer.fit_transform(preprocessed_docs)

# Print both representations as dense arrays, bag-of-words first.
for representation in (bow_representation, tfidf_representation):
    print(representation.toarray())

In [None]:
# Sum of the sizes of the three stop-word sets (shared words are counted in each set).
len(old_stopwords)+len(new_stopwords)+len(autre)


In [None]:
# Length of the concatenated stop-word list (duplicates included).
len(stopwords_list)

In [None]:
# Show the fitted bag-of-words vectorizer.
vectorizer

In [None]:
# Show a summary of the bag-of-words matrix returned by fit_transform.
bow_representation

In [None]:
def count_zeros(matrix):
    """Return the number of entries of *matrix* that are equal to zero.

    Args:
        matrix: A 2-D collection of numbers. Accepted forms are a SciPy
            sparse matrix, a NumPy array, or any iterable of row iterables
            (rows may have different lengths).

    Returns:
        The zero count as a plain Python ``int``.
    """
    # Local import keeps the file's import cell untouched.
    from scipy.sparse import issparse

    if issparse(matrix):
        # Count zeros from the stored entries without densifying the matrix.
        n_rows, n_cols = matrix.shape
        return n_rows * n_cols - int(matrix.count_nonzero())

    try:
        dense = np.asarray(matrix)
    except ValueError:
        # Ragged rows cannot form a rectangular array on recent NumPy.
        dense = None

    if dense is not None and dense.ndim == 2 and dense.dtype.kind in "biufc":
        # Vectorised comparison instead of a Python loop over every cell.
        return int(np.count_nonzero(dense == 0))

    # Fallback for ragged or non-numeric input: element-by-element scan.
    return sum(1 for row in matrix for element in row if element == 0)


In [None]:
# Number of zero cells in the bag-of-words matrix (converted to a dense array first).
count_zeros(bow_representation.toarray())


In [None]:
# Number of zero cells in the TF-IDF matrix (converted to a dense array first).
count_zeros(tfidf_representation.toarray())

# <font color="blue"> 7. Modèle </font>

In [None]:
# Split the preprocessed texts BEFORE vectorising. Fitting TF-IDF on the whole
# dataset would let the vocabulary and IDF weights see the test texts (data
# leakage) and bias the reported accuracy. The row split is the same as
# splitting the precomputed matrix: it depends only on the number of rows,
# test_size and random_state.
train_texts, test_texts, y_train, y_test = train_test_split(
    df['preprocessed_text'], df['Emotion'], test_size=0.2, random_state=42
)

# Refit the vectorizer on the training texts only. `tfidf_vectorizer` is
# rebound on purpose so that later calls to `tfidf_vectorizer.transform`
# produce features matching the trained model.
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords_list)
X_train = tfidf_vectorizer.fit_transform(train_texts)
X_test = tfidf_vectorizer.transform(test_texts)

# Train a support-vector classifier with default settings and score it on
# the held-out texts.
svm_model = SVC()
svm_model.fit(X_train, y_train)
predictions = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

In [None]:
# Example: predict the emotion of a new, unseen sentence.
new_phrase = "This movie is great!"
# Apply the same lemmatisation used for the dataset texts.
preprocessed_new_phrase = preprocess_text(new_phrase)
# Vectorise with the already-fitted TF-IDF vectorizer (transform only, no refit).
tfidf_representation_new_phrase = tfidf_vectorizer.transform([preprocessed_new_phrase])
# predict returns one label per input row; a single row is passed here.
prediction = svm_model.predict(tfidf_representation_new_phrase)

print("Phrase:", new_phrase)
print("Emotion prédite:", prediction[0])