# In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
# Download the stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import matplotlib.pyplot as plt

# Load the first 20 columns of the hotel reviews dataset.
df = pd.read_csv("Data_Projet/Datafiniti_Hotel_Reviews.csv", sep=',', usecols=range(20))

# Keep only the columns used downstream. The explicit .copy() makes df an
# independent frame, so the column assignments below never target a possible
# view of the full dataset (avoids pandas' SettingWithCopyWarning).
df = df[["name", "reviews.rating", "reviews.text", "reviews.title"]].copy()

# Fill missing titles AND missing bodies before concatenating: str + NaN is
# NaN, so a review that has a title but no body would otherwise end up with an
# entirely empty combined text and lose its title.
text_columns = ["reviews.title", "reviews.text"]
df[text_columns] = df[text_columns].fillna("")

# Combine the title and the body of each review into a single text column.
df["reviews_combined"] = df["reviews.title"] + " " + df["reviews.text"]

# Sort the reviews by rating.
df = df.sort_values(by="reviews.rating")

# The separate title/body columns are no longer needed once combined.
df = df.drop(columns=["reviews.text", "reviews.title"])

# English stopword list from NLTK, stored as a set for O(1) membership tests.
stop_words = set(stopwords.words("english"))

# Create a WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to remove stopwords and lemmatize the text
def remove_stopwords(text):
    """Return *text* lowercased, without stopwords, and lemmatized.

    The text is tokenized with NLTK's ``word_tokenize``. Tokens whose
    lowercase form is in the module-level ``stop_words`` set are dropped;
    every remaining token is lowercased and passed through the module-level
    ``lemmatizer``. The surviving tokens are joined with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned text as a single space-separated string.
    """
    kept_lemmas = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # Skip stopwords; the comparison is done on the lowercase form
        if lowered in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(lowered))
    return ' '.join(kept_lemmas)

# Clean every combined review (stopword removal + lemmatization) and keep the
# result as a new column of the DataFrame
cleaned_reviews = df["reviews_combined"].apply(remove_stopwords)
df["reviews_combined_no_stopwords"] = cleaned_reviews

# TF-IDF settings gathered in one mapping so they can be read as a table
tfidf_options = {
    "lowercase": True,  # convert the text to lowercase
    "tokenizer": word_tokenize,  # split the text into word tokens with NLTK
    "stop_words": 'english',  # remove English stop words
    "max_df": 0.8,  # ignore terms that appear in more than 80% of the documents
    "min_df": 5,  # ignore terms that appear in fewer than 5 documents
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and build the document-term TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(df["reviews_combined_no_stopwords"])

# Select the first document
# index_document = 0

# Extract the TF-IDF values for the first document
# tfidf_values = tfidf_matrix[index_document].toarray()[0]

# Retrieve the terms from the TF-IDF vectorizer
# terms = tfidf_vectorizer.get_feature_names_out()

# Create a DataFrame with the terms and their TF-IDF values
# tfidf_df = pd.DataFrame({'Term': terms, 'TF-IDF Value': tfidf_values})

# Sort the DataFrame by the TF-IDF values in descending order
# tfidf_df_sorted = tfidf_df.sort_values(by='TF-IDF Value', ascending=False)

# Select the top 10 terms
# top_terms = tfidf_df_sorted.head(10)

# Create a bar plot of the top 10 terms
# plt.figure(figsize=(10, 6))
# plt.barh(top_terms['Term'], top_terms['TF-IDF Value'])
# plt.xlabel('TF-IDF Value')
# plt.ylabel('Term')
# plt.title('Top 10 TF-IDF Values for Document 1')
# plt.gca().invert_yaxis()
# plt.show()

# Display the shape of the TF-IDF matrix
# nombre_documents, nombre_termes = tfidf_matrix.shape

# Print the dimensions of the TF-IDF matrix
# print("Taille de la matrice TF-IDF : {} lignes (documents) x {} colonnes (termes)".format(nombre_documents, nombre_termes))

# --- Labels ------------------------------------------------------------------
# Rows without a rating cannot supervise training, so they are left out.
# tfidf_matrix was built from df's text column, so row i of the matrix and
# row i of df describe the same review and positional indices select both.
ratings = df["reviews.rating"].to_numpy()
labelled_rows = np.flatnonzero(~pd.isna(ratings))

# A sigmoid output with binary cross-entropy only makes sense for 0/1 targets,
# which the ratings are not guaranteed to be. Each distinct rating is therefore
# encoded as an integer class index: rating_classes holds the sorted distinct
# ratings and rating_indices[i] is the position of review i's rating in it
# (rating_classes[rating_indices] recovers the original ratings).
rating_classes, rating_indices = np.unique(ratings[labelled_rows], return_inverse=True)
n_classes = len(rating_classes)

# --- Train / validation / test split -----------------------------------------
# y_train / y_test hold class indices (not raw ratings) from here on.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix[labelled_rows], rating_indices, test_size=0.2, random_state=42
)

# Convert the sparse matrices to dense NumPy arrays. The test features are
# densified as well so that evaluate()/predict() receive the same kind of
# input as fit().
X_train_np = X_train.toarray()
X_test_np = X_test.toarray()
y_train_np = np.asarray(y_train)

# Carve a validation set out of the training data
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train_np, y_train_np, test_size=0.2, random_state=42
)

# Display the sizes of the training, validation, and test sets
print("Taille de l'ensemble d'entraînement :", X_train_split.shape[0])
print("Taille de l'ensemble de validation :", X_val.shape[0])
print("Taille de l'ensemble de test :", X_test.shape[0])


# --- Model -------------------------------------------------------------------
# One output unit per rating class; softmax turns the outputs into a
# probability distribution over the classes.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train_split.shape[1],)),  # input layer
    tf.keras.layers.Dense(64, activation='relu'),  # hidden layer
    tf.keras.layers.Dense(n_classes, activation='softmax')  # output layer
])

# sparse_categorical_crossentropy takes integer class indices as targets,
# which is exactly what rating_indices provides.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model, monitoring the held-out validation set after each epoch
model.fit(X_train_split, y_train_split, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# --- Evaluation --------------------------------------------------------------
loss, accuracy = model.evaluate(X_test_np, y_test)
print("Loss :", loss)
print("Accuracy :", accuracy)

# Predict the rating class of every test review: the predicted class is the
# one with the highest probability.
class_probabilities = model.predict(X_test_np)
predictions = np.argmax(class_probabilities, axis=1)
# Same predictions expressed as rating values instead of class indices
predicted_ratings = rating_classes[predictions]

# Calculate the accuracy, precision, recall, and F1 score on class indices
# (both y_test and predictions are indices into rating_classes)
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the performance metrics
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")

