In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import time
import re

import pickle

# Text preprocessing
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

# pour le modèle simple
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# pour le modèle LSTM

# Deep learning
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer


from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Mount Google Drive so that files under /content/drive (tokenizer, model)
# are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Widen pandas' text display so printed frames are not wrapped early.
pd.set_option('display.width', 1000)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Prétraitement du texte

In [None]:
# Download the NLTK resources this notebook relies on
# (NLTK reports "already up-to-date" when they are cached).
for nltk_resource in ("stopwords", "punkt", "wordnet"):
    nltk.download(nltk_resource)

# English stop words, kept as a set for fast membership tests in preprocess().
stop = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# The regex keeps purely alphabetic words: special characters are dropped, and
# any word containing a digit or an underscore is skipped entirely.

def preprocess(text):
    """Normalise a raw sentence into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase the text and tokenize it with a regex tokenizer;
      2. drop tokens that belong to the module-level ``stop`` set;
      3. lemmatize the remaining tokens with WordNet (default noun POS);
      4. join the lemmas back together with single spaces.

    Parameters
    ----------
    text : str
        Raw input sentence.

    Returns
    -------
    str
        The cleaned sentence; an empty string when no token survives.
    """
    word_pattern = r'\b(?![\w_]*_)[^\d\W]+\b'
    # Lowercasing happens before tokenization so stop-word lookup is case-insensitive.
    words = nltk.RegexpTokenizer(word_pattern).tokenize(text.lower())

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        if word in stop:
            continue
        lemmas.append(lemmatizer.lemmatize(word))

    return ' '.join(lemmas)

In [None]:
# Quick sanity check of preprocess() on a short negative review.
text = "Very disappointing, bad restaurant !"

print(preprocess(text))

disappointing bad restaurant


In [None]:
# Length every index sequence is padded/truncated to before prediction.
# It matches the (None, 30) input shape shown in the loaded model's summary.
MAX_SEQUENCE_LENGTH =30

In [None]:
# Load the pickled tokenizer used by predict_sentiment()
# (presumably the Keras Tokenizer fitted at training time -- confirm).
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
with open("/content/drive/MyDrive/Colab Notebooks/PROJET07/tokenizer_lstm.pickle", "rb") as file:
    tokenizer = pickle.load(file)

In [None]:

def predict_sentiment(text, threshold=0.6):
    """Classify a sentence as "negative" or "positive" with the loaded model.

    The text goes through ``preprocess`` (intended to mirror the training-time
    cleaning), is converted to an index sequence by the module-level
    ``tokenizer``, padded/truncated to ``MAX_SEQUENCE_LENGTH`` and fed to the
    module-level ``clf_model``.

    Parameters
    ----------
    text : str
        Raw sentence to classify.
    threshold : float, optional
        Decision boundary applied to the model score. Scores strictly below
        it are labelled "negative", all others "positive". Defaults to 0.6,
        the value previously hard-coded in this function.

    Returns
    -------
    tuple
        ``(sentiment, probability_score)`` where ``sentiment`` is "negative"
        or "positive" and ``probability_score`` is the first output value of
        the model for this sentence, returned unmodified.
    """
    # Clean the text the same way the rest of the notebook does.
    cleaned_text = preprocess(text)

    # Map words to indices, then pad/truncate to the fixed model input length.
    index_sequence = pad_sequences(tokenizer.texts_to_sequences([cleaned_text]),
                                   maxlen=MAX_SEQUENCE_LENGTH)

    # Single sample in, single score out: take row 0, column 0 of the prediction.
    probability_score = clf_model.predict(index_sequence)[0][0]

    sentiment = "negative" if probability_score < threshold else "positive"

    return sentiment, probability_score


## Chargement du modèle

In [None]:
# Load the saved Keras model (HDF5 file) that predict_sentiment() queries.
clf_model = load_model('/content/drive/MyDrive/Colab Notebooks/PROJET07/model_lstm_glove.h5')

In [None]:
# Print the layer-by-layer architecture of the loaded model.
clf_model.summary()

Model: "model_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_15 (InputLayer)       [(None, 30)]              0         
                                                                 
 embedding_13 (Embedding)    (None, 30, 300)           9090600   
                                                                 
 spatial_dropout1d_13 (Spat  (None, 30, 300)           0         
 ialDropout1D)                                                   
                                                                 
 conv1d_13 (Conv1D)          (None, 26, 64)            96064     
                                                                 
 bidirectional_13 (Bidirect  (None, 26, 256)           197632    
 ional)                                                          
                                                                 
 global_average_pooling1d_6  (None, 256)               0  

In [None]:
# Smoke test: a sarcastic sentence and a clearly positive one.
text1 = "lost my phone, great !"
text2 = "What an awesome experience, really loved it !"

print("[INFO] : Results for text1 : ", predict_sentiment(text=text1))
print("[INFO] : Results for text2 : ", predict_sentiment(text=text2))

[INFO] : Results for text1 :  ('negative', 0.4838816)
[INFO] : Results for text2 :  ('positive', 0.685951)


In [None]:
# Smoke test: two negative inputs, including a single-word one.
text1 = "Very disappointing, bad restaurant !"
text2 = "bad !"

print("[INFO] : Results for text1 : ", predict_sentiment(text=text1))
print("[INFO] : Results for text2 : ", predict_sentiment(text=text2))

[INFO] : Results for text1 :  ('negative', 0.5231273)
[INFO] : Results for text2 :  ('negative', 0.5832253)


# API

In [None]:
from flask import Flask

In [None]:
app = Flask(__name__)

# This is a route to a page of your API
@app.route("/")
def hello():
    """Return a static greeting for the API root."""
    return "Hello, World!"

# Here we launch the app.
# use_reloader=False: in debug mode Flask otherwise starts the Werkzeug
# auto-reloader ("Restarting with stat"), which re-executes the launching
# process and does not work from inside a notebook kernel.
# NOTE(review): debug=True also enables the interactive debugger; keep it for
# local development only and never expose it on a public interface.
app.run(debug=True, use_reloader=False)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
