#Data treatment

## Libraries

In [12]:
%pip install gensim



In [13]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from gensim.models import Word2Vec


## Raw data scraping, mining, extracting

### Data scraping

### Data mining

### Data extracting

### Pure data initialization

In [14]:
# Data
data = {
    'text': ["This is a positive review.", "This is a negative review.", "Another positive example.", "Another negative example."],
    'sentiment': [1, 0, 1, 0] # 1 for positive, 0 for negative
}
df = pd.DataFrame(data)


## Tokenization

In [15]:
# Tokenization
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])


## Padding sequences

In [16]:
# Padding sequences
lengths = []
for x in sequences:
  lengths.append(len(x))

max_length = max(lengths)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')


## Embedding

In [17]:

sentences = []
for text in df['text']:
  sentences.append(text.split())

word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)



embedding_matrix = np.zeros((1000, 100)) # 1000 is num_words, 100 is vector_size
for i in range(len(tokenizer.word_index.items())):
    word = (list(tokenizer.word_index.items()))[i]
    if i < 1000:
        if word in word2vec_model.wv:
            embedding_matrix[i] = word2vec_model.wv[word]



# Model

In [18]:
# LSTM
model = Sequential()
model.add(Embedding(input_dim=1000, output_dim=100, input_length=max_length, weights=[embedding_matrix], trainable=False))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid')) # Using sigmoid for binary classification

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Labels
labels = np.array(df['sentiment'])

model.fit(padded_sequences, labels, epochs=10, verbose=1)



Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.5000 - loss: 0.6931
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step - accuracy: 0.5000 - loss: 0.6931
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step - accuracy: 0.5000 - loss: 0.6931
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step - accuracy: 0.5000 - loss: 0.6931
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step - accuracy: 0.5000 - loss: 0.6931
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - accuracy: 0.5000 - loss: 0.6931
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.5000 - loss: 0.6931
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.5000 - loss: 0.6931
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53

<keras.src.callbacks.history.History at 0x781c8c0eda00>

# Prediction : evaluate, test, ...

In [19]:
# Prediction
new_text = ["This is a great movie!"]
new_sequence = tokenizer.texts_to_sequences(new_text)
new_padded_sequence = pad_sequences(new_sequence, maxlen=max_length, padding='post', truncating='post')
prediction = model.predict(new_padded_sequence)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step
[[0.5]]


# Model Web implementation

## Download model (.keras, .h5)

In [20]:
model.save("model.keras")

## Implementation Frontend

### Tensorflow Javascript

In [None]:
"""
HTML/CSS/JAVASCRIPT
"""

## Implementation Backend

### Implementation Node.js (javascript's backend)

### Implementation cURL (PHP terminal)

### Implementation ...

# Model Mobile App implementation