In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
!tar xf spark-3.3.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark


gzip: stdin: unexpected end of file
tar: Unexpected EOF in archive
tar: Unexpected EOF in archive
tar: Error is not recoverable: exiting now
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Install TensorFlow and Keras, used further down to build and train the LSTM model.
# No versions are pinned, so the installed releases depend on when this cell is run.
!pip install tensorflow
!pip install keras

In [None]:
import os
import time

import findspark
from google.colab import drive

# Point the environment at the Java runtime and the unpacked Spark distribution
# before findspark.init() is called.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.3.2-bin-hadoop3",
)
findspark.init()

# Mount Google Drive under /content/drive; the dataset is read from there later.
drive.mount('/content/drive')

In [None]:
# Importer les bibliothèques nécessaires
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Create (or reuse) the Spark session; the driver and the executors share the
# same memory setting.
MAX_MEMORY = "8g"
spark = (
    SparkSession.builder
    .appName("TP_BDM_LSTM")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

In [None]:
Start = time.time()
# Charger le dataset
%cd /content/drive/MyDrive/
df = spark.read.csv("sentiment140.csv", header=None)
# Renommer les colonnes
df = df.withColumnRenamed("_c0", "target").withColumnRenamed("_c5", "tweet")
# Convertir les valeurs de la colonne target en 0 et 1
df = df.withColumn("target", col("target") / 4)
# Convertir le type de la colonne "target" en entier
df = df.withColumn("target", df["target"].cast("integer"))

In [None]:
# Preview the data
df.show()

# Split the data into training and test sets (fixed seed for a repeatable split).
# The Spark splits get their own names so that `train_data` / `test_data` below
# only ever refer to the padded arrays fed to Keras.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Convert the Spark DataFrames to pandas DataFrames for preprocessing with Keras
train_data_pd = train_df.toPandas()
test_data_pd = test_df.toPandas()

# Tokenize the tweets; the vocabulary is fitted on the training split only
tokenizer = Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token=None,
    analyzer=None)
tokenizer.fit_on_texts(train_data_pd["tweet"])
train_sequences = tokenizer.texts_to_sequences(train_data_pd["tweet"])
test_sequences = tokenizer.texts_to_sequences(test_data_pd["tweet"])

# Pad (with trailing zeros) or truncate the sequences so they all have the same length
max_len = 100  # maximum sequence length
train_data = pad_sequences(train_sequences, maxlen=max_len, padding="post")
test_data = pad_sequences(test_sequences, maxlen=max_len, padding="post")

# Define the LSTM architecture
model = Sequential()
# mask_zero=True marks index 0 (the padding value written by pad_sequences) as
# padding so the LSTM skips those timesteps; input_dim already reserves index 0
# through the "+ 1".
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32,
                    input_length=max_len, mask_zero=True))
# Return only the last LSTM output so the model emits one prediction per tweet.
# With return_sequences=True the Dense layer produced one prediction per
# timestep (shape (batch, max_len, 1)), which does not match the one-label-per-
# tweet targets and made the loss/accuracy an average over padded positions.
model.add(LSTM(64))
model.add(Dense(1, activation="sigmoid"))

# Compile the model for binary classification
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model
model.fit(train_data, train_data_pd["target"], epochs=5, batch_size=32)

+------+----------+--------------------+--------+---------------+--------------------+
|target|       _c1|                 _c2|     _c3|            _c4|               tweet|
+------+----------+--------------------+--------+---------------+--------------------+
|     0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|     0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|     0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|     0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|     0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
|     0|1467811372|Mon Apr 06 22:20:...|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|     0|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|     0|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|     0|1467811795|Mon Apr 06 22:20:...|NO_

In [None]:
# Evaluate the model on the held-out test set; `scores` holds the loss followed
# by the metric configured at compile time.
scores = model.evaluate(test_data, test_data_pd["target"], verbose=1)
for label, value in zip(("Test Loss:", "Test Accuracy:"), scores):
    print(label, value)

# Elapsed wall-clock time in seconds since `Start` was recorded.
end = time.time()
print(end - Start)

Test Loss: 0.47679203748703003
Test Accuracy: 0.8003516793251038
83.71328663825989
