We are working in a local development environment (not on a distributed Spark cluster), so we import `findspark` to locate the local Spark installation.

# Imports

In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover,StringIndexer, CountVectorizer, NGram, VectorAssembler, ChiSqSelector
from pyspark.sql.functions import col, sum as spark_sum
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from xgboost.spark import SparkXGBClassifier

# Variables de contexte

In [2]:
# Create (or reuse) the SparkSession used by every cell below.
# NOTE(review): "spark.storage.memoryFraction" belongs to Spark's legacy memory
# manager — confirm it still has an effect on the Spark version in use.
spark = (
    SparkSession.builder
    .config("spark.storage.memoryFraction", "0.6")
    .appName('Twitter')
    .getOrCreate()
)

24/05/05 18:26:45 WARN Utils: Your hostname, sasamg-HP-Laptop-15s-eq2xxx resolves to a loopback address: 127.0.1.1; using 192.168.100.63 instead (on interface wlo1)
24/05/05 18:26:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/05 18:26:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Location of the training CSV, relative to the notebook's working directory.
training_path = "../twitter_training.csv"

### We define a StructType. When reading the CSV containing the data, this lets us tell Spark to load the data according to the schema defined below.

In [4]:
# Column layout applied when reading the training CSV; every field is nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("id", IntegerType()),
        ("game", StringType()),
        ("sentiment", StringType()),
        ("tweet", StringType()),
    )
])

In [5]:
# Load the training CSV using the explicit schema defined earlier.
# `inferSchema=True` was dropped: an explicit `schema=` already fixes the column
# types, so inference is redundant and would only add an extra pass over the file.
dataset = spark.read.csv(training_path, schema=schema)

# Print the resulting column names and types as a sanity check.
dataset.printSchema()

root
 |-- id: integer (nullable = true)
 |-- game: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- tweet: string (nullable = true)



In [6]:
# Preview the first 20 rows, truncating long cell values (the defaults of show()).
dataset.show(n=20, truncate=True)

                                                                                

+----+-----------+---------+--------------------+
|  id|       game|sentiment|               tweet|
+----+-----------+---------+--------------------+
|2401|Borderlands| Positive|im getting on bor...|
|2401|Borderlands| Positive|I am coming to th...|
|2401|Borderlands| Positive|im getting on bor...|
|2401|Borderlands| Positive|im coming on bord...|
|2401|Borderlands| Positive|im getting on bor...|
|2401|Borderlands| Positive|im getting into b...|
|2402|Borderlands| Positive|So I spent a few ...|
|2402|Borderlands| Positive|So I spent a coup...|
|2402|Borderlands| Positive|So I spent a few ...|
|2402|Borderlands| Positive|So I spent a few ...|
|2402|Borderlands| Positive|2010 So I spent a...|
|2402|Borderlands| Positive|                 was|
|2403|Borderlands|  Neutral|Rock-Hard La Varl...|
|2403|Borderlands|  Neutral|Rock-Hard La Varl...|
|2403|Borderlands|  Neutral|Rock-Hard La Varl...|
|2403|Borderlands|  Neutral|Rock-Hard La Vita...|
|2403|Borderlands|  Neutral|Live Rock - Hard ...|


In [7]:
# Summary statistics (count, mean, stddev, min, max) for the dataset's columns.
dataset.describe().show(n=20, truncate=True)

24/05/05 18:26:51 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------+-----------------+---------------+----------+--------------------+
|summary|               id|           game| sentiment|               tweet|
+-------+-----------------+---------------+----------+--------------------+
|  count|            74682|          74682|     74682|               73996|
|   mean|6432.586165341046|           NULL|      NULL|                 3.2|
| stddev|3740.427870177445|           NULL|      NULL|   2.007130147392398|
|    min|                1|         Amazon|Irrelevant|                    |
|    max|            13200|johnson&johnson|  Positive|🧻 at Home Depot ...|
+-------+-----------------+---------------+----------+--------------------+



                                                                                

In [8]:
# Count the null values per column: each null becomes 1, everything else 0,
# and the flags are summed under the original column name.
null_counts = dataset.select(
    [
        spark_sum(col(column_name).isNull().cast("int")).alias(column_name)
        for column_name in dataset.columns
    ]
)
null_counts.show()

+---+----+---------+-----+
| id|game|sentiment|tweet|
+---+----+---------+-----+
|  0|   0|        0|  686|
+---+----+---------+-----+



# Text Preprocessing

In [9]:
# Report the number of rows before rows with a missing tweet are dropped.
print(f"The training dataset contains {dataset.count()} samples.")

The training dataset contains 74682 samples.


In [10]:
# Discard rows whose tweet text is null; they cannot be tokenized.
dataset = dataset.dropna(how="any", subset=["tweet"])

#### The 'tweet' column in our dataset is in string format, so we cannot use it directly for training. First, we split it into words with a Tokenizer. Then we convert these words into term-frequency vectors. HashingTF and CountVectorizer are two completely separate methods that can both be used for this step; this notebook uses CountVectorizer. We then apply IDF to the term frequencies, which prepares our 'tweet' column for training. Finally, we label the target column with StringIndexer and convert it to double.

    1. Tokenizer : tweet -> words
    2. CountVectorizer : words -> TF
    3. IDF : TF -> features
    


In [11]:
# Feature-extraction stages, chained through their input/output column names.

# tweet -> words
tokenizer = Tokenizer(
    inputCol="tweet",
    outputCol="words",
)

# words -> TF (term counts)
cv = CountVectorizer(
    inputCol="words",
    outputCol="TF",
)

# TF -> features (IDF-weighted)
idf = IDF(
    inputCol="TF",
    outputCol="features",
)

#### Now we convert the categorical target (column = sentiment) into numerical indices. We use StringIndexer for this purpose.
    4. StringIndexer : sentiment -> label

In [12]:
# sentiment -> label; rows with an invalid sentiment value are skipped.
label = StringIndexer(
    inputCol="sentiment",
    outputCol="label",
    handleInvalid="skip",
)

# Multinomial logistic regression over the "features" column.
lr = LogisticRegression(
    family="multinomial",
    featuresCol="features",
    labelCol="label",
)

### If we start with logistic regression, we can put all these pre-processing steps in a pipeline to make it easier to run.

In [13]:
# Full training pipeline: preprocessing stages followed by the classifier.
pipeline = Pipeline(
    stages=[
        tokenizer,  # tweet -> words
        cv,         # words -> TF
        idf,        # TF -> features
        label,      # sentiment -> label
        lr,         # features + label -> prediction
    ]
)

## Chargement du dataset et séparation train/test

In [14]:
# 80/20 train/test split; the fixed seed keeps the split reproducible.
train_set, test_set = dataset.randomSplit(weights=[0.8, 0.2], seed=42)

# Modeling
### Logistic Regression model

In [15]:
# One evaluator per reported metric; all compare "label" against "prediction".
evaluator_accuracy = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
evaluator_f1 = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)
evaluator_weighted_precision = MulticlassClassificationEvaluator(
    metricName="weightedPrecision",
    labelCol="label",
    predictionCol="prediction",
)
evaluator_weighted_recall = MulticlassClassificationEvaluator(
    metricName="weightedRecall",
    labelCol="label",
    predictionCol="prediction",
)

In [None]:
# ParamGridBuilder and CrossValidator are already imported in the imports cell
# at the top of the notebook, so they are not re-imported here.

# Hyper-parameter grid for the logistic-regression stage:
# 4 regParam x 5 maxIter x 3 elasticNetParam = 60 combinations.
# elasticNetParam values are written as floats to match the other float-valued grids.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1, 1.0, 10.0])
    .addGrid(lr.maxIter, [20, 50, 100, 500, 1000])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# 10-fold cross-validation over the whole pipeline, selecting by accuracy.
# With 60 combinations this trains 600 pipelines, so the cell is expensive.
# The seed fixes the fold assignment so reruns pick folds the same way,
# matching the seeded train/test split.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator_accuracy,
    numFolds=10,
    seed=42,
)

# Fit on the training split, then score the held-out test split with the best model.
cvModel = crossval.fit(train_set)
predictions = cvModel.transform(test_set)

24/05/05 18:27:03 WARN DAGScheduler: Broadcasting large task binary with size 1873.0 KiB
24/05/05 18:27:04 WARN DAGScheduler: Broadcasting large task binary with size 1873.7 KiB
24/05/05 18:27:05 WARN DAGScheduler: Broadcasting large task binary with size 1873.7 KiB
24/05/05 18:27:05 WARN DAGScheduler: Broadcasting large task binary with size 1873.7 KiB
24/05/05 18:27:05 WARN DAGScheduler: Broadcasting large task binary with size 1873.7 KiB
24/05/05 18:27:05 WARN DAGScheduler: Broadcasting large task binary with size 1873.7 KiB
24/05/05 18:27:05 WARN DAGScheduler: Broadcasting large task binary with size 1873.7 KiB
24/05/05 18:27:05 WARN DAGScheduler: Broadcasting large task binary with size 1873.7 KiB
24/05/05 18:27:05 WARN DAGScheduler: Broadcasting large task binary with size 1873.7 KiB
24/05/05 18:27:05 WARN DAGScheduler: Broadcasting large task binary with size 1873.7 KiB
24/05/05 18:27:06 WARN DAGScheduler: Broadcasting large task binary with size 1873.7 KiB
24/05/05 18:27:06 WAR

In [None]:
# Score the test-set predictions with each evaluator, in the same order as before:
# accuracy, F1, weighted precision, weighted recall.
accuracy_lr, f1_score_lr, weighted_precision_lr, weighted_recall_lr = (
    metric_evaluator.evaluate(predictions)
    for metric_evaluator in (
        evaluator_accuracy,
        evaluator_f1,
        evaluator_weighted_precision,
        evaluator_weighted_recall,
    )
)

In [None]:
# Print each logistic-regression metric on its own line as "<label> <value>".
print(
    "\n".join(
        f"{metric_label} {metric_value}"
        for metric_label, metric_value in (
            ("Accuracy:", accuracy_lr),
            ("F1 Score:", f1_score_lr),
            ("Weighted Precision:", weighted_precision_lr),
            ("Weighted Recall:", weighted_recall_lr),
        )
    )
)

In [None]:
# Persist the best pipeline selected by cross-validation.
# The fitted cross-validator is `cvModel`; the previous name `pipeline_model`
# is never defined in this notebook and raised a NameError on a fresh kernel.
# The overwriting writer lets the notebook be re-run when the path already exists.
# NOTE: Spark ML persistence writes a directory at this path, not a pickle file,
# despite the ".pkl" suffix; the path is left unchanged for existing consumers.
cvModel.bestModel.write().overwrite().save("Pretrained_LogisticRegression.pkl")