In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Read from HDFS") \
    .getOrCreate()

data = spark.read.text("hdfs://localhost:9000/user/Hadoop/twit2/*")
data.show()


+--------------------+
|               value|
+--------------------+
|      "File \u001...|
|      "File \u001...|
|      "File \u001...|
|      "File \u001...|
|      "traceback": [|
|      "\u001b[1;3...|
|      "\u001b[1;3...|
|      "Cell \u001...|
|      "File \u001...|
|      "Cell \u001...|
|      "File \u001...|
|10951,TomClancysG...|
|10951,TomClancysG...|
|10951,TomClancysG...|
|10952,TomClancysG...|
|10952,TomClancysG...|
|10952,TomClancysG...|
|3333,Facebook,Irr...|
|3334,Facebook,Irr...|
|3334,Facebook,Irr...|
+--------------------+
only showing top 20 rows



In [2]:
from pyspark.sql.functions import col
filtered_data = data.filter(~data["value"].startswith('   ') )
filtered_data.show()

+--------------------+
|               value|
+--------------------+
|10951,TomClancysG...|
|10951,TomClancysG...|
|10951,TomClancysG...|
|10952,TomClancysG...|
|10952,TomClancysG...|
|10952,TomClancysG...|
|3333,Facebook,Irr...|
|3334,Facebook,Irr...|
|3334,Facebook,Irr...|
|3334,Facebook,Irr...|
|3334,Facebook,Irr...|
|3334,Facebook,Irr...|
|3334,Facebook,Irr...|
|3335,Facebook,Irr...|
|3335,Facebook,Irr...|
|3335,Facebook,Irr...|
|286,Amazon,Neutra...|
|286,Amazon,Neutra...|
|286,Amazon,Neutra...|
|286,Amazon,Neutra...|
+--------------------+
only showing top 20 rows



In [3]:
from pyspark.sql.functions import split

split_col = split(filtered_data['value'], ',')

filtered_data = filtered_data.withColumn('id', split_col.getItem(0)) \
                             .withColumn('topic', split_col.getItem(1)) \
                             .withColumn('sentiment', split_col.getItem(2)) \
                             .withColumn('content', split_col.getItem(3))

filtered_data = filtered_data.drop('value')


In [4]:
filtered_data.show()

+-----+--------------------+----------+--------------------+
|   id|               topic| sentiment|             content|
+-----+--------------------+----------+--------------------+
|10951|TomClancysGhostRecon|  Negative|@AskPS_UK  @Ubiso...|
|10951|TomClancysGhostRecon|  Negative|@AskPS_UK 3 @Ubis...|
|10951|TomClancysGhostRecon|  Negative|@AskPS_UK @Ubisof...|
|10952|TomClancysGhostRecon|   Neutral|The Terminator ev...|
|10952|TomClancysGhostRecon|   Neutral|The Terminator ev...|
|10952|TomClancysGhostRecon|   Neutral|"The event dedica...|
| 3333|            Facebook|Irrelevant|Creepy geek not o...|
| 3334|            Facebook|Irrelevant|Halloween died th...|
| 3334|            Facebook|Irrelevant|"Halloween died t...|
| 3334|            Facebook|Irrelevant|"Halloween died t...|
| 3334|            Facebook|Irrelevant|Halloween died th...|
| 3334|            Facebook|Irrelevant|Halloween died th...|
| 3334|            Facebook|Irrelevant|Halloween died th...|
| 3335|            Faceb

## Data Preprocessing

In [5]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, StringIndexer, NGram
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes, LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [6]:
filtered_data = filtered_data.na.drop()




In [7]:
# Tokenize the 'content' column
tokenizer = Tokenizer().setInputCol('content').setOutputCol('words')

# Remove stopwords
stopwords = StopWordsRemover().getStopWords() + ['-']
remover = StopWordsRemover().setStopWords(stopwords).setInputCol('words').setOutputCol('filtered')

# Create bigrams
bigram = NGram().setN(2).setInputCol('filtered').setOutputCol('bigrams')

# Generate features using CountVectorizer
cvmodel = CountVectorizer().setInputCol('filtered').setOutputCol('features')
cvmodel_ngram = CountVectorizer().setInputCol('bigrams').setOutputCol('features')

# Convert 'sentiment' column to a binary label
indexer = StringIndexer().setInputCol('sentiment').setOutputCol('label')


In [8]:
from pyspark.ml import Pipeline

# Define the pipeline stages
pipeline_preprocess = Pipeline(stages=[tokenizer, remover, bigram, cvmodel, indexer])

# Fit the pipeline to your data DataFrame
preprocessed_model = pipeline_preprocess.fit(filtered_data)

# Transform the data DataFrame
preprocessed_data = preprocessed_model.transform(filtered_data)

# Show the first 5 rows of the preprocessed DataFrame
preprocessed_data.show(5)


+-----+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|   id|               topic|sentiment|             content|               words|            filtered|             bigrams|            features|label|
+-----+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|10951|TomClancysGhostRecon| Negative|@AskPS_UK  @Ubiso...|[@askps_uk, , @ub...|[@askps_uk, , @ub...|[@askps_uk ,  @ub...|(57034,[3,66,168,...|  0.0|
|10951|TomClancysGhostRecon| Negative|@AskPS_UK 3 @Ubis...|[@askps_uk, 3, @u...|[@askps_uk, 3, @u...|[@askps_uk 3, 3 @...|(57034,[29,66,110...|  0.0|
|10951|TomClancysGhostRecon| Negative|@AskPS_UK @Ubisof...|[@askps_uk, @ubis...|[@askps_uk, @ubis...|[@askps_uk @ubiso...|(57034,[66,168,63...|  0.0|
|10952|TomClancysGhostRecon|  Neutral|The Terminator ev...|[the, terminator,...| [terminator, event]

## Modeling

In [11]:
trainingData, testData  = filtered_data.randomSplit([0.7, 0.3])

In [13]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline

nb = NaiveBayes(smoothing=1)
pipeline_nb = Pipeline(stages=[tokenizer, remover, cvmodel, indexer, nb])
model_nb = pipeline_nb.fit(trainingData)
predictions_nb = model_nb.transform(testData)

predictions_nb.select('id', 'topic', 'sentiment', 'content', 'prediction').show(5, truncate=False)


+-----+---------------------------------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|id   |topic                            |sentiment |content                                                                                                                                                                                                                                                                                                                                              |prediction|
+-----+---------------------------------+----------+----------------------------------------------------------------------------------------------------------------------------------------

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")


accuracy = evaluator.evaluate(predictions_nb)
print("Accuracy:", accuracy)

Accuracy: 0.7375061299095003


In [14]:
model_nb.write().overwrite().save('C:/Users/hanane/Desktop/BigDataProje/apacheflume/ModelSave/')


In [28]:
from pyspark.ml.classification import LogisticRegression

log_reg = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
pipeline = Pipeline(stages = [tokenizer, remover, cvmodel, indexer, log_reg])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
AUC = evaluator.evaluate(predictions)
print(AUC)


0.3017811930884378


In [30]:
from pyspark.ml.classification import  RandomForestClassifier

rf = RandomForestClassifier().setLabelCol('label').setFeaturesCol('features').setNumTrees(10)
pipeline = Pipeline(stages = [tokenizer, remover, cvmodel, indexer, rf])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
AUC = evaluator.evaluate(predictions)
print(AUC)


0.33940389996890685
