In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=67b54beed13b3cce06bd7e3947fec3ed59dae25eb4a3ae292df05802a06fc0b9
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *

In [None]:
sc =SparkContext()
sqlContext = SQLContext(sc)



In [None]:
customSchema = StructType([
    StructField("clean_text", StringType()),
    StructField("category", StringType())])

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
filename = '/content/drive/MyDrive/QTDL/Report/Sentiment-Analysis-using-Pyspark-on-Multi-Social-Media-Data/twtr_dataset.csv'


In [None]:
from pyspark.sql.functions import regexp_replace

df = sqlContext.read.format("csv").option("header", "true").schema(customSchema).load(filename)

In [None]:
data = df.na.drop(how='any')
data.show(10)

+--------------------+--------+
|          clean_text|category|
+--------------------+--------+
|pritam das mukerj...|      -1|
|’ insult chowkida...|      -1|
|modi big ineffici...|       1|
|mega analysis yea...|       0|
|modi isn’ going v...|       1|
|why only bjp mall...|      -1|
|there’ looking ba...|       1|
|scam after scam h...|       0|
|just seven days b...|       1|
|making big promis...|       1|
+--------------------+--------+
only showing top 10 rows



Model Pipeline

In [None]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="clean_text", outputCol="words", pattern="\\W")

# stop words
stop_words = list(stopwords.words('english'))
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_words)

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")

In [None]:
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])

pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset = dataset.dropDuplicates(['clean_text'])

(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
logistic_regression = LogisticRegression(featuresCol='features',
                        labelCol='label',
                        family='multinomial',
                        maxIter=20,
                        regParam=0.3,
                        elasticNetParam=0)

lrModel = logistic_regression.fit(trainingData)


In [None]:
dataset.select("clean_text", "words").show(truncate=False, n = 2)

+---------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|clean_text                                                                 |words                                                                                |
+---------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|    "                                                                      |[]                                                                                   |
|  all the best dear brother goddess chamundeshwari bless you vote for modi |[all, the, best, dear, brother, goddess, chamundeshwari, bless, you, vote, for, modi]|
+---------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
only showing top

In [None]:
dataset.show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|cl

In [None]:
dataset.select("rawFeatures", "features").show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|rawFeatures                                                                                                                                                                         |features                                                                                                                                                                                                                                                        

In [None]:
testData.show()

+--------------------+--------+--------------------+--------------------+--------------------+--------------------+-----+
|          clean_text|category|               words|            filtered|         rawFeatures|            features|label|
+--------------------+--------+--------------------+--------------------+--------------------+--------------------+-----+
|  — feeling inspired|       0| [feeling, inspired]| [feeling, inspired]|(262144,[137733,2...|(262144,[137733,2...|  1.0|
| are you living w...|       0|[are, you, living...|[living, wonder, ...|(262144,[29066,40...|(262144,[29066,40...|  1.0|
| astrologer from ...|       1|[astrologer, from...|[astrologer, patn...|(262144,[3659,409...|(262144,[3659,409...|  0.0|
| between what was...|      -1|[between, what, w...|[asked, responses...|(262144,[43237,46...|(262144,[43237,46...|  2.0|
|    bhaunke hazzar "|       0|   [bhaunke, hazzar]|   [bhaunke, hazzar]|(262144,[27237,34...|(262144,[27237,34...|  1.0|
| chinas antisatel...|  

In [None]:
predictions = lrModel.transform(testData)
predictions.select("rawPrediction", "probability", "prediction").show(truncate=False, n=10)

+--------------------------------------------------------------+-------------------------------------------------------------+----------+
|rawPrediction                                                 |probability                                                  |prediction|
+--------------------------------------------------------------+-------------------------------------------------------------+----------+
|[0.07771163316623936,0.651264258474753,-0.7289758916409924]   |[0.3104727772619628,0.5509526379715356,0.13857458476650172]  |1.0       |
|[-0.06243799760335658,0.4296136744027645,-0.36717567679940843]|[0.2964732668681396,0.48493202382155715,0.21859470931030342] |1.0       |
|[1.136288026992876,0.07354616633267475,-1.2098341933255512]   |[0.6938436535795051,0.23972796263578877,0.0664283837847062]  |0.0       |
|[0.9830347643272623,0.03206941812880182,-1.0151041824560645]  |[0.6570495426607753,0.2538628287476741,0.08908762859155069]  |0.0       |
|[-0.02774965318588185,0.719683161

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.7545254540452316

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(logistic_regression.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(logistic_regression.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=logistic_regression, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)
# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

# Naive Bayes

In [None]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("clean_text","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                    clean_text|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|thanks for making india gre...|       1|[1.0,9.855980053657911E-17,...|  0.0|       0.0|
|                    wonderful |       1|[1.0,9.828265719978505E-17,...|  0.0|       0.0|
|the author famous for disto...|       1|[1.0,9.689430390831281E-17,...|  0.0|       0.0|
|accept that there are raga ...|       1|[1.0,9.641584415362583E-17,...|  0.0|       0.0|
|chowkidar transfers officer...|       1|[1.0,9.576375907228116E-17,...|  0.0|       0.0|
|chowkidar transfers officer...|       1|[1.0,9.576375907228116E-17,...|  0.0|       0.0|
|proud have you our karta dh...|       1|[1.0,9.371160145449996E-17,...|  0.0|       0.0|
|this persons family has gav...|       1|[1.0,8.265387997319024E-17,...|  0.0|       0.0|
|good fort

In [None]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.6411198282277482

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create initial Naïve Bayes model
nb = NaiveBayes(labelCol="label", featuresCol="features")

# Create ParamGrid for Cross Validation
nbparamGrid = (ParamGridBuilder()
               .addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
               .build())

# Evaluate model
nbevaluator = MulticlassClassificationEvaluator(predictionCol="prediction")

# Create 5-fold CrossValidator
nbcv = CrossValidator(estimator = nb,
                    estimatorParamMaps = nbparamGrid,
                    evaluator = nbevaluator,
                    numFolds = 5)

# Run cross validations
nbcvModel = nbcv.fit(trainingData)
print(nbcvModel)

# Use test set here so we can measure the accuracy of our model on new data
nbpredictions = nbcvModel.transform(testData)

# cvModel uses the best model found from the Cross Validation
# Evaluate best model
print('Accuracy:', nbevaluator.evaluate(nbpredictions))

CrossValidatorModel_2f919e320cd5
Accuracy: 0.6696806624304865
