# Question 1
Corona NLP Dataset

In [1]:
# used to create DataFrame
from pyspark.sql import SparkSession 

In [2]:
#to construct SparkSession instances with app name corona
spark = SparkSession.builder.appName('corona').getOrCreate() 

In [3]:
#Reading the csv file 
data=spark.read.csv('Corona_NLP1.csv', header = True, inferSchema=True,sep= ',')

In [4]:
#viewing the dataset
data.show()

+--------------------+--------------------+--------------------+--------------------+----------+--------------------+
|            UserName|          ScreenName|            Location|           Sentiment|   TweetAt|       OriginalTweet|
+--------------------+--------------------+--------------------+--------------------+----------+--------------------+
|                3799|               48751|              London|             Neutral|16-03-2020|@MeNyrbie @Phil_G...|
|                3800|               48752|                  UK|            Positive|16-03-2020|advice Talk to yo...|
|                3801|               48753|           Vagabonds|            Positive|16-03-2020|Coronavirus Austr...|
|                3802|               48754|                null|            Positive|16-03-2020|My food stock is ...|
|              PLEASE|         don't panic| THERE WILL BE EN...|                null|      null|                null|
|           Stay calm|          stay safe.|             

In [5]:
#to print out schema in tree format
data.printSchema()

root
 |-- UserName: string (nullable = true)
 |-- ScreenName: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Sentiment: string (nullable = true)
 |-- TweetAt: string (nullable = true)
 |-- OriginalTweet: string (nullable = true)



In [6]:
#listing down the columns of the data
data.columns

['UserName', 'ScreenName', 'Location', 'Sentiment', 'TweetAt', 'OriginalTweet']

In [7]:
#dropping the duplicate values
data = data.dropDuplicates()
print(data.count())

65074


In [8]:
#dropping null values
data = data.na.drop()
print(data.count())

32621


In [9]:
#defining the list of different sentiments
sentiments = ['Positive','Negative','Neutral','Extremely Positive','Extremely Negative']

In [10]:
#filtering out data in the dataset
data = data.filter(data.Sentiment.isin(sentiments))

In [11]:
#counting the distinct sentiments in the dataset
data.select('Sentiment').distinct().count()
#there are 5 distinct sentiments

5

In [12]:
#showing the distinct sentiments in the dataset
data.select('Sentiment').distinct().show()

+------------------+
|         Sentiment|
+------------------+
|Extremely Negative|
|           Neutral|
|          Positive|
|          Negative|
|Extremely Positive|
+------------------+



# Data Preparation

In [13]:
#Computes the character length of string data or number of bytes of binary data
from pyspark.sql.functions import length

In [14]:
#Viewing the length of each and every tweet
data=data.withColumn('length', length(data['OriginalTweet']))

In [15]:
#Viewing the length of each and every tweet
data.show()

+--------+----------+--------------------+------------------+----------+--------------------+------+
|UserName|ScreenName|            Location|         Sentiment|   TweetAt|       OriginalTweet|length|
+--------+----------+--------------------+------------------+----------+--------------------+------+
|    3926|     48878| ????? ???? ????????|          Negative|16-03-2020|#unpopularopinion...|   175|
|    4155|     49107|      Owensboro, KY |           Neutral|16-03-2020|Just online shopp...|    80|
|    4247|     49199|            New York|          Positive|16-03-2020|I know a lot of g...|   269|
|    4949|     49901|         Houston, TX|          Positive|17-03-2020|Our latest issue ...|   164|
|    5065|     50017|  Manchester, Europe|Extremely Positive|17-03-2020|If you are health...|   202|
|    5322|     50274|      Leeds, England|          Positive|17-03-2020|#COVID2019 local ...|   191|
|    5766|     50718|          upstate NY|          Negative|17-03-2020|Seeing those empt..

In [16]:
#renaming the column
data=data.withColumnRenamed("Sentiment","sentiment")

In [17]:
#calculating the mean length of different sentiments
data.groupby('Sentiment').mean().show()

+------------------+------------------+
|         Sentiment|       avg(length)|
+------------------+------------------+
|Extremely Negative|179.08476571697668|
|           Neutral|134.06076810889644|
|          Positive| 167.5731693929081|
|          Negative|165.74478227261014|
|Extremely Positive|183.49146433990896|
+------------------+------------------+



In [18]:
#Performing tokenization to divide the entire corpus of text into vords, removing the stop words which are 
#basically unnecessary and irrelevant, using count vectorizer, inverse document frequency and string indexer to
#transform the text
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

tokenizer=Tokenizer(inputCol="OriginalTweet", outputCol="token_text")
stopremove=StopWordsRemover(inputCol="token_text", outputCol="stop_tokens")
count_vec=CountVectorizer(inputCol="stop_tokens", outputCol="c_vec")
idf=IDF(inputCol="c_vec", outputCol="tf_idf")

# convert the labels in numbers
label_to_num = StringIndexer(inputCol="sentiment", outputCol='label')

In [19]:
#feature transformer that merges multiple columns into a vector column.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [20]:
cleaned = VectorAssembler(inputCols=['tf_idf','length'], outputCol='features')

# Model Building

In [21]:
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier, DecisionTreeClassifier
#instantitating naivesbayes, randomforest classifier and decision tree classifier
nb=NaiveBayes()
rf=RandomForestClassifier()
dtc=DecisionTreeClassifier()

# Pipeline

In [22]:
from pyspark.ml import Pipeline
#building a pieline for pre-processing the text 
data_prep_pipeline= Pipeline(stages=[label_to_num, tokenizer, stopremove,count_vec, idf,cleaned])

In [23]:
#fitting the model on the data
cleaned_data= data_prep_pipeline.fit(data)

In [24]:
#transforming the data
cleaned_data=cleaned_data.transform(data)

In [25]:
#viewing the cleaned data 
cleaned_data.show()

+--------+----------+--------------------+------------------+----------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|UserName|ScreenName|            Location|         sentiment|   TweetAt|       OriginalTweet|length|label|          token_text|         stop_tokens|               c_vec|              tf_idf|            features|
+--------+----------+--------------------+------------------+----------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|    3926|     48878| ????? ???? ????????|          Negative|16-03-2020|#unpopularopinion...|   175|  1.0|[#unpopularopinio...|[#unpopularopinio...|(80619,[5,56,60,8...|(80619,[5,56,60,8...|(80620,[5,56,60,8...|
|    4155|     49107|      Owensboro, KY |           Neutral|16-03-2020|Just online shopp...|    80|  2.0|[just, online, sh...|[online, shopping...|(806

In [26]:
#extracting only label and features 
cleaned_data=cleaned_data.select(['label', 'features'])

In [27]:
#viewing the selected columns
cleaned_data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(80620,[5,56,60,8...|
|  2.0|(80620,[6,13,14,8...|
|  0.0|(80620,[0,3,7,17,...|
|  0.0|(80620,[2,10,15,3...|
|  3.0|(80620,[0,5,6,17,...|
|  0.0|(80620,[5,16,19,4...|
|  1.0|(80620,[3,7,16,56...|
|  0.0|(80620,[38,45,116...|
|  2.0|(80620,[9,59,91,2...|
|  1.0|(80620,[3,7,45,56...|
|  1.0|(80620,[0,7,47,59...|
|  2.0|(80620,[0,3,8,12,...|
|  4.0|(80620,[0,3,7,17,...|
|  0.0|(80620,[0,13,14,1...|
|  2.0|(80620,[27806,506...|
|  3.0|(80620,[1,6,11,36...|
|  2.0|(80620,[6,665,118...|
|  3.0|(80620,[4,6,51,54...|
|  2.0|(80620,[3,7,8,31,...|
|  2.0|(80620,[0,4,21,36...|
+-----+--------------------+
only showing top 20 rows



# ML Training

In [28]:
#Spilting the data into train and test
(training, testing)=cleaned_data.randomSplit([0.7,0.3])

# Naives Bayes

In [29]:
#fitting the model on the dataset
spam_predictor_nb=nb.fit(training)

In [30]:
#transforming the model
test_results_nb=spam_predictor_nb.transform(testing)

In [31]:
#viewing the results of the test data
test_results_nb.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(80620,[0,1,3,7,6...|[-1972.4818762147...|[0.99996698630983...|       0.0|
|  0.0|(80620,[0,2,12,70...|[-2064.7263066475...|[1.13313039964965...|       1.0|
|  0.0|(80620,[0,3,12,21...|[-1365.0355758765...|[0.99990741851604...|       0.0|
|  0.0|(80620,[0,5,8,23,...|[-885.90026777626...|[1.70549320586324...|       1.0|
|  0.0|(80620,[0,9,12,10...|[-2180.2237815626...|[1.62538794124251...|       4.0|
|  0.0|(80620,[0,9,12,12...|[-1455.4884940237...|[0.99999999999958...|       0.0|
|  0.0|(80620,[0,12,24,3...|[-1484.0008205774...|[1.40300473694068...|       3.0|
|  0.0|(80620,[0,13,14,1...|[-289.77611252761...|[0.98033632604683...|       0.0|
|  0.0|(80620,[0,13,46,8...|[-1774.0938598433...|[0.03009188542530...|       1.0|
|  0.0|(80620,[0

In [32]:
#used for multi classification problem
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 

In [33]:
#evaluating the accuracy of the model
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
 metricName="accuracy")
accuracy = evaluator.evaluate(test_results_nb)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.3827981182245858


# Decision Tree

In [34]:
#fitting the model on the dataset
spam_predictor=dtc.fit(training)

In [35]:
#transforming the model
test_results=spam_predictor.transform(testing)

In [36]:
#viewing the results of the test data
test_results.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(80620,[0,1,3,7,6...|[3102.0,2474.0,14...|[0.30219191427179...|       0.0|
|  0.0|(80620,[0,2,12,70...|[3102.0,2474.0,14...|[0.30219191427179...|       0.0|
|  0.0|(80620,[0,3,12,21...|[3102.0,2474.0,14...|[0.30219191427179...|       0.0|
|  0.0|(80620,[0,5,8,23,...|[38.0,202.0,11.0,...|[0.08920187793427...|       1.0|
|  0.0|(80620,[0,9,12,10...|[3102.0,2474.0,14...|[0.30219191427179...|       0.0|
|  0.0|(80620,[0,9,12,12...|[3102.0,2474.0,14...|[0.30219191427179...|       0.0|
|  0.0|(80620,[0,12,24,3...|[201.0,22.0,12.0,...|[0.55988857938718...|       0.0|
|  0.0|(80620,[0,13,14,1...|[2465.0,2274.0,28...|[0.25671735055196...|       2.0|
|  0.0|(80620,[0,13,46,8...|[3102.0,2474.0,14...|[0.30219191427179...|       0.0|
|  0.0|(80620,[0

In [37]:
#instantiating the classfication evaluator
acc_eval=MulticlassClassificationEvaluator()

In [38]:
#evaluating the accuracy of the model
acc=acc_eval.evaluate(test_results)

In [39]:
print ("Accuracy of the model is::", acc)

Accuracy of the model is:: 0.27065937557048814


# Random Forest

In [40]:
#fitting the model on the dataset
spam_predictor_rf=rf.fit(training)

In [41]:
#transforming the model
test_results_rf=spam_predictor_rf.transform(testing)

In [42]:
#viewing the results of the test data
test_results_rf.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(80620,[0,1,3,7,6...|[5.60104606499630...|[0.28005230324981...|       0.0|
|  0.0|(80620,[0,2,12,70...|[5.53855401644105...|[0.27692770082205...|       0.0|
|  0.0|(80620,[0,3,12,21...|[5.69880669421629...|[0.28494033471081...|       0.0|
|  0.0|(80620,[0,5,8,23,...|[5.56385121263016...|[0.27819256063150...|       0.0|
|  0.0|(80620,[0,9,12,10...|[5.36301256333227...|[0.26815062816661...|       0.0|
|  0.0|(80620,[0,9,12,12...|[5.55481383646610...|[0.27774069182330...|       0.0|
|  0.0|(80620,[0,12,24,3...|[5.6518393691567,...|[0.28259196845783...|       0.0|
|  0.0|(80620,[0,13,14,1...|[5.55481383646610...|[0.27774069182330...|       0.0|
|  0.0|(80620,[0,13,46,8...|[5.46099572704180...|[0.27304978635209...|       0.0|
|  0.0|(80620,[0

In [43]:
#evaluating the accuracy of the model
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
 metricName="accuracy")
accuracy_rf = evaluator.evaluate(test_results_rf)
print("Test set accuracy = " + str(accuracy_rf))

Test set accuracy = 0.28523215381468603


# Model Selection

In [44]:
import pandas as pd
model_errors= pd.DataFrame({
    "Model": ["DecisionTreeClassifier","RandomForestClassifier", "NaivesBayes"],
    "Score": [acc, accuracy,accuracy_rf]
})

In [45]:
model_errors.sort_values(by='Score',ascending=True)

Unnamed: 0,Model,Score
0,DecisionTreeClassifier,0.270659
2,NaivesBayes,0.285232
1,RandomForestClassifier,0.382798


Random Forest classifier is giving the best accuracy hence, I have selected random forest