## Setting up the environment.

In [None]:
# getting spark ready

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
!tar xf spark-3.0.0-bin-hadoop3.2.tgz
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"

import findspark
findspark.init()

In [None]:
#importing modules
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover


In [None]:
# creating spark session
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session with master "local", application name
# "Colab" and the Spark UI configured on port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

## Loading and Viewing the Data

In [None]:
# Read the raw tweets CSV from Google Drive. No header option is passed, so
# the columns come back as _c0.._c5; their types are inferred by Spark.
tweets = spark.read.csv(
    "/content/drive/MyDrive/Sentiment Analysis/tweets.csv",
    inferSchema=True,
)

In [None]:
# Preview the first five raw rows without truncating the tweet text.
tweets.show(5, truncate=False)

+---+----------+----------------------------+--------+---------------+-------------------------------------------------------------------------------------------------------------------+
|_c0|_c1       |_c2                         |_c3     |_c4            |_c5                                                                                                                |
+---+----------+----------------------------+--------+---------------+-------------------------------------------------------------------------------------------------------------------+
|0  |1467810369|Mon Apr 06 22:19:45 PDT 2009|NO_QUERY|_TheSpecialOne_|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|
|0  |1467810672|Mon Apr 06 22:19:49 PDT 2009|NO_QUERY|scotthamilton  |is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!    |
|0  |1467810917|Mon Apr 06 22:19:53 PDT 2009|NO_QUERY|mattycus   

In [None]:
# We only need two columns for the model: the tweet text (_c5) and its
# polarity (_c0).
#
# NOTE(review): the 1.6M-row size suggests this is the Sentiment140 dump,
# which encodes polarity as 0 (negative) / 4 (positive) - confirm against the
# file. Spark ML classifiers expect labels to be class indices
# 0..numClasses-1, so a raw label of 4 can never be matched by a two-class
# model's predictions (which would explain a Naive Bayes accuracy of roughly
# half the expected value). Remap 4 -> 1; any other value is passed through
# unchanged, so data that is already 0/1 encoded is unaffected.
polarity = col("_c0").cast("Int")
data = tweets.select(
    col("_c5").alias("tweet"),
    when(polarity == 4, 1).otherwise(polarity).alias("label"),
)

In [None]:
# Preview the first five (tweet, label) rows.
data.show(n=5)

+--------------------+-----+
|               tweet|label|
+--------------------+-----+
|@switchfoot http:...|    0|
|is upset that he ...|    0|
|@Kenichan I dived...|    0|
|my whole body fee...|    0|
|@nationwideclass ...|    0|
+--------------------+-----+
only showing top 5 rows



## Preprocessing the Data

In [None]:
# Dividing data to train and test (70% / 30%).
# A fixed seed keeps the split - and every metric computed from it - stable
# across kernel restarts; without it each run trains and scores on
# different rows.
df = data.randomSplit([0.7, 0.3], seed=42)
train_df, test_df = df
train_df.count(), test_df.count()

(1119935, 480065)

In [None]:
# creating a function to preprocess data for our model
def preprocess(data):
  """Turn a (tweet, label) DataFrame into a (features, label) DataFrame.

  The input must have a "tweet" column and a "label" column. Three
  transformers are applied in order:
    1. Tokenizer        - "tweet" -> "tweetTokens" (list of words)
    2. StopWordsRemover - "tweetTokens" -> "withoutStopWords"
    3. HashingTF        - "withoutStopWords" -> "features" (numeric vector)

  Returns a DataFrame containing only the "features" and "label" columns.
  """
  # Build the three stages up front, wiring each one to the previous
  # stage's output column.
  tokenize = Tokenizer(inputCol="tweet", outputCol="tweetTokens")
  drop_stop_words = StopWordsRemover(inputCol=tokenize.getOutputCol(),
                                     outputCol="withoutStopWords")
  hash_terms = HashingTF(inputCol=drop_stop_words.getOutputCol(),
                         outputCol="features")

  # Push the frame through the stages in order.
  staged = data
  for stage in (tokenize, drop_stop_words, hash_terms):
    staged = stage.transform(staged)

  # Intermediate token columns are not needed downstream.
  return staged.select('features', 'label')

  


In [None]:
# Run both splits through the same preprocessing function.
train_data, test_data = preprocess(train_df), preprocess(test_df)

In [None]:
# Preview the first five (features, label) rows of the training set.
train_data.show(n=5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(262144,[76764,23...|    0|
|(262144,[23825,74...|    0|
|(262144,[89833,16...|    0|
|(262144,[1512,125...|    0|
|(262144,[61899,23...|    0|
+--------------------+-----+
only showing top 5 rows



# Training Models

In [None]:
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Shared evaluator: compares the "prediction" column against "label" and
# reports accuracy. Reused for every model trained below.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)

## Naive Bayes

In [None]:
# Fit a multinomial Naive Bayes classifier (smoothing = 1.0) on the
# preprocessed training data.
nb = NaiveBayes(modelType="multinomial", smoothing=1.0)
nb_model = nb.fit(train_data)

In [None]:
# Score the held-out test data with the Naive Bayes model and print the
# accuracy reported by the shared evaluator.
nb_predictions = nb_model.transform(test_data)
nb_accuracy = evaluator.evaluate(nb_predictions)
print(nb_accuracy)

0.385841500630123


## Logistic Regression

In [None]:
# Fit a logistic regression classifier on the preprocessed training data:
# at most 10 iterations, regularisation parameter 0.3.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.3,
)
lr_model = lr.fit(train_data)

In [None]:
# Score the held-out test data with the logistic regression model and print
# the accuracy reported by the shared evaluator.
lr_predictions = lr_model.transform(test_data)
lr_accuracy = evaluator.evaluate(lr_predictions)
print(lr_accuracy)

0.7411267224230053


As logistic regression scored higher, we will proceed with it.

In [None]:
# saving lr model
# write().overwrite() keeps this cell re-runnable: a plain save() raises
# when the target directory already exists from an earlier run.
# Only the fitted classifier is persisted; the tokenize / stop-word /
# hashing steps from preprocess() must be re-applied to any data scored
# with the reloaded model.
lr_model.write().overwrite().save("/content/drive/MyDrive/Sentiment Analysis/lr")