<a href="https://colab.research.google.com/github/Ha-ri-ka/MIT-labs-ai-ml/blob/main/draft2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get update # Update apt-get repository.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # Install Java.
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz # Download Apache Sparks.
!tar xf spark-3.1.1-bin-hadoop3.2.tgz # Unzip the tgz file.
!pip install -q findspark # Install findspark. Adds PySpark to the System path during runtime.

# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

!ls

# Initialize findspark
import findspark
findspark.init()

# Create a PySpark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark

In [92]:
test_path="/content/test.csv"
train_path="/content/train.csv"

In [93]:
data=spark.read.option("header", True).csv(train_path)
test_data=spark.read.option("header", True).csv(test_path)

In [94]:
test_data = test_data.withColumnRenamed("text", 'selected_text')

In [95]:
data.printSchema()

root
 |-- textID: string (nullable = true)
 |-- text: string (nullable = true)
 |-- selected_text: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- Time of Tweet: string (nullable = true)
 |-- Age of User: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Population -2020: string (nullable = true)
 |-- Land Area (Km�): string (nullable = true)
 |-- Density (P/Km�): string (nullable = true)



In [96]:
test_data.printSchema()

root
 |-- textID: string (nullable = true)
 |-- selected_text: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- Time of Tweet: string (nullable = true)
 |-- Age of User: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Population -2020: string (nullable = true)
 |-- Land Area (Km�): string (nullable = true)
 |-- Density (P/Km�): string (nullable = true)



In [97]:
data=data.select(["selected_text","sentiment"])

In [98]:
from pyspark.sql.functions import when, col
data = data.withColumn("sentiment",when(col("sentiment") == "positive", 1).when(col("sentiment") == "negative", -1).otherwise(0).cast("int"))

In [99]:
test_data=test_data.select(["selected_text","sentiment"])

In [100]:
test_data = test_data.withColumn("sentiment",when(col("sentiment") == "positive", 1).when(col("sentiment") == "negative", -1).otherwise(0).cast("int"))

In [101]:
data.show()
data.count()

+--------------------+---------+
|       selected_text|sentiment|
+--------------------+---------+
|I`d have responde...|        0|
|            Sooo SAD|       -1|
|         bullying me|       -1|
|      leave me alone|       -1|
|       Sons of ****,|       -1|
|http://www.dotheb...|        0|
|                 fun|        1|
|          Soooo high|        0|
|         Both of you|        0|
|Wow... u just bec...|        1|
|as much as i love...|        0|
|                like|        1|
|         DANGERously|       -1|
|                lost|       -1|
|test test from th...|        0|
|Uh oh, I am sunbu...|       -1|
|              *sigh*|       -1|
|                sick|       -1|
|                onna|       -1|
|Hes just not that...|        0|
+--------------------+---------+
only showing top 20 rows



27481

In [102]:
data=data.na.drop()
data.count()

27478

In [103]:
test_data.show()
test_data.count()

+--------------------+---------+
|       selected_text|sentiment|
+--------------------+---------+
|Last session of t...|        0|
| Shanghai is also...|        1|
|Recession hit Ver...|       -1|
|         happy bday!|        1|
| http://twitpic.c...|        1|
| that`s great!! w...|        1|
|I THINK EVERYONE ...|       -1|
| soooooo wish i c...|       -1|
| and within a sho...|        0|
| What did you get...|        0|
|My bike was put o...|       -1|
| I checked.  We d...|        0|
| .. and you`re on...|        0|
|I`m in VA for the...|       -1|
|Its coming out th...|       -1|
|So hot today =_= ...|       -1|
|            Miss you|       -1|
|        Cramps . . .|       -1|
| you guys didn`t ...|        1|
|I`m going into a ...|        0|
+--------------------+---------+
only showing top 20 rows



4815

In [104]:
test_data=test_data.na.drop()
test_data.count()

3534

### NLP + ML

In [105]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [119]:
# Tokenize text data
tokenizer = Tokenizer(inputCol="selected_text", outputCol="words")
# Remove stopwords
stopwords_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered_words")
# Convert sentiment labels into numerical labels
indexer = StringIndexer(inputCol="sentiment", outputCol="label", handleInvalid="skip")

## hasingtf,idf,logistic regression

In [120]:
# Convert words to numerical features using HashingTF
hashing_tf = HashingTF(inputCol=stopwords_remover.getOutputCol(), outputCol="rawFeatures")
# Compute IDF
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="features")
# logistic regression
lr = LogisticRegression(featuresCol="features", labelCol="label",maxIter=50)
# Pipeline
lr_pipeline = Pipeline(stages=[tokenizer, stopwords_remover, hashing_tf, idf, indexer, lr])

In [108]:
lr_model=lr_pipeline.fit(data)

In [109]:
lr_predictions = lr_model.transform(test_data)

In [110]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy")
accuracy = evaluator.evaluate(lr_predictions)
print("Test Accuracy:", accuracy*100,"%")

Test Accuracy: 54.41426146010186 %


## count vectorizer, idf, logistic regression

In [121]:
from pyspark.ml.feature import CountVectorizer
cv=CountVectorizer(vocabSize=2**16,inputCol=tokenizer.getOutputCol(),outputCol="cved")
idf_2 = IDF(inputCol=cv.getOutputCol(), outputCol="features")
lr_2 = LogisticRegression(featuresCol="features", labelCol="label",maxIter=50)
lr_pipeline_2 = Pipeline(stages=[tokenizer, stopwords_remover, cv, idf_2, indexer, lr_2])

In [116]:
lr_model_2=lr_pipeline_2.fit(data)

In [117]:
lr_predictions_2 = lr_model_2.transform(test_data)

In [122]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy")
lr_accuracy_2= evaluator.evaluate(lr_predictions_2)
print("Test Accuracy:", accuracy*100,"%")

Test Accuracy: 54.41426146010186 %
