data: https://www.kaggle.com/pdunton/marvel-cinematic-universe-dialogue?select=mcu_subset.csv
data NRC : https://www.kaggle.com/andradaolteanu/bing-nrc-afinn-lexicons?select=NRC.csv

In [243]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover
from IPython.display import Image
import IPython

# Collecting The Infinity Stones

## AKA Cleaning the dataset

![display image](https://media.giphy.com/media/3oxHQjRHcp4w9oi24M/giphy.gif)

In [244]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark

In [245]:
spark = init_spark()
tweets = spark.read.csv('data/tweets.csv', inferSchema=True, header=True)

In [246]:
tweets_data = tweets.select("SentimentText", col("Sentiment").cast("Int").alias("label"))
tweets_data.show(truncate = False,n=5)

+---------------------------------+-----+
|SentimentText                    |label|
+---------------------------------+-----+
|that film is fantastic #brilliant|1    |
|this music is really bad #myband |1    |
|winter is terrible #thumbs-down  |0    |
|this game is awful #nightmare    |0    |
|I love jam #loveit               |1    |
+---------------------------------+-----+
only showing top 5 rows



In [247]:
split = data.randomSplit([0.7, 0.3]) 
train_rows = split[0].count()
test_rows = split[1].count()
print ("Training data has", train_rows, 'rows.')
print ("Testing data has", test_rows, 'rows.')

Training data has 4547 rows.
Testing data has 1962 rows.


## Cleaning The Data (Tokenizing and Stop Word Removing)

In [257]:
tokenizer = Tokenizer(inputCol="SentimentText", outputCol="Tokens")
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), 
                       outputCol="NoStopWords")

token_train = tokenizer.transform(trainingData)
nosw_train = swr.transform(token_train)


token_test = tokenizer.transform(testingData)
nosw_test = swr.transform(token_test)

nosw_train.show(truncate=False, n=2)
nosw_test.show(truncate=False, n=2)

+-------------------------+-----+------------------------------+---------------------------+
|SentimentText            |label|Tokens                        |NoStopWords                |
+-------------------------+-----+------------------------------+---------------------------+
|I adore cheese #bestever |1    |[i, adore, cheese, #bestever] |[adore, cheese, #bestever] |
|I adore cheese #brilliant|1    |[i, adore, cheese, #brilliant]|[adore, cheese, #brilliant]|
+-------------------------+-----+------------------------------+---------------------------+
only showing top 2 rows

+-------------------------+-----+------------------------------+---------------------------+
|SentimentText            |label|Tokens                        |NoStopWords                |
+-------------------------+-----+------------------------------+---------------------------+
|I adore cheese #toptastic|1    |[i, adore, cheese, #toptastic]|[adore, cheese, #toptastic]|
|I adore jam #brilliant   |1    |[i, adore, j

## Hashing The Features using HashingTF

In [258]:
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")
hash_train = hashTF.transform(nosw_train).select(
    'label', 'Tokens', 'features')

hash_test = hashTF.transform(nosw_test).select(
    'Label', 'Tokens', 'features')
hash_train.show(truncate=False, n=3)
hash_test.show(truncate=False, n=3)

+-----+------------------------------+-------------------------------------------+
|label|Tokens                        |features                                   |
+-----+------------------------------+-------------------------------------------+
|1    |[i, adore, cheese, #bestever] |(262144,[1689,91011,100089],[1.0,1.0,1.0]) |
|1    |[i, adore, cheese, #brilliant]|(262144,[1689,45361,100089],[1.0,1.0,1.0]) |
|1    |[i, adore, cheese, #favorite] |(262144,[1689,100089,108624],[1.0,1.0,1.0])|
+-----+------------------------------+-------------------------------------------+
only showing top 3 rows

+-----+------------------------------+--------------------------------------------+
|Label|Tokens                        |features                                    |
+-----+------------------------------+--------------------------------------------+
|1    |[i, adore, cheese, #toptastic]|(262144,[1689,42010,100089],[1.0,1.0,1.0])  |
|1    |[i, adore, jam, #brilliant]   |(262144,[45361,10008

# Training 

In [259]:
lr = LogisticRegression(labelCol="label", featuresCol="features", 
                        maxIter=10, regParam=0.01)
model = lr.fit(numericTrainData)

# Accuracy model

In [261]:
prediction = model.transform(numericTest)
predictionFinal = prediction.select(
    "Tokens", "prediction", "Label")
predictionFinal.show(n=4, truncate = False)

+------------------------------+----------+-----+
|Tokens                        |prediction|Label|
+------------------------------+----------+-----+
|[i, adore, cheese, #toptastic]|1.0       |1    |
|[i, adore, jam, #brilliant]   |1.0       |1    |
|[i, adore, jam, #thumbs-up]   |1.0       |1    |
|[i, adore, jam, #toptastic]   |1.0       |1    |
+------------------------------+----------+-----+
only showing top 4 rows



In [268]:
correctPrediction = predictionFinal.filter(predictionFinal['prediction'] == predictionFinal['Label']).count()
tn = predictionFinal.filter((predictionFinal['prediction'] == 0) & (predictionFinal['label'] == 0)).count()
tp = predictionFinal.filter((predictionFinal['prediction'] == 1) & (predictionFinal['label'] == 1)).count()
total = predictionFinal.count()
print("True Positive:", correctPrediction, ", total data:", totalData, 
      ", accuracy:", correctPrediction/total)
print("TP", tn)
print("TN", tp)

True Positive: 371 , total data: 376 , accuracy: 0.9867021276595744
TP 196
TN 175


# Avengers Assemble

![display image](https://media.giphy.com/media/j2pWZpr5RlpCodOB0d/giphy.gif)

In [269]:
mcu_csv = spark.read.csv('data/mcu_subset.csv', inferSchema=True, header=True)
print(mcu_csv.count())

6509


In [270]:
data = mcu_csv.select("character","line")
data.show(truncate = False,n=10)

+------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|character   |line                                                                                                                                                                                                                                                                                                                                                                 |
+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [271]:
t = Tokenizer(inputCol="line", outputCol="new_line")
swr_MCU = StopWordsRemover(inputCol=t.getOutputCol(), 
                       outputCol="new")
token_MCU = t.transform(data)
nosw_MCU = swr_MCU.transform(token_MCU)

nosw_MCU.show(truncate=False, n=1)

+----------+------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------+
|character |line                                                                                      |new_line                                                                                                         |new                                                                     |
+----------+------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------+
|TONY STARK|Oh, I get it.  You guys aren’t allowed to talk.  Is that it?  Are you not allowed to talk?|[oh,, i, get, it., , you

In [272]:
hashTF = HashingTF(inputCol=swr_MCU.getOutputCol(), outputCol="features")
hash_MCU = hashTF.transform(SwRemovedMCU).select('new_line', 'features')
hash_MCU.show(truncate=False, n=3)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|new_line                                                                                                                                                                                     |features                                                                                                                                                                                          |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------

In [273]:
prediction = model.transform(numeric_MCU)
predictionFinal_mcu = prediction.select(
    "new_line", "prediction")
predictionFinal_mcu.show(n=300, truncate = False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [274]:
test = predictionFinal_mcu.groupBy('prediction').count()
test.show(n =5)

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0| 6187|
|       1.0|  322|
+----------+-----+

