Data (MCU dialogue): https://www.kaggle.com/pdunton/marvel-cinematic-universe-dialogue?select=mcu_subset.csv
Data (NRC lexicon): https://www.kaggle.com/andradaolteanu/bing-nrc-afinn-lexicons?select=NRC.csv

In [18]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover, RegexTokenizer, PCA
from pyspark.mllib.regression import LabeledPoint
from IPython.display import Image
from pyspark.sql import SparkSession
import IPython
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator



# Collecting The Infinity Stones

### AKA Cleaning the dataset

![display image](https://media.giphy.com/media/3oxHQjRHcp4w9oi24M/giphy.gif)

In [2]:
def init_spark():
    """Build the notebook's SparkSession and return it.

    The session is obtained through ``getOrCreate()``, so the builder
    settings below are applied when a new session has to be created.
    """
    builder = SparkSession.builder.appName("Python Spark SQL basic example")
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

In [3]:
# Start the Spark session, then load the labelled Reddit comments.
# The first CSV row supplies the column names and Spark infers the column types.
# (An earlier iteration of this notebook read 'data/tweets.csv' instead.)
spark = init_spark()
read_csv = spark.read.csv(
    'data/Reddit_Data_utf8.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Build the labelled table used for training and testing:
#   * keep the comment text and the category cast to int (renamed "label"),
#   * drop rows containing nulls and exact duplicate rows,
#   * remap label -1 to 2 (presumably because the classifier expects
#     non-negative class indices - TODO confirm),
#   * keep 1000 rows.
#
# limit() after dropDuplicates() has no defined row order, and Spark
# re-evaluates the plan for every action. Caching pins one concrete sample so
# that later actions (show, split, count, fit) all see the same 1000 rows.
# (The earlier tweets dataset used the columns "SentimentText"/"Sentiment".)
data = (
    read_csv
    .select("clean_comment", col("category").cast("Int").alias("label"))
    .dropna()
    .dropDuplicates()
    .replace(-1, 2)
    .limit(1000)
    .cache()
)
data.show(10)

+--------------------+-----+
|       clean_comment|label|
+--------------------+-----+
|surprised modis i...|    1|
|     naga downs ing |    0|
| has been decided...|    0|
|how difficult was...|    1|
|yes now are going...|    0|
|every question yo...|    0|
|  the plot thickens |    0|
|this compelling y...|    2|
|video showing mh3...|    0|
|brace yourself th...|    0|
+--------------------+-----+
only showing top 10 rows



In [5]:
# 70/30 train/test split. A fixed seed makes the split repeatable for a given
# input DataFrame, so reported metrics can be reproduced on re-run.
split = data.randomSplit([0.7, 0.3], seed=42)
trainingData = split[0]
testingData = split[1]
print ("Training data has", trainingData.count(), 'rows.')
print ("Testing data has", testingData.count(), 'rows.')

Training data has 716 rows.
Testing data has 284 rows.


## Cleaning The Data (Tokenizing and Stop Word Removing)

In [6]:
# Column holding the raw comment text (an earlier dataset used "SentimentText").
inputCol = "clean_comment"

# Tokens are separated by runs of punctuation and/or whitespace; the second
# stage filters stop words out of the token list.
tokenizer = RegexTokenizer(
    inputCol=inputCol,
    outputCol="Tokens",
    pattern=r'(?:\p{Punct}|\s)+',
)
swr = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="NoStopWords",
)

# Run both stages over the training split, then over the testing split.
token_train = tokenizer.transform(trainingData)
token_test = tokenizer.transform(testingData)
nosw_train = swr.transform(token_train)
nosw_test = swr.transform(token_test)

nosw_train.show(n=10, truncate=True)
nosw_test.show(n=10, truncate=True)

+--------------------+-----+--------------------+--------------------+
|       clean_comment|label|              Tokens|         NoStopWords|
+--------------------+-----+--------------------+--------------------+
| 10lac suit 100rs...|    0|[10lac, suit, 100...|[10lac, suit, 100...|
| 2015 utc 23modi ...|    0|[2015, utc, 23mod...|[2015, utc, 23mod...|
| about economics ...|    1|[about, economics...|[economics, gujar...|
| adam sniff knowi...|    1|[adam, sniff, kno...|[adam, sniff, kno...|
| aerospace engine...|    0|[aerospace, engin...|[aerospace, engin...|
| all the signs em...|    1|[all, the, signs,...|[signs, emergency...|
| also appreciate ...|    1|[also, appreciate...|[also, appreciate...|
| also said easy f...|    1|[also, said, easy...|[also, said, easy...|
|     are the legion |    0|  [are, the, legion]|            [legion]|
| are you saying t...|    0|[are, you, saying...|[saying, tally, b...|
+--------------------+-----+--------------------+--------------------+
only s

## Hashing The Features using HashingTF

In [7]:
# Hash the stop-word-filtered tokens into a sparse term-frequency vector.
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")

# Both splits keep the same three columns with the same spelling. The test
# frame previously selected 'Label', which renamed the column and only worked
# because Spark resolves column names case-insensitively by default; later
# references spelled "Label" keep resolving under that same default.
hash_train = hashTF.transform(nosw_train).select(
    'label', 'Tokens', 'features')

hash_test = hashTF.transform(nosw_test).select(
    'label', 'Tokens', 'features')
hash_train.show(n=5)
hash_test.show(n=5)

+-----+--------------------+--------------------+
|label|              Tokens|            features|
+-----+--------------------+--------------------+
|    0|[10lac, suit, 100...|(262144,[90138,18...|
|    0|[2015, utc, 23mod...|(262144,[30914,75...|
|    1|[about, economics...|(262144,[32958,49...|
|    1|[adam, sniff, kno...|(262144,[4757,288...|
|    0|[aerospace, engin...|(262144,[216916,2...|
+-----+--------------------+--------------------+
only showing top 5 rows

+-----+--------------------+--------------------+
|Label|              Tokens|            features|
+-----+--------------------+--------------------+
|    0|[5ppr, for, flex,...|(262144,[5972,147...|
|    1|[add, some, conte...|(262144,[2306,853...|
|    1|[all, the, people...|(262144,[41809,61...|
|    1|[and, honest, see...|(262144,[8807,323...|
|    1|[anything, would,...|(262144,[16293,34...|
+-----+--------------------+--------------------+
only showing top 5 rows



# Training 

In [8]:
# Multinomial logistic regression estimator; every other parameter is left at
# its default value.
mlor = LogisticRegression(family="multinomial")

In [9]:
# Fit the classifier on the hashed training split.
model = mlor.fit(hash_train)

In [10]:
# Score the hashed test split with the fitted model.
# To inspect the full output, run: prediction.show(100)
prediction = model.transform(hash_test)

# Model Accuracy

In [11]:
# Keep only the columns needed to compare each prediction with its true label.
predictionFinal = prediction.select(["Tokens", "prediction", "Label"])
predictionFinal.show(100)

+--------------------+----------+-----+
|              Tokens|prediction|Label|
+--------------------+----------+-----+
|[5ppr, for, flex,...|       1.0|    0|
|[add, some, conte...|       1.0|    1|
|[all, the, people...|       1.0|    1|
|[and, honest, see...|       1.0|    1|
|[anything, would,...|       0.0|    1|
|[bjp, ministers, ...|       1.0|    0|
|[breaking, news, ...|       0.0|    0|
|[constitution, ne...|       1.0|    0|
|[day, will, come,...|       0.0|    0|
|[didn, also, said...|       1.0|    1|
|[don, see, how, t...|       0.0|    2|
|[don, see, why, n...|       2.0|    1|
|[echo, the, senti...|       1.0|    1|
|[explained, this,...|       1.0|    2|
|[feel, like, part...|       0.0|    0|
|[feel, podcasts, ...|       1.0|    1|
|[feels, this, cou...|       1.0|    1|
|[few, qustions, w...|       1.0|    1|
|[game, just, incr...|       1.0|    1|
|[gentle, why, was...|       1.0|    0|
|   [get, girlfriend]|       0.0|    0|
|[good, see, him, ...|       1.0|    1|


In [12]:
# Accuracy = share of scored rows whose predicted class equals the true label.
total = predictionFinal.count()
match = predictionFinal.filter(col('prediction') == col('label')).count()
print("Accuracy:", match / total)

Accuracy: 0.573943661971831


In [None]:
# The evaluator only reads the label and prediction columns, so drop the tokens.
temp = predictionFinal.drop('Tokens')

# A single evaluator is reused; the metric is chosen per call through a
# parameter override instead of building one evaluator per metric.
evaluatorMulti = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="Label",
)

f1, weightedPrecision, weightedRecall = (
    evaluatorMulti.evaluate(temp, {evaluatorMulti.metricName: metric})
    for metric in ("f1", "weightedPrecision", "weightedRecall")
)

print("Summary Stats")
print("Precision = %s" % weightedPrecision)
print("Recall = %s" % weightedRecall)
print("F1 Score = %s" % f1)

In [22]:
# NOTE(review): this cell is fully commented out and does nothing when run.
# It sketches an alternative evaluation through the RDD-based MulticlassMetrics
# API. `predictionAndLabels` is not assigned by any active code in this
# notebook, so it would have to be defined before re-enabling the lines below.
# metrics = MulticlassMetrics(predictionAndLabels)

# # Overall statistics
# precision = metrics.precision(1.0)
# recall = metrics.recall(1.0)
# f1Score = metrics.fMeasure(1.0)
# print("Summary Stats")
# print("Precision = %s" % precision)
# print("Recall = %s" % recall)
# print("F1 Score = %s" % f1Score)

# Avengers Assemble

![display image](https://media.giphy.com/media/j2pWZpr5RlpCodOB0d/giphy.gif)

In [None]:
# Load the MCU dialogue subset (header row present, column types inferred)
# and report how many rows it contains.
mcu_csv = spark.read.csv(
    'data/mcu_subset.csv',
    header=True,
    inferSchema=True,
)
print("Lines of Dialogue:", mcu_csv.count())

In [None]:
# Keep the speaker and the spoken line. Rows whose "line" is null are dropped
# here because the tokenising step that follows operates on that string column
# and a null value is expected to fail the job there (the Reddit data gets the
# same treatment via dropna()).
# NOTE: this rebinds `data`, which earlier in the notebook held the Reddit table.
data = mcu_csv.select("character", "line").dropna(subset=["line"])
data.show(n=10)

In [None]:
# Tokenise the dialogue with the same punctuation/whitespace pattern that was
# used on the training comments. The plain whitespace Tokenizer used before
# leaves punctuation attached to words ("stark," vs "stark"), so those tokens
# hash to different feature indices than the ones the model was trained on.
t = RegexTokenizer(pattern=r'(?:\p{Punct}|\s)+', inputCol="line", outputCol="new_line")
swr_MCU = StopWordsRemover(inputCol=t.getOutputCol(),
                           outputCol="new")
token_MCU = t.transform(data)
nosw_MCU = swr_MCU.transform(token_MCU)

nosw_MCU.show(n=10)

In [None]:
# Hash the filtered MCU tokens into the "features" column the model reads.
# Like the training-side HashingTF, this one is built with default settings.
hashTF = HashingTF(
    inputCol=swr_MCU.getOutputCol(),
    outputCol="features",
)
hash_MCU = (
    hashTF
    .transform(nosw_MCU)
    .select('new_line', 'features')
)
hash_MCU.show(3)

In [None]:
# Apply the fitted model to the MCU lines and keep the tokens next to the
# predicted class for inspection.
prediction = model.transform(hash_MCU)
predictionFinal_mcu = prediction.select(["new_line", "prediction"])
predictionFinal_mcu.show(300)

In [None]:
# Count how many MCU lines were assigned to each predicted class.
test = (
    predictionFinal_mcu
    .groupBy('prediction')
    .count()
)
test.show()