data: https://www.kaggle.com/pdunton/marvel-cinematic-universe-dialogue?select=mcu_subset.csv
data NRC : https://www.kaggle.com/andradaolteanu/bing-nrc-afinn-lexicons?select=NRC.csv

In [495]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover, RegexTokenizer
from IPython.display import Image
import IPython

# Collecting The Infinity Stones

### AKA Cleaning the dataset

![display image](https://media.giphy.com/media/3oxHQjRHcp4w9oi24M/giphy.gif)

In [482]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark

In [483]:
spark = init_spark()
#read_csv = spark.read.csv('data/tweets.csv', inferSchema=True, header=True)
read_csv = spark.read.csv('data/Reddit_Data_utf8.csv', inferSchema=True, header=True)

In [484]:
#data = read_csv.select("SentimentText", col("Sentiment").cast("Int").alias("label"))
data = read_csv.select("clean_comment", col("category").cast("Int").alias("label")).dropna().dropDuplicates()
data.show(10)

+--------------------+-----+
|       clean_comment|label|
+--------------------+-----+
|surprised modis i...|    1|
|     naga downs ing |    0|
| has been decided...|    0|
|how difficult was...|    1|
|yes now are going...|    0|
|every question yo...|    0|
|  the plot thickens |    0|
|this compelling y...|   -1|
|video showing mh3...|    0|
|brace yourself th...|    0|
+--------------------+-----+
only showing top 10 rows



In [485]:
split = data.randomSplit([0.7, 0.3])
trainingData = split[0]
testingData = split[1]
print ("Training data has", split[0].count(), 'rows.')
print ("Testing data has", split[1].count(), 'rows.')

Training data has 25465 rows.
Testing data has 11074 rows.


## Cleaning The Data (Tokenizing and Stop Word Removing)

In [486]:
#inputCol = "SentimentText"
inputCol = "clean_comment"

tokenizer = RegexTokenizer(pattern=r'(?:\p{Punct}|\s)+', inputCol=inputCol, outputCol="Tokens")
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="NoStopWords")

token_train = tokenizer.transform(trainingData)
nosw_train = swr.transform(token_train)

token_test = tokenizer.transform(testingData)
nosw_test = swr.transform(token_test)

nosw_train.show(truncate=True, n=10)
nosw_test.show(truncate=True, n=10)

+--------------------+-----+--------------------+--------------------+
|       clean_comment|label|              Tokens|         NoStopWords|
+--------------------+-----+--------------------+--------------------+
| 5ppr for flex sp...|    0|[5ppr, for, flex,...|[5ppr, flex, spot...|
| all the signs em...|    1|[all, the, signs,...|[signs, emergency...|
| assholery has fa...|    0|[assholery, has, ...|[assholery, face,...|
| back vent here a...|    0|[back, vent, here...|  [back, vent, lose]|
| big political le...|    1|[big, political, ...|[big, political, ...|
| bjp will proacti...|   -1|[bjp, will, proac...|[bjp, proactive, ...|
| bunch aap suppor...|   -1|[bunch, aap, supp...|[bunch, aap, supp...|
| couldn see havin...|    1|[couldn, see, hav...|[couldn, see, pos...|
| doing decent job...|    1|[doing, decent, j...|[decent, job, com...|
| don know ethical...|    1|[don, know, ethic...|[know, ethical, f...|
+--------------------+-----+--------------------+--------------------+
only s

## Hashing The Features using HashingTF

In [487]:
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")
hash_train = hashTF.transform(nosw_train).select(
    'label', 'Tokens', 'features')

hash_test = hashTF.transform(nosw_test).select(
    'Label', 'Tokens', 'features')
hash_train.show(n=3)
hash_test.show(n=3)

+-----+--------------------+--------------------+
|label|              Tokens|            features|
+-----+--------------------+--------------------+
|    0|[5ppr, for, flex,...|(262144,[5972,147...|
|    1|[all, the, signs,...|(262144,[2306,256...|
|    0|[assholery, has, ...|(262144,[13644,20...|
+-----+--------------------+--------------------+
only showing top 3 rows

+-----+--------------------+--------------------+
|Label|              Tokens|            features|
+-----+--------------------+--------------------+
|    0|[are, you, saying...|(262144,[160395,1...|
|    0|[because, fucking...|(262144,[41748,43...|
|    0|[breaking, news, ...|(262144,[18981,27...|
+-----+--------------------+--------------------+
only showing top 3 rows



# Training 

In [502]:
#lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10, regParam=0.01)
rdd = hash_train.rdd.map(tuple)
model = LogisticRegressionWithLBFGS.train(rdd, numClasses=3)
#model = lr.fit(hash_train)

AttributeError: 'tuple' object has no attribute 'features'

# Accuracy model

In [472]:
prediction = model.transform(numericTest)
predictionFinal = prediction.select(
    "Tokens", "prediction", "Label")
predictionFinal.show(n=20, truncate = False)

+-----------------------------------+----------+-----+
|Tokens                             |prediction|Label|
+-----------------------------------+----------+-----+
|[i, adore, cheese, #toptastic]     |1.0       |1    |
|[i, adore, jam, #brilliant]        |1.0       |1    |
|[i, adore, jam, #thumbs-up]        |1.0       |1    |
|[i, adore, jam, #toptastic]        |1.0       |1    |
|[i, adore, skiing, #brilliant]     |1.0       |1    |
|[i, adore, summer, #thumbs-up]     |1.0       |1    |
|[i, adore, summer, #toptastic]     |1.0       |1    |
|[i, adore, tea, #thumbs-up]        |1.0       |1    |
|[i, adore, that, band, #loveit]    |1.0       |1    |
|[i, adore, that, movie, #toptastic]|1.0       |0    |
|[i, adore, this, book, #bestever]  |1.0       |1    |
|[i, adore, this, game, #favorite]  |1.0       |1    |
|[i, adore, this, game, #loveit]    |1.0       |1    |
|[i, adore, winter, #brilliant]     |1.0       |1    |
|[i, adore, winter, #loveit]        |1.0       |1    |
|[i, adore

In [344]:
tn = predictionFinal.filter((predictionFinal['prediction'] == 0) & (predictionFinal['label'] == 0)).count()
tp = predictionFinal.filter((predictionFinal['prediction'] == 1) & (predictionFinal['label'] == 1)).count()
total = predictionFinal.count()
print("Accuracy:", (tn+tp)/total)
print("True Negative:", tn)
print("True Positive:", tp)

Accuracy: 0.9867021276595744
True Negative: 196
True Positive: 175


# Avengers Assemble

![display image](https://media.giphy.com/media/j2pWZpr5RlpCodOB0d/giphy.gif)

In [345]:
mcu_csv = spark.read.csv('data/mcu_subset.csv', inferSchema=True, header=True)
print("Lines of Dialogue:", mcu_csv.count())

Lines of Dialogue: 6509


In [346]:
data = mcu_csv.select("character","line")
data.show(n=10)

+------------+--------------------+
|   character|                line|
+------------+--------------------+
|  TONY STARK|Oh, I get it.  Yo...|
|  TONY STARK|Oh.  I see.  So i...|
|  TONY STARK|Good God, you’re ...|
|  TONY STARK|             Please.|
|  TONY STARK|Excellent questio...|
|  TONY STARK|      Join the club.|
|  TONY STARK|Are you aware tha...|
|JAMES RHODES|GET DOWN, TONY.  ...|
|JAMES RHODES|As Program Manage...|
|  TONY STARK|...you think we’r...|
+------------+--------------------+
only showing top 10 rows



In [347]:
t = Tokenizer(inputCol="line", outputCol="new_line")
swr_MCU = StopWordsRemover(inputCol=t.getOutputCol(), 
                       outputCol="new")
token_MCU = t.transform(data)
nosw_MCU = swr_MCU.transform(token_MCU)

nosw_MCU.show(n=10)

+------------+--------------------+--------------------+--------------------+
|   character|                line|            new_line|                 new|
+------------+--------------------+--------------------+--------------------+
|  TONY STARK|Oh, I get it.  Yo...|[oh,, i, get, it....|[oh,, get, it., ,...|
|  TONY STARK|Oh.  I see.  So i...|[oh., , i, see., ...|[oh., , see., , i...|
|  TONY STARK|Good God, you’re ...|[good, god,, you’...|[good, god,, you’...|
|  TONY STARK|             Please.|           [please.]|           [please.]|
|  TONY STARK|Excellent questio...|[excellent, quest...|[excellent, quest...|
|  TONY STARK|      Join the club.|  [join, the, club.]|       [join, club.]|
|  TONY STARK|Are you aware tha...|[are, you, aware,...|[aware, native, a...|
|JAMES RHODES|GET DOWN, TONY.  ...|[get, down,, tony...|[get, down,, tony...|
|JAMES RHODES|As Program Manage...|[as, program, man...|[program, manager...|
|  TONY STARK|...you think we’r...|[...you, think, w...|[...you,

In [348]:
hashTF = HashingTF(inputCol=swr_MCU.getOutputCol(), outputCol="features")
hash_MCU = hashTF.transform(SwRemovedMCU).select('new_line', 'features')
hash_MCU.show(n=3)

+--------------------+--------------------+
|            new_line|            features|
+--------------------+--------------------+
|[oh,, i, get, it....|(262144,[44954,84...|
|[oh., , i, see., ...|(262144,[8938,109...|
|[good, god,, you’...|(262144,[6808,353...|
+--------------------+--------------------+
only showing top 3 rows



In [349]:
prediction = model.transform(numeric_MCU)
predictionFinal_mcu = prediction.select(
    "new_line", "prediction")
predictionFinal_mcu.show(n=300)

+--------------------+----------+
|            new_line|prediction|
+--------------------+----------+
|[oh,, i, get, it....|       0.0|
|[oh., , i, see., ...|       0.0|
|[good, god,, you’...|       0.0|
|           [please.]|       0.0|
|[excellent, quest...|       0.0|
|  [join, the, club.]|       0.0|
|[are, you, aware,...|       0.0|
|[get, down,, tony...|       0.0|
|[as, program, man...|       1.0|
|[...you, think, w...|       0.0|
|[hold, on, a, sec...|       0.0|
|[yeah., , they, s...|       0.0|
|[okay,, let’s, do...|       0.0|
|[a, lot, of, peop...|       0.0|
|[it, belongs, to,...|       0.0|
|[what’s, wrong, w...|       0.0|
|[hold, that, thou...|       0.0|
|[...you, just, bl...|       0.0|
|[yeah., , don’t, ...|       0.0|
|[everything’s, fu...|       0.0|
|[no., , you’re, n...|       0.0|
|[we’ve, got, a, h...|       0.0|
|[one, more, stop....|       0.0|
|[this, is, no, jo...|       0.0|
|[this, system, ha...|       0.0|
|[tony,, it’s, the...|       0.0|
|[...jim,, how

In [350]:
test = predictionFinal_mcu.groupBy('prediction').count()
test.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0| 6187|
|       1.0|  322|
+----------+-----+

