mcu data: https://www.kaggle.com/pdunton/marvel-cinematic-universe-dialogue?select=mcu_subset.csv


dataset: https://www.kaggle.com/cosmos98/twitter-and-reddit-sentimental-analysis-dataset

In [158]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover, RegexTokenizer, PCA
from pyspark.mllib.regression import LabeledPoint
from IPython.display import Image
from pyspark.sql import SparkSession
import IPython
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Collecting The Infinity Stones

### AKA Cleaning the dataset

![display image](https://media.giphy.com/media/3oxHQjRHcp4w9oi24M/giphy.gif)

In [102]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark

In [103]:
spark = init_spark()
#read_csv = spark.read.csv('data/tweets.csv', inferSchema=True, header=True)
read_csv = spark.read.csv('data/Twitter_Data_utf8.csv', inferSchema=True, header=True)

In [104]:
#data = read_csv.select("SentimentText", col("Sentiment").cast("Int").alias("label"))
data = read_csv.select("clean_text", col("category").cast("Int").alias("label")).dropna().dropDuplicates().replace(-1,2)
data.show(10)

+--------------------+-----+
|          clean_text|label|
+--------------------+-----+
|one vote can make...|    0|
|congress has alwa...|    2|
|and here another ...|    1|
|kya modi asia bhi...|    0|
|yes vote for modi...|    0|
|better than your ...|    1|
|got one more reas...|    1|
|           very sad |    2|
|yes sir vote for ...|    0|
|evry modi bjp sup...|    0|
+--------------------+-----+
only showing top 10 rows



In [105]:
split = data.randomSplit([0.7, 0.3])
trainingData = split[0]
testingData = split[1]
print ("Training data has", split[0].count(), 'rows.')
print ("Testing data has", split[1].count(), 'rows.')

Training data has 110313 rows.
Testing data has 47295 rows.


## Cleaning The Data (Tokenizing and Stop Word Removing)

In [106]:
#inputCol = "SentimentText"
inputCol = "clean_text"

tokenizer = RegexTokenizer(pattern=r'(?:\p{Punct}|\s)+', inputCol=inputCol, outputCol="Tokens")
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="NoStopWords")

token_train = tokenizer.transform(trainingData)
nosw_train = swr.transform(token_train)

token_test = tokenizer.transform(testingData)
nosw_test = swr.transform(token_test)

nosw_train.show(truncate=True, n=10)
nosw_test.show(truncate=True, n=10)

+--------------------+-----+--------------------+--------------------+
|          clean_text|label|              Tokens|         NoStopWords|
+--------------------+-----+--------------------+--------------------+
|118 work underway...|    2|[118, work, under...|[118, work, under...|
|1947 lekar 2013 t...|    1|[1947, lekar, 201...|[1947, lekar, 201...|
|2014 modiji start...|    0|[2014, modiji, st...|[2014, modiji, st...|
|299 for one signe...|    0|[299, for, one, s...|[299, one, signed...|
|5yrs antinatnl th...|    1|[5yrs, antinatnl,...|[5yrs, antinatnl,...|
|                TRUE|    1|              [true]|              [true]|
|aare bhai modi bh...|    1|[aare, bhai, modi...|[aare, bhai, modi...|
|abey dhakkan bol ...|    0|[abey, dhakkan, b...|[abey, dhakkan, b...|
|able grandson abl...|    1|[able, grandson, ...|[able, grandson, ...|
|achievement the c...|    2|[achievement, the...|[achievement, cou...|
+--------------------+-----+--------------------+--------------------+
only s

## Hashing The Features using HashingTF

In [107]:
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")
hash_train = hashTF.transform(nosw_train).select(
    'label', 'Tokens', 'features')

hash_test = hashTF.transform(nosw_test).select(
    'Label', 'Tokens', 'features')
hash_train.show(n=5)
hash_test.show(n=5)

+-----+--------------------+--------------------+
|label|              Tokens|            features|
+-----+--------------------+--------------------+
|    2|[118, work, under...|(262144,[1928,126...|
|    1|[1947, lekar, 201...|(262144,[1900,978...|
|    0|[2014, modiji, st...|(262144,[71,4167,...|
|    0|[299, for, one, s...|(262144,[21823,98...|
|    1|[5yrs, antinatnl,...|(262144,[333,9618...|
+-----+--------------------+--------------------+
only showing top 5 rows

+-----+--------------------+--------------------+
|Label|              Tokens|            features|
+-----+--------------------+--------------------+
|    0|[1500, crores, th...|(262144,[6993,104...|
|    0|[abhinandan, ever...|(262144,[10723,37...|
|    1|[about, jail, aft...|(262144,[40963,41...|
|    2|[absolutely, noth...|(262144,[33555,36...|
|    1|[after, hearing, ...|(262144,[4214,157...|
+-----+--------------------+--------------------+
only showing top 5 rows



# Training 

In [108]:
mlor = (LogisticRegression()
       .setFamily("multinomial") )

In [109]:
model= mlor.fit(hash_train)

In [110]:
prediction = model.transform(hash_test)
prediction.show(100)

+-----+--------------------+--------------------+--------------------+--------------------+----------+
|Label|              Tokens|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+----------+
|    0|[1500, crores, th...|(262144,[6993,104...|[47.8878290069156...|[1.0,1.6472490886...|       0.0|
|    0|[abhinandan, ever...|(262144,[10723,37...|[3.14309236377830...|[0.91588242739686...|       0.0|
|    1|[about, jail, aft...|(262144,[40963,41...|[-17.169996840690...|[2.62690279925324...|       1.0|
|    2|[absolutely, noth...|(262144,[33555,36...|[-14.739315007170...|[3.20872784885822...|       2.0|
|    1|[after, hearing, ...|(262144,[4214,157...|[-43.210840407277...|[9.31566988898557...|       1.0|
|    1|[ahmed, bhai, bad...|(262144,[9398,137...|[21.0044497071233...|[0.77746209682796...|       0.0|
|    2|[all, the, dogs, ...|(262144,[40963,54...|[9.87950536038719...|[0.

# Accuracy model

In [111]:
predictionFinal = prediction.select(
    "Tokens", "prediction", "Label")
predictionFinal.show(n=100)

+--------------------+----------+-----+
|              Tokens|prediction|Label|
+--------------------+----------+-----+
|[1500, crores, th...|       0.0|    0|
|[abhinandan, ever...|       0.0|    0|
|[about, jail, aft...|       1.0|    1|
|[absolutely, noth...|       2.0|    2|
|[after, hearing, ...|       1.0|    1|
|[ahmed, bhai, bad...|       0.0|    1|
|[all, the, dogs, ...|       0.0|    2|
|[already, exists,...|       0.0|    0|
|[always, speak, a...|       1.0|    2|
|[amit, shah, star...|       1.0|    1|
|[and, btw, not, a...|       2.0|    0|
|[and, give, shelt...|       1.0|    1|
|[and, would, rath...|       1.0|    1|
|[another, victory...|       0.0|    1|
|[answer, why, you...|       0.0|    1|
|[anyone, who, thi...|       0.0|    0|
|[anything, good, ...|       1.0|    1|
|[artworks, owned,...|       0.0|    0|
|[ask, how, many, ...|       2.0|    2|
|[balakot, will, n...|       1.0|    1|
|[bcs, its, bjp, w...|       1.0|    1|
|[because, scienti...|       0.0|    0|


In [112]:
match = predictionFinal.filter(predictionFinal['prediction'] == predictionFinal['label']).count()
total = predictionFinal.count()
print("Accuracy:", match/total)

Accuracy: 0.6986362194735173


In [159]:
evaluatorMulti = MulticlassClassificationEvaluator(labelCol="Label", predictionCol="prediction")

# get metrics
f1 = evaluatorMulti.evaluate(temp, {evaluatorMulti.metricName: "f1"})
weightedPrecision = evaluatorMulti.evaluate(temp, {evaluatorMulti.metricName: "weightedPrecision"})
weightedRecall = evaluatorMulti.evaluate(temp, {evaluatorMulti.metricName: "weightedRecall"})

print("Summary Stats")
print("Precision = %s" % weightedPrecision)
print("Recall = %s" % weightedRecall)
print("F1 Score = %s" % f1)

Summary Stats
Precision = 0.681445262593316
Recall = 0.678778401251035
F1 Score = 0.679442043936087


# Avengers Assemble

![display image](https://media.giphy.com/media/j2pWZpr5RlpCodOB0d/giphy.gif)

In [113]:
mcu_csv = spark.read.csv('data/mcu_subset.csv', inferSchema=True, header=True)
print("Lines of Dialogue:", mcu_csv.count())

Lines of Dialogue: 6509


In [114]:
data = mcu_csv.select("character","line")
data.show(n=10)

+------------+--------------------+
|   character|                line|
+------------+--------------------+
|  TONY STARK|Oh, I get it.  Yo...|
|  TONY STARK|Oh.  I see.  So i...|
|  TONY STARK|Good God, you’re ...|
|  TONY STARK|             Please.|
|  TONY STARK|Excellent questio...|
|  TONY STARK|      Join the club.|
|  TONY STARK|Are you aware tha...|
|JAMES RHODES|GET DOWN, TONY.  ...|
|JAMES RHODES|As Program Manage...|
|  TONY STARK|...you think we’r...|
+------------+--------------------+
only showing top 10 rows



In [121]:
t = Tokenizer(inputCol="line", outputCol="new_line")
swr_MCU = StopWordsRemover(inputCol=t.getOutputCol(), 
                       outputCol="new")
token_MCU = t.transform(data)
nosw_MCU = swr_MCU.transform(token_MCU)

nosw_MCU.show(n=10)

+------------+--------------------+--------------------+--------------------+
|   character|                line|            new_line|                 new|
+------------+--------------------+--------------------+--------------------+
|  TONY STARK|Oh, I get it.  Yo...|[oh,, i, get, it....|[oh,, get, it., ,...|
|  TONY STARK|Oh.  I see.  So i...|[oh., , i, see., ...|[oh., , see., , i...|
|  TONY STARK|Good God, you’re ...|[good, god,, you’...|[good, god,, you’...|
|  TONY STARK|             Please.|           [please.]|           [please.]|
|  TONY STARK|Excellent questio...|[excellent, quest...|[excellent, quest...|
|  TONY STARK|      Join the club.|  [join, the, club.]|       [join, club.]|
|  TONY STARK|Are you aware tha...|[are, you, aware,...|[aware, native, a...|
|JAMES RHODES|GET DOWN, TONY.  ...|[get, down,, tony...|[get, down,, tony...|
|JAMES RHODES|As Program Manage...|[as, program, man...|[program, manager...|
|  TONY STARK|...you think we’r...|[...you, think, w...|[...you,

In [116]:
hashTF = HashingTF(inputCol=swr_MCU.getOutputCol(), outputCol="features")
hash_MCU = hashTF.transform(nosw_MCU).select('character','new_line', 'features')
hash_MCU.show(n=3)

+----------+--------------------+--------------------+
| character|            new_line|            features|
+----------+--------------------+--------------------+
|TONY STARK|[oh,, i, get, it....|(262144,[44954,84...|
|TONY STARK|[oh., , i, see., ...|(262144,[8938,109...|
|TONY STARK|[good, god,, you’...|(262144,[6808,353...|
+----------+--------------------+--------------------+
only showing top 3 rows



In [122]:
prediction = model.transform(hash_MCU)
predictionFinal_mcu = prediction.select(
  'character', "new_line", "prediction")
predictionFinal_mcu.show(n=300)

+------------+--------------------+----------+
|   character|            new_line|prediction|
+------------+--------------------+----------+
|  TONY STARK|[oh,, i, get, it....|       0.0|
|  TONY STARK|[oh., , i, see., ...|       2.0|
|  TONY STARK|[good, god,, you’...|       0.0|
|  TONY STARK|           [please.]|       1.0|
|  TONY STARK|[excellent, quest...|       1.0|
|  TONY STARK|  [join, the, club.]|       1.0|
|  TONY STARK|[are, you, aware,...|       1.0|
|JAMES RHODES|[get, down,, tony...|       2.0|
|JAMES RHODES|[as, program, man...|       2.0|
|  TONY STARK|[...you, think, w...|       1.0|
|  TONY STARK|[hold, on, a, sec...|       1.0|
|JAMES RHODES|[yeah., , they, s...|       0.0|
|  TONY STARK|[okay,, let’s, do...|       0.0|
|JAMES RHODES|[a, lot, of, peop...|       1.0|
|  TONY STARK|[it, belongs, to,...|       1.0|
|JAMES RHODES|[what’s, wrong, w...|       0.0|
|  TONY STARK|[hold, that, thou...|       0.0|
|JAMES RHODES|[...you, just, bl...|       0.0|
|  TONY STARK

# Analysis

In [139]:
import pandas as pd

In [140]:
test = predictionFinal_mcu.groupBy('prediction').count()
test.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0| 3391|
|       1.0| 1988|
|       2.0| 1130|
+----------+-----+



In [141]:
char = predictionFinal_mcu.groupBy('character', 'prediction').count()
char.show(100)

+----------------+----------+-----+
|       character|prediction|count|
+----------------+----------+-----+
|    JAMES RHODES|       1.0|   83|
|    STEVE ROGERS|       0.0|  542|
|            THOR|       0.0|  471|
|      TONY STARK|       0.0|  864|
|    PETER PARKER|       1.0|  176|
|    PETER PARKER|       0.0|  281|
|       NICK FURY|       2.0|   67|
|    BRUCE BANNER|       0.0|  179|
|      TONY STARK|       2.0|  313|
|            THOR|       1.0|  266|
|    BRUCE BANNER|       1.0|  113|
|    STEVE ROGERS|       1.0|  263|
|            LOKI|       2.0|   54|
|            LOKI|       0.0|  163|
|    JAMES RHODES|       0.0|  170|
|NATASHA ROMANOFF|       0.0|  285|
|       NICK FURY|       0.0|  188|
|    JAMES RHODES|       2.0|   65|
|NATASHA ROMANOFF|       1.0|  142|
|            THOR|       2.0|  126|
|    PEPPER POTTS|       1.0|  113|
|    STEVE ROGERS|       2.0|  172|
|      TONY STARK|       1.0|  611|
|       NICK FURY|       1.0|  111|
|    PETER PARKER|       2.0

In [154]:
df = char.toPandas()
print(df)

           character  prediction  count
0       JAMES RHODES         1.0     83
1       STEVE ROGERS         0.0    542
2               THOR         0.0    471
3         TONY STARK         0.0    864
4       PETER PARKER         1.0    176
5       PETER PARKER         0.0    281
6          NICK FURY         2.0     67
7       BRUCE BANNER         0.0    179
8         TONY STARK         2.0    313
9               THOR         1.0    266
10      BRUCE BANNER         1.0    113
11      STEVE ROGERS         1.0    263
12              LOKI         2.0     54
13              LOKI         0.0    163
14      JAMES RHODES         0.0    170
15  NATASHA ROMANOFF         0.0    285
16         NICK FURY         0.0    188
17      JAMES RHODES         2.0     65
18  NATASHA ROMANOFF         1.0    142
19              THOR         2.0    126
20      PEPPER POTTS         1.0    113
21      STEVE ROGERS         2.0    172
22        TONY STARK         1.0    611
23         NICK FURY         1.0    111


In [156]:
newf = df.pivot(index='character', columns='prediction')
newf.columns = newf.columns.droplevel(0)
print(newf)

prediction        0.0  1.0  2.0
character                      
BRUCE BANNER      179  113   87
JAMES RHODES      170   83   65
LOKI              163  110   54
NATASHA ROMANOFF  285  142   71
NICK FURY         188  111   67
PEPPER POTTS      248  113   81
PETER PARKER      281  176   94
STEVE ROGERS      542  263  172
THOR              471  266  126
TONY STARK        864  611  313
