Data (MCU dialogue): https://www.kaggle.com/pdunton/marvel-cinematic-universe-dialogue?select=mcu_subset.csv
Data (NRC lexicon): https://www.kaggle.com/andradaolteanu/bing-nrc-afinn-lexicons?select=NRC.csv

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover, RegexTokenizer, PCA
from pyspark.mllib.regression import LabeledPoint
from IPython.display import Image
from pyspark.sql import SparkSession
import IPython

# Collecting The Infinity Stones

### AKA Cleaning the dataset

![display image](https://media.giphy.com/media/3oxHQjRHcp4w9oi24M/giphy.gif)

In [None]:
def init_spark():
    """Build the SparkSession used by this notebook.

    Returns:
        The session produced by ``SparkSession.builder.getOrCreate()``.
    """
    builder = SparkSession.builder.appName("Python Spark SQL basic example")
    # NOTE(review): this key/value looks like a placeholder -- confirm it is
    # actually needed.
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

In [None]:
# Session shared by every cell below.
spark = init_spark()

# Load the labelled training corpus: the first row is the header and column
# types are inferred from the data.
# Alternative source kept from the original notebook: 'data/tweets.csv'.
read_csv = spark.read.csv(
    'data/Reddit_Data_utf8.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Keep the comment text and its class, cast to an integer "label" column.
# (For the tweets dataset the equivalent columns were "SentimentText" and
# "Sentiment".)
data = (
    read_csv
    .select("clean_comment", col("category").cast("Int").alias("label"))
    .dropna()            # drop rows with a missing text or label
    .dropDuplicates()    # drop exact duplicate rows
    .replace(-1, 2)      # remap class -1 to 2 (presumably so labels are non-negative)
    .limit(15000)        # cap the working set at 15000 rows
)
data.show(10)

In [None]:
# Random 70/30 split into training and testing sets. No seed is given, so the
# split differs between runs.
split = data.randomSplit([0.7, 0.3])
trainingData, testingData = split
print("Training data has", trainingData.count(), 'rows.')
print("Testing data has", testingData.count(), 'rows.')

## Cleaning The Data (Tokenizing and Stop-Word Removal)

In [None]:
# Text column to tokenize ("SentimentText" for the alternative tweets dataset).
inputCol = "clean_comment"

# Split the text on runs of punctuation and/or whitespace.
tokenizer = RegexTokenizer(
    inputCol=inputCol,
    outputCol="Tokens",
    pattern=r'(?:\p{Punct}|\s)+',
)
# Remove stop words from the token lists produced above.
swr = StopWordsRemover(inputCol="Tokens", outputCol="NoStopWords")

# Tokenize both splits, then strip stop words from both.
token_train = tokenizer.transform(trainingData)
token_test = tokenizer.transform(testingData)
nosw_train = swr.transform(token_train)
nosw_test = swr.transform(token_test)

nosw_train.show(n=10, truncate=True)
nosw_test.show(n=10, truncate=True)

## Hashing The Features using HashingTF

In [None]:
# Hash each stop-word-free token list into a term-frequency vector stored in
# the "features" column, which is what the classifier consumes.
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")

# 'Tokens' (the list before stop-word removal) is carried along only so the
# results are readable when shown.
hash_train = hashTF.transform(nosw_train).select(
    'label', 'Tokens', 'features')

# Use the same lower-case 'label' as the training frame. The previous 'Label'
# spelling only resolved because Spark SQL is case-insensitive by default and
# would fail with spark.sql.caseSensitive=true.
hash_test = hashTF.transform(nosw_test).select(
    'label', 'Tokens', 'features')
hash_train.show(n=5)
hash_test.show(n=5)

# Training 

In [None]:
# Multinomial (softmax) logistic regression; every other parameter is left at
# its default.
mlor = LogisticRegression(family="multinomial")

In [None]:
# Fit the classifier on the hashed training features.
model = mlor.fit(hash_train)

In [None]:
# Score the held-out test set and preview the first rows of the output.
prediction = model.transform(hash_test)
prediction.show(n=10)

# Model Accuracy

In [None]:
# Keep only the columns needed to compare predictions with the true class.
# The label column is referenced as lower-case "label", matching the alias
# created when the data was loaded and the accuracy filter below. The previous
# "Label" spelling relied on Spark's default case-insensitive resolution.
predictionFinal = prediction.select(
    "Tokens", "prediction", "label")
predictionFinal.show(n=100)

In [None]:
# Accuracy = rows whose predicted class equals the true label / all scored rows.
is_correct = predictionFinal['prediction'] == predictionFinal['label']
match = predictionFinal.where(is_correct).count()
total = predictionFinal.count()
print("Accuracy:", match / total)

# Avengers Assemble

![display image](https://media.giphy.com/media/j2pWZpr5RlpCodOB0d/giphy.gif)

In [None]:
# Load the MCU dialogue subset: the first row is the header and column types
# are inferred from the data.
mcu_csv = spark.read.options(header=True, inferSchema=True).csv('data/mcu_subset.csv')
print("Lines of Dialogue:", mcu_csv.count())

In [None]:
# Keep the speaker and the spoken line.
# Note: this rebinds `data`, which previously held the training corpus.
data = mcu_csv.select(["character", "line"])
data.show(10)

In [None]:
# Tokenize the dialogue exactly as the training text was tokenized: split on
# runs of punctuation and/or whitespace. A plain whitespace Tokenizer would
# leave punctuation attached to words ("hello," vs "hello"), so those tokens
# would hash to different features than anything the model was trained on.
t = RegexTokenizer(pattern=r'(?:\p{Punct}|\s)+', inputCol="line", outputCol="new_line")
# Drop stop words from the token lists; the result lands in the "new" column.
swr_MCU = StopWordsRemover(inputCol=t.getOutputCol(),
                           outputCol="new")
token_MCU = t.transform(data)
nosw_MCU = swr_MCU.transform(token_MCU)

nosw_MCU.show(n=10)

In [None]:
# Hash the stop-word-free dialogue tokens into the "features" column expected
# by the trained model.
hashTF = HashingTF(inputCol=swr_MCU.getOutputCol(), outputCol="features")
# The stop-word-removed frame built above is `nosw_MCU`. The previously
# referenced `SwRemovedMCU` is never defined and raised NameError.
hash_MCU = hashTF.transform(nosw_MCU).select('new_line', 'features')
hash_MCU.show(n=3)

In [None]:
# Score the hashed MCU dialogue with the trained classifier. The input is
# `hash_MCU`, the frame holding the "features" column; the previously
# referenced `numeric_MCU` is never defined and raised NameError.
# Note: this rebinds `prediction`, which previously held the test-set output.
prediction = model.transform(hash_MCU)
predictionFinal_mcu = prediction.select(
    "new_line", "prediction")
predictionFinal_mcu.show(n=300)

In [None]:
# Count how many dialogue lines were assigned to each predicted class.
test = predictionFinal_mcu.groupby("prediction").count()
test.show()