In [6]:
from pyspark import SparkContext
from pyspark.sql import *
from pyspark.sql.functions import isnan, when, count, col, split, regexp_replace, lower
import pyspark.sql.functions as F
from pyspark.ml.feature import Word2Vec,StringIndexer,CountVectorizer,HashingTF, IDF
from pyspark.ml.classification import NaiveBayes, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator,BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import MinMaxScaler
import subprocess

# Spark session for this notebook: local master, named after the experiment step.
# getOrCreate() hands back an already-running session instead of starting a second one.
ss = (
    SparkSession.builder
    .master("local[*]")
    .appName("ddam-8_STEP-3_w2v_classifiers")
    .getOrCreate()
)

# Function definitions

In [7]:
def split_text(df, colname):
    """Lower-case a string column and replace it with its space-separated tokens.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame.
    colname : str
        Name of the text column; it is overwritten with the token array.

    Returns
    -------
    DataFrame
        `df` with `colname` tokenised, after dropping rows that contain a null
        in any column.
    """
    lowered = lower(col(colname))
    tokens = split(lowered, " ")
    tokenised = df.withColumn(colname, tokens)
    return tokenised.dropna()

In [8]:
def save(df, filename):
    """Write `df` as a CSV (with header) to the project's HDFS directory.

    Any existing output at the target path is replaced.

    Parameters
    ----------
    df : DataFrame
        Data to persist.
    filename : str
        Name of the output under hdfs://kddrtserver12.isti.cnr.it:9000/user/hpsa06/.
    """
    filepath = f"hdfs://kddrtserver12.isti.cnr.it:9000/user/hpsa06/{filename}"
    # Let the DataFrame writer replace the target itself rather than shelling out
    # to `hadoop fs -rm -r`, whose exit status was never checked.
    df.write.csv(filepath, header=True, mode="overwrite")

In [9]:
def evaluate_model(pred):
    """Print multiclass evaluation metrics for a predictions DataFrame.

    Parameters
    ----------
    pred : DataFrame
        Must contain a "label" column and a "prediction" column.

    Returns
    -------
    None
        Accuracy, F1, weighted precision and weighted recall are printed,
        one per line.
    """
    evaluatorMulti = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

    # Every evaluate() call below runs its own Spark job; caching the two columns
    # it needs avoids recomputing the upstream pipeline once per metric.
    predictionAndTarget = pred.select("label", "prediction").cache()
    try:
        acc = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "accuracy"})
        f1 = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "f1"})
        weightedPrecision = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedPrecision"})
        weightedRecall = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedRecall"})
    finally:
        # Release the cached columns whether or not evaluation succeeded.
        predictionAndTarget.unpersist()

    print('\taccuracy:', acc)
    print('\tf1-score',f1)
    print('\tweightedPrecision',weightedPrecision)
    print('\tweightedRecall',weightedRecall)

In [10]:
def scaler_df(df):
    """Add a min-max scaled copy of the "features" column as "scaledfeatures".

    The scaler is fitted on `df` itself and then applied to that same DataFrame.
    """
    fitted_scaler = MinMaxScaler(inputCol="features", outputCol="scaledfeatures").fit(df)
    return fitted_scaler.transform(df)

# Vectorisation algorithm definitions

In [11]:
# Common feature-vector length handed to every vectoriser (Word2Vec vectorSize,
# CountVectorizer vocabSize, HashingTF numFeatures) and used as the MLP input-layer
# width, so all representations are compared at the same size.
# The value itself is an arbitrary choice.
f_len = 400

def word2vec(df):
    """Fit a Word2Vec model on the "text" column of `df` and add its vectors as "features".

    Uses vectors of length `f_len` and ignores tokens that occur fewer than 2 times.
    The model is fitted on, and applied to, the same DataFrame.
    """
    embedder = Word2Vec(vectorSize=f_len, minCount=2, inputCol="text", outputCol="features")
    return embedder.fit(df).transform(df)

def bow(df):
    """Fit a CountVectorizer on the "text" column of `df` and add term counts as "features".

    The vocabulary is capped at `f_len` terms, each of which must appear in at
    least 5 documents. The vectoriser is fitted on, and applied to, the same DataFrame.
    """
    vectoriser = CountVectorizer(inputCol="text", outputCol="features", vocabSize=f_len, minDF=5)
    fitted_vectoriser = vectoriser.fit(df)
    return fitted_vectoriser.transform(df)
    
def tfidf(df):
    """Add TF-IDF vectors for the "text" column of `df` as "features".

    Term frequencies are hashed into `f_len` buckets ("rawFeatures"), then an IDF
    model fitted on those frequencies rescales them. The "rawFeatures" column is
    kept in the result.
    """
    term_freqs = HashingTF(inputCol="text", outputCol="rawFeatures", numFeatures=f_len).transform(df)
    idf_model = IDF(inputCol="rawFeatures", outputCol="features").fit(term_freqs)
    return idf_model.transform(term_freqs)

# Model definitions

In [12]:
# Naive Bayes
def nb(train, test):
    """Train a NaiveBayes classifier on `train` and return its predictions for `test`.

    Both splits go through scaler_df first, so the classifier reads the
    "scaledfeatures" column.
    """
    # NOTE(review): scaler_df fits a fresh MinMaxScaler on whatever it receives, so
    # the test split is rescaled with its own min/max rather than the training
    # split's. Confirm this is intended.
    scaled_train, scaled_test = scaler_df(train), scaler_df(test)
    classifier = NaiveBayes(featuresCol='scaledfeatures', labelCol='label')
    fitted = classifier.fit(scaled_train)
    return fitted.transform(scaled_test)

# Multilayer perceptron
def mlp(train, test):
    """Train a multilayer perceptron on `train` and return its predictions for `test`.

    The network has an input layer of `f_len` units, one hidden layer of
    int((f_len + 3) / 2) units and an output layer of 3 units, and is trained
    with the l-bfgs solver on the unscaled "features" column.
    """
    hidden_units = int((f_len + 3) / 2)
    layer_sizes = [f_len, hidden_units, 3]
    classifier = MultilayerPerceptronClassifier(
        layers=layer_sizes,
        solver='l-bfgs',
        featuresCol='features',
        labelCol='label',
    )
    fitted = classifier.fit(train)
    return fitted.transform(test)

# Load Data and run Experiments

In [13]:
# CSV parsing options shared by all three reads: quoted fields may contain
# commas, doubled quotes and embedded newlines.
csv_options = dict(sep=",", quote='"', header=True, escape='"', multiLine=True)
# Location columns dropped from the per-candidate frames.
location_columns = ('lat', 'long', 'city')

twitter_df = ss.read.csv("hdfs://kddrtserver12.isti.cnr.it:9000/user/hpsa06/df_Model.csv", **csv_options)
trump_df = ss.read.csv("hdfs://kddrtserver12.isti.cnr.it:9000/user/hpsa06/df_Trump.csv", **csv_options).drop(*location_columns)
# NOTE(review): this reads df_Model.csv, the same file as twitter_df, not a
# Biden-specific file. Possibly a copy-paste slip; confirm the intended path.
biden_df = ss.read.csv("hdfs://kddrtserver12.isti.cnr.it:9000/user/hpsa06/df_Model.csv", **csv_options).drop(*location_columns)

In [14]:
# Tokenise the tweet text, then encode the string "sentiment" column as a numeric "label".
tokenised_df = split_text(twitter_df, 'text')
label_indexer = StringIndexer(inputCol="sentiment", outputCol="label")
twitter_df = label_indexer.fit(tokenised_df).transform(tokenised_df)

In [15]:
# Train/test proportions, plus a fixed seed so the split (and therefore every
# reported metric) is the same on each run of the notebook.
splits = [0.85, 0.15]
split_seed = 42

# Evaluate every vectoriser/classifier combination on the same kind of split.
for vec_alg in [word2vec, bow, tfidf]:
    # NOTE(review): each vectoriser is fitted on the whole of twitter_df before the
    # split, so the test rows contribute to the learned vocabulary/embeddings.
    datasets = vec_alg(twitter_df).randomSplit(splits, seed=split_seed)
    for model in [nb, mlp]:
        print(vec_alg.__name__, ': ', model.__name__)
        # datasets is [train, test]; each model function returns predictions for test.
        evaluate_model(model(*datasets))

word2vec :  nb
	accuracy: 0.47127909459748024
	f1-score 0.4040379308471977
	weightedPrecision 0.5712509642738162
	weightedRecall 0.4712790945974803
word2vec :  mlp
	accuracy: 0.5558402733290626
	f1-score 0.5501303573469899
	weightedPrecision 0.5645962063482722
	weightedRecall 0.5558402733290626
bow :  nb
	accuracy: 0.6234594135146622
	f1-score 0.6169300153222861
	weightedPrecision 0.6561517606665785
	weightedRecall 0.6234594135146622
bow :  mlp
	accuracy: 0.6419464513387165
	f1-score 0.6412290389038638
	weightedPrecision 0.6576384962344565
	weightedRecall 0.6419464513387165
tfidf :  nb
	accuracy: 0.5329849771391247
	f1-score 0.5175768915558268
	weightedPrecision 0.5701901349730272
	weightedRecall 0.5329849771391247
tfidf :  mlp
	accuracy: 0.5536686261702591
	f1-score 0.5532564477835126
	weightedPrecision 0.5530715881427554
	weightedRecall 0.553668626170259
