In [None]:
#Importing all required libraries here. Some of these imports are left over from earlier experiments and are no longer used.
from pyspark import *
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pandas as pd
from pathlib import Path
import glob
import os
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType
#importing pyspark machine learning libraries
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

In [None]:
#The machine has 8 gb of ram (about 7.6 gb usable), so the driver heap is capped at 6 gb
from pyspark import SparkConf
# setAll applies every (key, value) pair and hands back the same SparkConf object.
conf = SparkConf().setAll([
    ("spark.driver.memory", "6g"),
    ("spark.executor.memory", "5g"),
    ("spark.driver.cores", "4"),
])
conf

In [None]:
#defining spark session and spark context
# SparkSession.builder is the supported way to obtain a session: it builds the
# underlying SparkContext from `conf`, or reuses the session that is already
# running in this kernel when the cell is executed again.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep the context available under its usual name and display it as the cell output.
sc = spark.sparkContext
sc

In [None]:
#defining the schema of the json texts to be imported
# Every field is declared as a nullable string; the order below is the column order of the data frame.
JSON_FIELD_NAMES = ['title_page', 'text_new', 'text_old', 'name_user', 'label', 'comment']
schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in JSON_FIELD_NAMES]
)

In [None]:
#Reads every matching part-* json file into one data frame using the schema declared above.
#About 1.6 gb of data has been used.
#NOTE(review): this was written assuming ** descends into all sub-directories; the path
#is expanded by the underlying file system glob, which may treat ** like a single * - confirm.
df_json = (
    spark.read
    .schema(schema)
    .option("multiLine", True)
    .json("/home/shourya/adv_assignment_3/**/part-*")
)

In [None]:
#Number of rows per label value, most frequent label first
from pyspark.sql.functions import col

label_counts = df_json.groupBy("label").count()
label_counts.orderBy(col("count").desc()).show()

In [None]:
#The raw string column 'label' is renamed to 'category' so that the name 'label' is free
#for the numeric column the StringIndexer writes later, which the classifiers read as their target.
df_json = df_json.withColumnRenamed(existing="label", new="category")

In [None]:
#generating class weights here: weight(category) = total rows / (number of categories * rows in category)
import numpy as np
# One Row(category, count) per distinct category, collected to the driver.
y_collect = df_json.select("category").groupBy("category").count().collect()

# Split the collected rows into parallel lists of category names and their frequencies.
unique_y = []
bin_count = []
for category_row in y_collect:
    unique_y.append(category_row["category"])
    bin_count.append(category_row["count"])

total_y = sum(bin_count)
unique_y_count = len(y_collect)

# Vectorised division, so rarer categories receive proportionally larger weights.
balanced_weights = total_y / (unique_y_count * np.array(bin_count))
class_weights_spark = dict(zip(unique_y, balanced_weights))
print(class_weights_spark)

In [None]:
#attaching the class weight of its category to every row
from pyspark.sql import functions as F
from itertools import chain

# create_map takes a flat key, value, key, value, ... sequence of literal columns.
flat_weight_items = chain.from_iterable(class_weights_spark.items())
mapping_expr = F.create_map(list(map(F.lit, flat_weight_items)))

# Look the row's category up in the map and store the result in the "weight" column.
category_weight = mapping_expr.getItem(F.col("category"))
df_json = df_json.withColumn("weight", category_weight)

In [None]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression


def _tokenizer(source_col, target_col):
    """Build a RegexTokenizer that reads `source_col` and writes its tokens to `target_col`."""
    return RegexTokenizer(inputCol=source_col, outputCol=target_col, pattern="\\W")


def _stopword_filter(source_col, target_col):
    """Build a StopWordsRemover that removes the words listed in `add_stopwords`."""
    return StopWordsRemover(inputCol=source_col, outputCol=target_col, stopWords=add_stopwords)


def _bag_of_words(source_col, target_col):
    """Build a CountVectorizer with the vocabulary limits shared by all text columns."""
    return CountVectorizer(inputCol=source_col, outputCol=target_col, vocabSize=100000, minDF=5)


# regular expression tokenizer, one per text column
regexTokenizer_new = _tokenizer("text_new", "words_new")
regexTokenizer_old = _tokenizer("text_old", "words_old")
regexTokenizer_title = _tokenizer("title_page", "words_title_page")
regexTokenizer_user = _tokenizer("name_user", "words_name_user")
regexTokenizer_comment = _tokenizer("comment", "words_comment")

# stop words (only applied to the new text, the old text and the comment)
add_stopwords = ["a","an","the","on",]
stopwordsRemover_new = _stopword_filter("words_new", "filtered_new")
stopwordsRemover_old = _stopword_filter("words_old", "filtered_old")
stopwordsRemover_comment = _stopword_filter("words_comment", "filtered_comment")

# bag of words count; user name and page title are counted on their unfiltered tokens
countVectors_new = _bag_of_words("filtered_new", "features_new")
countVectors_old = _bag_of_words("filtered_old", "features_old")
countVectors_comment = _bag_of_words("filtered_comment", "features_comment")
countVectors_user = _bag_of_words("words_name_user", "features_name_user")
countVectors_title = _bag_of_words("words_title_page", "features_title_page")

In [None]:
from pyspark.ml import Pipeline

from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.feature import VectorAssembler

#Encodes the string column "category" into the numeric column "label"
label_stringIdx = StringIndexer(inputCol="category", outputCol="label")

#Feature-engineering pipeline: label indexing, tokenizing, stop word removal, then bag of words counts.
feature_stages = [
    label_stringIdx,
    regexTokenizer_new,
    regexTokenizer_old,
    regexTokenizer_title,
    regexTokenizer_user,
    regexTokenizer_comment,
    stopwordsRemover_new,
    stopwordsRemover_old,
    stopwordsRemover_comment,
    countVectors_comment,
    countVectors_new,
    countVectors_old,
    countVectors_user,
    countVectors_title,
]
pipeline = Pipeline(stages=feature_stages)

# NOTE(review): the stages are fitted on the complete data frame, i.e. before the
# train/test split that follows, so vocabularies also see the rows later used for testing.
pipelineFit = pipeline.fit(df_json)
dataset = pipelineFit.transform(df_json)
dataset.show(5)

In [None]:
#70/30 training and test split; the seed is fixed for reproducibility
split_fractions = [0.7, 0.3]
trainingData, testData = dataset.randomSplit(split_fractions, seed=100)

training_rows = trainingData.count()
print("Training Dataset Count: " + str(training_rows))
test_rows = testData.count()
print("Test Dataset Count: " + str(test_rows))

In [None]:
#logistic regression model definition. Classweights have been added through the "weight" column
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0, weightCol="weight")
#Pyspark needs the separate bag of words vectors assembled into a single "features" vector
assembler = VectorAssembler(
    inputCols=['features_new','features_old','features_comment','features_name_user','features_title_page'],
    outputCol='features')
#The assembler is not applied by hand: the pipeline runs it on whatever data frame is
#passed to fit()/transform(), so a standalone transform whose result is thrown away is not needed.
pipeline = Pipeline(stages=[assembler, lr])
#Fitting the model
model = pipeline.fit(trainingData)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Scoring the fitted logistic pipeline on the held-out test split
predictions = model.transform(testData)
#The evaluator is left on its default metric and only told which column holds the predictions
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)
#without classweights result is 0.8143742529986768

In [None]:
from pyspark.ml.feature import HashingTF, IDF


def _hashed_tf(source_col, suffix):
    """Build a 100-bucket HashingTF reading `source_col` and writing `rawFeatures_<suffix>`."""
    return HashingTF(inputCol=source_col, outputCol="rawFeatures_" + suffix, numFeatures=100)


def _idf(suffix):
    """Build an IDF stage turning `rawFeatures_<suffix>` into `features_idf_<suffix>`."""
    # minDocFreq: remove sparse terms
    return IDF(inputCol="rawFeatures_" + suffix, outputCol="features_idf_" + suffix, minDocFreq=5)


#TF IDF based transformation in spark: hashed term frequencies first ...
hashingTF_name = _hashed_tf("words_name_user", "name")
hashingTF_title = _hashed_tf("words_title_page", "title")
hashingTF_new = _hashed_tf("filtered_new", "new")
hashingTF_old = _hashed_tf("filtered_old", "old")
hashingTF_comment = _hashed_tf("filtered_comment", "comment")

# ... then inverse document frequency weighting on top of each of them
idf_name = _idf("name")
idf_title = _idf("title")
idf_new = _idf("new")
idf_old = _idf("old")
idf_comment = _idf("comment")

In [None]:
#Same style of execution as the first logistic model, now trained on the hashed TF and TF IDF features.
#NOTE(review): the bag of words stages are still part of the feature pipeline below, but their
#output columns are not listed in assembler_ti, so the model does not see them - confirm whether
#they were meant to be assembled as well.
feature_pipeline_ti = Pipeline(stages=[label_stringIdx, regexTokenizer_new, regexTokenizer_old, regexTokenizer_title, regexTokenizer_user, regexTokenizer_comment, stopwordsRemover_new, stopwordsRemover_old, stopwordsRemover_comment, countVectors_comment, countVectors_new, countVectors_old, countVectors_user, countVectors_title, hashingTF_name, hashingTF_title, hashingTF_new, hashingTF_old, hashingTF_comment, idf_name, idf_title, idf_new, idf_old, idf_comment])
pipelineFit_ti = feature_pipeline_ti.fit(df_json)
dataset_ti = pipelineFit_ti.transform(df_json)
(trainingData_ti, testData_ti) = dataset_ti.randomSplit([0.7, 0.3], seed = 100)
lr_ti = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0,weightCol="weight")
assembler_ti = VectorAssembler(
    inputCols=['rawFeatures_name','rawFeatures_title','rawFeatures_new','rawFeatures_old','rawFeatures_comment','features_idf_name','features_idf_title','features_idf_new','features_idf_old','features_idf_comment'],
    outputCol='features')
#The model pipeline gets its own binding only here, so the feature pipeline above is not overwritten
#half way through the cell. The assembler runs as a pipeline stage; no manual transform is required.
pipeline_ti = Pipeline(stages=[assembler_ti, lr_ti])
#Both splits are reused by the cells that follow, so they are cached before the first fit
trainingData_ti.cache()
testData_ti.cache()
model_ti = pipeline_ti.fit(trainingData_ti)

In [None]:
# Evaluate the model without cross validation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator_ti = MulticlassClassificationEvaluator(predictionCol="prediction")
# Score the single TF IDF logistic model on the held-out split, so that the cross validated
# model trained afterwards has a baseline to be compared against (evaluator's default metric).
predictions_ti = model_ti.transform(testData_ti)
evaluator_ti.evaluate(predictions_ti)

In [None]:
#10 fold cross validation with tf idf features and class weights
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

#One evaluator is used both for model selection inside the cross validation and for the final score
evaluator_ti = MulticlassClassificationEvaluator(predictionCol="prediction")

#Grid of logistic regression settings tried by the cross validator (3 x 3 = 9 combinations)
paramGrid = (ParamGridBuilder()
             .addGrid(lr_ti.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr_ti.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
             .build())

# Create 10-fold CrossValidator. The seed fixes the assignment of rows to folds so that
# repeated runs select the same model.
cv = CrossValidator(estimator=lr_ti,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator_ti,
                    numFolds=10,
                    seed=100)
assembler_ti_cv = VectorAssembler(
    inputCols=['rawFeatures_name','rawFeatures_title','rawFeatures_new','rawFeatures_old','rawFeatures_comment','features_idf_name','features_idf_title','features_idf_new','features_idf_old','features_idf_comment'],
    outputCol='features')
#The assembler runs as the first pipeline stage, the cross validator as the second
pipeline_ti_cv = Pipeline(stages=[assembler_ti_cv, cv])
#The training split is scanned once per fold and grid point, so it is kept cached
trainingData_ti.cache()
testData_ti.cache()
model_ti_cv = pipeline_ti_cv.fit(trainingData_ti)

# Evaluate best model on the held-out split
prediction = model_ti_cv.transform(testData_ti)
evaluator_ti.evaluate(prediction)
#performance with class weights is 0.8846150805585391 (recorded from a run without a fixed fold seed)

In [None]:
#Implementation of naive bayes with only primary features. Performance is 0.76960714706985
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
#Class weights are passed through the "weight" column, as for the logistic models
nb = NaiveBayes(smoothing=1,weightCol="weight")
#Pyspark would have us assemble the features using vector assembler
assembler = VectorAssembler(
    inputCols=['features_new','features_old','features_comment','features_name_user','features_title_page'],
    outputCol='features')
#The assembler is executed by the pipeline itself, both while fitting and while predicting,
#so it is not applied to the training data separately.
pipeline = Pipeline(stages=[assembler, nb])
#Fitting the model
model = pipeline.fit(trainingData)

prediction = model.transform(testData)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(prediction)

In [None]:
#random forest implementation without proper sampling. Performance is at 0.76 (recorded before the seed was fixed)
from pyspark.ml.classification import RandomForestClassifier
#The seed fixes the random choices made while growing the forest, so reruns give the same model
rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            numTrees=100,
                            maxDepth=4,
                            maxBins=32,
                            seed=100)
#Pyspark would have us assemble the features using vector assembler
assembler = VectorAssembler(
    inputCols=['features_new','features_old','features_comment','features_name_user','features_title_page'],
    outputCol='features')
#The assembler is executed by the pipeline itself, both while fitting and while predicting,
#so it is not applied to the training data separately.
pipeline = Pipeline(stages=[assembler, rf])
#Fitting the model
model = pipeline.fit(trainingData)

prediction = model.transform(testData)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(prediction)

In [None]:
#implementation of smote
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from collections import Counter
X = dataset_ti.toPandas().filter(items=["rawFeatures_name", "rawFeatures_title", "rawFeatures_new","hypertension","rawFeatures_old","rawFeatures_comment","features_idf_name","features_idf_title","features_idf_new","features_idf_old","features_idf_comment"])
Y = dataset_ti.toPandas().filter(items=["label"])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
sm = SMOTE(random_state=12) # ratio = "auto" by default, kind = "regular" by default
x_train_res, y_train_res = sm.fit_sample(X_train, Y_train)
print(‘Resampled dataset shape {}’.format(Counter(y_train_res)))

In [None]:
dataframe_1 = pd.DataFrame(x_train_res,columns=["rawFeatures_name", "rawFeatures_title", "rawFeatures_new","hypertension","rawFeatures_old","rawFeatures_comment","features_idf_name","features_idf_title","features_idf_new","features_idf_old","features_idf_comment"])

dataframe_2 = pd.DataFrame(y_train_res, columns = ["label"])
# frames = [dataframe_1, dataframe_2]
result = dataframe_1.combine_first(dataframe_2)
imputeDF_1 = spark.createDataFrame(result)
(trainingData, testData) = imputeDF_1.randomSplit([0.7, 0.3], seed = 100)

In [None]:
#random forest on class balanced data
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32
                           )
#Pyspark would have us assemble the features using vector assembler
assembler = VectorAssembler(
    inputCols=["rawFeatures_name", "rawFeatures_title", "rawFeatures_new","hypertension","rawFeatures_old","rawFeatures_comment","features_idf_name","features_idf_title","features_idf_new","features_idf_old","features_idf_comment"],
    outputCol="features")
#Transforming the training data using the assembler
assembler.transform(trainingData)
#Pipeline based execution which is charcteristic for spark.
pipeline = Pipeline(stages=[assembler, rf])
#Fitting the model
model = pipeline.fit(trainingData)

prediction = model.transform(testData)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(prediction)