In [0]:
import os
import atexit
import sys

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
import findspark
from sparkhpc import sparkjob

#Exit handler to clean up the Spark cluster if the script exits or crashes
def exitHandler(sj,sc):
    try:
        print('Trapped Exit cleaning up Spark Context')
        sc.stop()
    except:
        pass
    try:
        print('Trapped Exit cleaning up Spark Job')
        sj.stop()
    except:
        pass

findspark.init()

#Parameters for the Spark cluster
nodes=3
tasks_per_node=12
memory_per_task=1536 #1.5 gig per process, adjust accordingly
# Please estimate walltime carefully to keep unused Spark clusters from sitting 
# idle so that others may use the resources.
walltime="24:00" #24 hour
os.environ['SBATCH_PARTITION']='parallel' #Set the appropriate ARC partition

sj = sparkjob.sparkjob(
     ncores=nodes*tasks_per_node,
     cores_per_executor=tasks_per_node,
     memory_per_core=memory_per_task,
     walltime=walltime
    )

sj.wait_to_start()
sc = sj.start_spark()

#Register the exit handler                                                                                                     
atexit.register(exitHandler,sj,sc)

#You need this line if you want to use SparkSQL
sqlCtx=SQLContext(sc)

INFO:sparkhpc.sparkjob:Submitted batch job 2627228

INFO:sparkhpc.sparkjob:Submitted cluster 1


In [0]:
news_data = sqlCtx.read.csv('Fake News Data set.csv',inferSchema=True)
news_data.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: integer (nullable = true)



In [0]:
news_data = news_data.withColumnRenamed('_c0','claim').withColumnRenamed('_c1','claimant').withColumnRenamed('_c2','article_content').withColumnRenamed('_c3','label')
news_data.show()

+--------------------+-----------------+--------------------+-----+
|               claim|         claimant|     article_content|label|
+--------------------+-----------------+--------------------+-----+
|a line from georg...|             null|1984 george orwel...|    0|
|maine legislature...|             null|republican who cr...|    2|
|a 17yearold girl ...|             null|first person to c...|    1|
|in 1988 author ro...|             null|how dangerous is ...|    2|
|when it comes to ...|  Hillary Clinton|remarks on counte...|    2|
|rhode island is a...|Leonidas Raptakis|lis  code of virg...|    2|
|the poorest count...|         Jim Webb|counties in appal...|    1|
|koch industries p...|             null|update confrontin...|    0|
|minnesota michiga...|        Robin Vos|robin vos discuss...|    1|
|fbi uniform crime...|     Nick Schroer|fbi over four tim...|    1|
|pelosi sinks to n...|  Western Journal|pelosi sinks to n...|    0|
|socialist teacher...|             null|r wolfe 

In [0]:
from pyspark.ml import Pipeline 
from pyspark.ml.feature import Word2Vec, Tokenizer, VectorAssembler, StopWordsRemover ,MinMaxScaler

tokenize = Tokenizer(inputCol = 'article_content', outputCol='tokenized')
stopwrd = StopWordsRemover(inputCol='tokenized',outputCol='cleaned')
w2vec = Word2Vec(inputCol='cleaned',outputCol='mnmx')
#minmax = MinMaxScaler(inputCol = 'mnmx',outputCol = 'scaled')
assembler = VectorAssembler(inputCols=['mnmx'],outputCol='features')

pipe = Pipeline(stages=[tokenize,stopwrd,w2vec,assembler])

In [0]:
pipelineFit_1 = pipe.fit(news_data)
dataset_1 = pipelineFit_1.transform(news_data)

In [0]:
training, test = dataset_1.randomSplit(weights=[0.8,0.2],seed=0)

In [0]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

input_layers = len(training.select('features').take(1)[0].asDict()['features'])
output_classes = 3
layers = [input_layers, 128, 128, output_classes]

trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

train_model = trainer.fit(training)

In [0]:
from sklearn.metrics import classification_report

prediction = train_model.transform(test)

y_true = prediction.select('label').collect()
y_pred = prediction.select('prediction').collect()

print(classification_report(y_true,y_pred))

              precision    recall  f1-score   support

           0       0.65      0.71      0.68      1477
           1       0.58      0.66      0.62      1287
           2       0.40      0.01      0.01       294

    accuracy                           0.62      3058
   macro avg       0.54      0.46      0.44      3058
weighted avg       0.60      0.62      0.59      3058



In [0]:
input_layers_1 = len(training.select('features').take(1)[0].asDict()['features'])
output_classes_1 = 3
layers_1 = [input_layers_1, 256, 256, output_classes_1]

trainer_1 = MultilayerPerceptronClassifier(maxIter=100, layers=layers_1, blockSize=128, seed=1234)

train_model_1 = trainer_1.fit(training)

In [0]:
from sklearn.metrics import classification_report

prediction_1 = train_model_1.transform(test)

y_true_1 = prediction_1.select('label').collect()
y_pred_1 = prediction_1.select('prediction').collect()

print(classification_report(y_true_1,y_pred_1))

              precision    recall  f1-score   support

           0       0.64      0.71      0.67      1477
           1       0.58      0.64      0.61      1287
           2       0.00      0.00      0.00       294

    accuracy                           0.61      3058
   macro avg       0.41      0.45      0.43      3058
weighted avg       0.55      0.61      0.58      3058



  'precision', 'predicted', average, warn_for)


In [0]:
input_layers = len(training.select('features').take(1)[0].asDict()['features'])
output_classes = 3
layers_2 = [input_layers, 128, 128, 128,output_classes]

trainer_2 = MultilayerPerceptronClassifier(maxIter=100, layers=layers_2, blockSize=128, seed=1234)

train_model_2 = trainer_2.fit(training)

In [0]:
from sklearn.metrics import classification_report

prediction_2 = train_model_2.transform(test)

y_true_2 = prediction_2.select('label').collect()
y_pred_2 = prediction_2.select('prediction').collect()

print(classification_report(y_true_2,y_pred_2))

              precision    recall  f1-score   support

           0       0.64      0.70      0.67      1477
           1       0.57      0.64      0.60      1287
           2       0.00      0.00      0.00       294

    accuracy                           0.61      3058
   macro avg       0.40      0.45      0.42      3058
weighted avg       0.55      0.61      0.58      3058



  'precision', 'predicted', average, warn_for)
