In [0]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('NBGridSearchUpscale')
    .getOrCreate()
)

In [0]:
# Read the cleaned news CSV. The header option is not enabled, so Spark assigns
# positional column names (_c0, _c1, ...) which are mapped to readable names below.
df = spark.read.csv(
    'CleanedNews.csv/part-00000-e0c20413-d9a2-4ae3-bc41-a77b460c6a58-c000.csv',
    inferSchema=True,
)
for positional_name, readable_name in [
    ('_c0', 'claim'),
    ('_c1', 'claimant'),
    ('_c2', 'articles'),
    ('_c3', 'label'),
]:
    df = df.withColumnRenamed(positional_name, readable_name)
df.printSchema()

root
 |-- claim: string (nullable = true)
 |-- claimant: string (nullable = true)
 |-- articles: string (nullable = true)
 |-- label: integer (nullable = true)



In [0]:
# Number of rows per label in the raw data, sorted so the rarest label comes first.
counts = (
    df
    .select('label')
    .groupBy('label')
    .count()
    .orderBy('count')
    .collect()
)
counts

[Row(label=2, count=1696), Row(label=1, count=6451), Row(label=0, count=7408)]

In [0]:
# `counts` is sorted by ascending row count, so the first entry is the rarest
# label and the third is the most frequent; each Row unpacks as (label, count).
(lowestLabel, lowestCount), (midLabel, midCount), (highLabel, highCount) = counts[:3]

In [0]:
# Balance the classes by oversampling the two smaller labels (with replacement)
# up to roughly the size of the largest label. The largest label is kept as is.
#
# With withReplacement=True, `fraction` is the expected number of times each row
# is drawn, so the resulting sizes are only approximately equal to highCount.
#
# A fixed seed makes the balanced dataset reproducible across runs; 0 is the
# same seed value this notebook passes to randomSplit.
#
# NOTE(review): the oversampling happens before the train/test split made later
# in the notebook, so copies of the same row can end up in both splits. That
# likely inflates the reported test metrics - consider splitting first and
# oversampling the training part only.
UPSAMPLE_SEED = 0

df_low_upscaled = df.filter(df.label == lowestLabel).sample(
    withReplacement=True,
    fraction=highCount / lowestCount,
    seed=UPSAMPLE_SEED,
)
df_mid_upscaled = df.filter(df.label == midLabel).sample(
    withReplacement=True,
    fraction=highCount / midCount,
    seed=UPSAMPLE_SEED,
)
df_high_upscaled = df.filter(df.label == highLabel)

In [0]:
from functools import reduce
from pyspark.sql import DataFrame
# Stack the three per-label frames into one balanced DataFrame.
# DataFrame.union is the same operation as unionAll (unionAll is its alias):
# rows are appended by column position and duplicates are kept.
dfs_labelwise = [
    df_low_upscaled,
    df_mid_upscaled,
    df_high_upscaled,
]
df_balanced = reduce(DataFrame.union, dfs_labelwise)

In [0]:
# Print the schema of the balanced frame to check the columns after the union.
df_balanced.printSchema()

root
 |-- claim: string (nullable = true)
 |-- claimant: string (nullable = true)
 |-- articles: string (nullable = true)
 |-- label: integer (nullable = true)



In [0]:
# Total number of rows in the balanced frame (this is a Spark action).
df_balanced.count()

22199

In [0]:
# Per-label row counts of the balanced frame, smallest class first.
# NOTE: this rebinds `counts`, which earlier held the counts of the raw data.
counts = (
    df_balanced
    .select('label')
    .groupBy('label')
    .count()
    .orderBy('count')
    .collect()
)
counts

[Row(label=2, count=7329), Row(label=0, count=7408), Row(label=1, count=7462)]

In [0]:
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF#,StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# Text featurisation of the `articles` column:
# tokenise -> remove stop words -> term counts -> IDF weighting -> `features`.
#
# NOTE(review): the pipeline is fitted on the whole of df_balanced and the
# train/test split is only made afterwards, so the vocabulary and IDF weights
# are learned from rows that later end up in the test set.
tokenizer = Tokenizer(
    inputCol='articles',
    outputCol='token_text',
)
stop_remove = StopWordsRemover(
    inputCol='token_text',
    outputCol='stop_token',
)
count_vec = CountVectorizer(
    inputCol='stop_token',
    outputCol='c_vec',
)
idf = IDF(
    inputCol='c_vec',
    outputCol='tf_idf',
)
# Single-input assembler: exposes the TF-IDF vector under the name `features`.
assembler = VectorAssembler(
    inputCols=['tf_idf'],
    outputCol='features',
)

pipe = Pipeline(stages=[
    tokenizer,
    stop_remove,
    count_vec,
    idf,
    assembler,
])
pipelineFit = pipe.fit(df_balanced)
dataset = pipelineFit.transform(df_balanced)

In [0]:
# 80/20 train/test split of the featurised data, with a fixed seed.
training, test = dataset.randomSplit([0.8, 0.2], seed=0)

In [0]:
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Tune the Naive Bayes `smoothing` parameter by cross-validation on the
# training split, scoring each candidate by weighted precision.
nb = NaiveBayes()

# Candidate smoothing values.
# NOTE(review): 0.0 means no additive smoothing at all - confirm that is intended.
gridSearch = (
    ParamGridBuilder()
    .addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
    .build()
)
cvEvaluater = MulticlassClassificationEvaluator(
    metricName="weightedPrecision",
    predictionCol="prediction",
)

# A fixed seed pins the fold assignment, so avgMetrics and the selected model
# are reproducible across runs; 0 is the seed value randomSplit already uses.
# The number of folds is left at the library default.
cv = CrossValidator(
    estimator=nb,
    estimatorParamMaps=gridSearch,
    evaluator=cvEvaluater,
    seed=0,
)
cvModel = cv.fit(training)

In [0]:
cvModel.avgMetrics

[0.8348999722946979,
 0.7606120858663696,
 0.7551767121020323,
 0.7514207964139287,
 0.7492327754384247,
 0.7469183329377826]

In [0]:
from sklearn.metrics import classification_report
# Score the held-out test split with the best model found by cross-validation.
prediction = cvModel.transform(test)

# Collect label and prediction together in ONE action. Two separate collect()
# calls run the scoring job twice, and the two results are only usable together
# if both jobs happen to return rows in the same order.
scored_rows = prediction.select('label', 'prediction').collect()

# Hand sklearn plain scalars rather than Row objects. The prediction column is
# a double holding a class index, so it is cast to int to match the labels.
y_true = [int(row['label']) for row in scored_rows]
y_pred = [int(row['prediction']) for row in scored_rows]

# classification_report returns a preformatted string, hence the print().
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.95      0.80      1478
           1       0.91      0.62      0.74      1510
           2       0.98      0.93      0.96      1497

    accuracy                           0.83      4485
   macro avg       0.86      0.84      0.83      4485
weighted avg       0.86      0.83      0.83      4485

