![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/open-source-nlp/05.1.Text_Classification_Examples_in_SparkML_SparkNLP.ipynb)

# Text Classification with Spark NLP

In [None]:
# Install the pinned PySpark and Spark NLP versions this notebook was run with (-q keeps pip quiet).
! pip install -q pyspark==3.4.1 spark-nlp==5.3.2

In [None]:
import os
import sys

import sparknlp

from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

import pandas as pd

# Create the Spark session through Spark NLP's start helper.
spark = sparknlp.start()

# Report the library versions in use so the outputs below can be tied to them.
nlp_version, spark_version = sparknlp.version(), spark.version
print("Spark NLP version: ", nlp_version)
print("Apache Spark version: ", spark_version)

# Show the session object as the cell result.
spark

Spark NLP version:  5.3.2
Apache Spark version:  3.4.1


In [None]:
# Download the train and test news-category CSV files into the current working directory.
! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_train.csv
! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_test.csv

In [None]:
# newsDF = spark.read.parquet("data/news_category.parquet") >> if it is a parquet

# Read the training CSV; the first line holds the column names.
# `escape` is set to the quote character so that a doubled quote ("") inside a
# quoted field is read as a literal quote. With Spark's default escape
# character (backslash) such rows keep a stray leading `"` in `description`
# and can be cut short at the embedded quote.
# NOTE(review): re-run the downstream cells after this change; the captured
# outputs were produced with the default CSV options.
newsDF = spark.read \
      .option("header", True) \
      .option("quote", '"') \
      .option("escape", '"') \
      .csv("news_category_train.csv")

newsDF.show(truncate=50)

+--------+--------------------------------------------------+
|category|                                       description|
+--------+--------------------------------------------------+
|Business| Short sellers, Wall Street's dwindling band of...|
|Business| Private investment firm Carlyle Group, which h...|
|Business| Soaring crude prices plus worries about the ec...|
|Business| Authorities have halted oil export flows from ...|
|Business| Tearaway world oil prices, toppling records an...|
|Business| Stocks ended slightly higher on Friday but sta...|
|Business| Assets of the nation's retail money market mut...|
|Business| Retail sales bounced back a bit in July, and n...|
|Business|" After earning a PH.D. in Sociology, Danny Baz...|
|Business| Short sellers, Wall Street's dwindling  band o...|
|Business| Soaring crude prices plus worries  about the e...|
|Business| OPEC can do nothing to douse scorching  oil pr...|
|Business| Non OPEC oil exporters should consider  increa...|
|Busines

In [None]:
# Look at the first two rows as Row objects to see the untruncated text.
newsDF.head(2)

[Row(category='Business', description=" Short sellers, Wall Street's dwindling band of ultra cynics, are seeing green again."),
 Row(category='Business', description=' Private investment firm Carlyle Group, which has a reputation for making well timed and occasionally controversial plays in the defense industry, has quietly placed its bets on another part of the market.')]

In [None]:
from pyspark.sql.functions import col

# Class balance: number of rows per category, largest group first.
category_counts = newsDF.groupBy("category").count()
category_counts.orderBy(col("count").desc()).show()

+--------+-----+
|category|count|
+--------+-----+
|   World|30000|
|Sci/Tech|30000|
|  Sports|30000|
|Business|30000|
+--------+-----+



## Building Classification Pipeline

### LogReg with CountVectorizer

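The pipeline below chains Spark NLP annotators with Spark ML feature stages:

- `document_assembler`: wraps the raw `description` text as a Spark NLP document
- `tokenizer`: tokenization
- `normalizer`: token normalization
- `stopwords_cleaner`: removes stop words
- `stemmer`: reduces the remaining tokens to their stems
- `finisher`: converts the stem annotations into a plain array column (`token_features`)
- `countVectors`: count vectors (“document-term vectors”)
- `label_stringIdx`: encodes the `category` string as a numeric `label`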

In [None]:
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, OneHotEncoder, StringIndexer, VectorAssembler, SQLTransformer

In [None]:
%%time

document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
    .setInputCols("normalized")\
    .setOutputCol("cleanTokens")\
    .setCaseSensitive(False)

stemmer = Stemmer() \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("stem")

finisher = Finisher() \
    .setInputCols(["stem"]) \
    .setOutputCols(["token_features"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(False)

countVectors = CountVectorizer(inputCol="token_features", outputCol="features", vocabSize=10000, minDF=5)

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")

nlp_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        stemmer,
        finisher,
        countVectors,
        label_stringIdx
])

nlp_model = nlp_pipeline.fit(newsDF)

processed = nlp_model.transform(newsDF)

processed.count()

CPU times: user 314 ms, sys: 52.1 ms, total: 366 ms
Wall time: 37.7 s


120000

In [None]:
# Compare the raw text with the stemmed tokens produced by the pipeline.
processed.select("description", "token_features").show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                       description|                                    token_features|
+--------------------------------------------------+--------------------------------------------------+
| Short sellers, Wall Street's dwindling band of...|[short, seller, wall, street, dwindl, band, ult...|
| Private investment firm Carlyle Group, which h...|[privat, invest, firm, carlyl, group, reput, ma...|
| Soaring crude prices plus worries about the ec...|[soar, crude, price, plu, worri, economi, outlo...|
| Authorities have halted oil export flows from ...|[author, halt, oil, export, flow, main, pipelin...|
| Tearaway world oil prices, toppling records an...|[tearawai, world, oil, price, toppl, record, st...|
| Stocks ended slightly higher on Friday but sta...|[stock, end, slightli, higher, fridai, staye, n...|
| Assets of the nation's retail money market mut...|[asset, nati

In [None]:
# Full (untruncated) token lists of the first two rows.
processed.select("token_features").head(2)

[Row(token_features=['short', 'seller', 'wall', 'street', 'dwindl', 'band', 'ultra', 'cynic', 'see', 'green']),
 Row(token_features=['privat', 'invest', 'firm', 'carlyl', 'group', 'reput', 'make', 'well', 'time', 'occasion', 'controversi', 'plai', 'defens', 'industri', 'quietli', 'place', 'bet', 'anoth', 'part', 'market'])]

In [None]:
# Count vectors of the first two rows.
processed.select("features").head(2)

[Row(features=SparseVector(10000, {241: 1.0, 384: 1.0, 467: 1.0, 745: 1.0, 838: 1.0, 2227: 1.0, 3678: 1.0, 6131: 1.0, 6250: 1.0})),
 Row(features=SparseVector(10000, {26: 1.0, 38: 1.0, 46: 1.0, 68: 1.0, 117: 1.0, 155: 1.0, 182: 1.0, 197: 1.0, 246: 1.0, 304: 1.0, 320: 1.0, 407: 1.0, 428: 1.0, 621: 1.0, 867: 1.0, 2363: 1.0, 2829: 1.0, 2859: 1.0, 6868: 1.0}))]

In [None]:
# Text, feature vector and numeric label side by side.
processed.select("description", "features", "label").show()

+--------------------+--------------------+-----+
|         description|            features|label|
+--------------------+--------------------+-----+
| Short sellers, W...|(10000,[241,384,4...|  0.0|
| Private investme...|(10000,[26,38,46,...|  0.0|
| Soaring crude pr...|(10000,[15,28,46,...|  0.0|
| Authorities have...|(10000,[0,32,35,4...|  0.0|
| Tearaway world o...|(10000,[1,2,11,28...|  0.0|
| Stocks ended sli...|(10000,[3,13,14,2...|  0.0|
| Assets of the na...|(10000,[0,4,10,15...|  0.0|
| Retail sales bou...|(10000,[0,1,10,15...|  0.0|
|" After earning a...|(10000,[98,99,125...|  0.0|
| Short sellers, W...|(10000,[241,384,4...|  0.0|
| Soaring crude pr...|(10000,[15,28,46,...|  0.0|
| OPEC can do noth...|(10000,[0,24,28,2...|  0.0|
| Non OPEC oil exp...|(10000,[0,21,28,3...|  0.0|
| WASHINGTON/NEW Y...|(10000,[2,4,13,14...|  0.0|
| The dollar tumbl...|(10000,[2,14,72,1...|  0.0|
|If you think you ...|(10000,[74,76,143...|  0.0|
|The purchasing po...|(10000,[46,54,167...|  0.0|


In [None]:
# 70/30 train/test split; the fixed seed keeps the split reproducible.
trainingData, testData = processed.randomSplit([0.7, 0.3], seed=100)

print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 83967
Test Dataset Count: 36033


In [None]:
# Inspect the columns of the training split, including the Spark NLP annotation structs.
trainingData.printSchema()

root
 |-- category: string (nullable = true)
 |-- description: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |   

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the count vectors (uses the default `features`/`label` columns).
lr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0,
)

lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

# Preview ten test rows predicted as class 0, sorted by the `probability` column, descending.
class0_predictions = predictions.filter(predictions["prediction"] == 0)
(
    class0_predictions
    .select("description", "category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|" The major stock indexes e...|Business|[0.999091873162891,2.262691...|  0.0|       0.0|
|" U.S. stocks ended lower o...|Business|[0.9959067648492996,0.00142...|  0.0|       0.0|
| Consumer prices rose by a ...|   World|[0.995775953350137,0.001368...|  3.0|       0.0|
| Disappointing retail sales...|   World|[0.9954731744653694,0.00257...|  3.0|       0.0|
| Why do mutual funds charge...|Business|[0.9954269201581861,0.00325...|  0.0|       0.0|
|Banknorth Group's largest s...|Business|[0.9953602482231324,0.00227...|  0.0|       0.0|
|The absolute price of oil h...|Sci/Tech|[0.994984659975905,1.901799...|  1.0|       0.0|
|" Sky high oil prices are l...|Business|[0.9947590437405693,0.00250...|  0.0|       0.0|
|" U.S. st

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# No metricName is given, so the evaluator reports its default metric.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
)

# Score the count-vector logistic regression on the test predictions.
evaluator.evaluate(predictions)

0.8989373633709627

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Collect label and prediction in ONE Spark action. Spark does not guarantee
# row order across separate actions, so two independent toPandas() calls could
# in principle return misaligned rows; a single call also avoids running the
# prediction job twice.
lr_eval_pdf = predictions.select("label", "prediction").toPandas()

# Keep the single-column frames that the following cells expect.
y_true = lr_eval_pdf[["label"]]
y_pred = lr_eval_pdf[["prediction"]]

In [None]:
# How many test rows were assigned to each predicted class.
y_pred["prediction"].value_counts()

1.0    9222
2.0    9134
0.0    9105
3.0    8572
Name: prediction, dtype: int64

In [None]:
# Confusion matrix on integer class ids (true labels first, predictions second).
true_labels = y_true["label"].astype(int).tolist()
predicted_labels = y_pred["prediction"].astype(int).tolist()

cnf_matrix = confusion_matrix(true_labels, predicted_labels)
cnf_matrix

array([[7874,  845,   84,  328],
       [ 716, 7992,   93,  290],
       [  48,   90, 8660,   82],
       [ 467,  295,  297, 7872]])

In [None]:
# Per-class precision/recall/F1 followed by the overall accuracy.
report_text = classification_report(y_true.label, y_pred.prediction)
overall_accuracy = accuracy_score(y_true.label, y_pred.prediction)

print(report_text)
print(overall_accuracy)

              precision    recall  f1-score   support

         0.0       0.86      0.86      0.86      9131
         1.0       0.87      0.88      0.87      9091
         2.0       0.95      0.98      0.96      8880
         3.0       0.92      0.88      0.90      8931

    accuracy                           0.90     36033
   macro avg       0.90      0.90      0.90     36033
weighted avg       0.90      0.90      0.90     36033

0.8991202508811368


### LogReg with TFIDF

In [None]:
from pyspark.ml.feature import HashingTF, IDF

# Hashed term frequencies with 10,000 features.
hashingTF = HashingTF(
    inputCol="token_features",
    outputCol="rawFeatures",
    numFeatures=10000,
)

# minDocFreq: remove sparse terms
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
    minDocFreq=5,
)

# Same Spark NLP stages and label indexer as the CountVectorizer pipeline
# (objects defined in an earlier cell); only the featurization differs.
tfidf_stages = [
    document_assembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    stemmer,
    finisher,
    hashingTF,
    idf,
    label_stringIdx,
]
nlp_pipeline_tf = Pipeline(stages=tfidf_stages)

nlp_model_tf = nlp_pipeline_tf.fit(newsDF)
processed_tf = nlp_model_tf.transform(newsDF)

processed_tf.count()


120000

In [None]:
# Preview the raw text, the TF-IDF feature vector and the numeric label.
processed_tf.select('description','features','label').show()

+--------------------+--------------------+-----+
|         description|            features|label|
+--------------------+--------------------+-----+
| Short sellers, W...|(10000,[551,621,6...|  0.0|
| Private investme...|(10000,[157,831,9...|  0.0|
| Soaring crude pr...|(10000,[793,1738,...|  0.0|
| Authorities have...|(10000,[1548,1611...|  0.0|
| Tearaway world o...|(10000,[323,585,1...|  0.0|
| Stocks ended sli...|(10000,[453,609,6...|  0.0|
| Assets of the na...|(10000,[258,444,1...|  0.0|
| Retail sales bou...|(10000,[14,585,19...|  0.0|
|" After earning a...|(10000,[114,796,1...|  0.0|
| Short sellers, W...|(10000,[551,621,6...|  0.0|
| Soaring crude pr...|(10000,[793,1738,...|  0.0|
| OPEC can do noth...|(10000,[298,616,9...|  0.0|
| Non OPEC oil exp...|(10000,[616,1063,...|  0.0|
| WASHINGTON/NEW Y...|(10000,[360,832,1...|  0.0|
| The dollar tumbl...|(10000,[419,949,1...|  0.0|
|If you think you ...|(10000,[1041,2059...|  0.0|
|The purchasing po...|(10000,[901,2198,...|  0.0|


In [None]:
# 70/30 split of the TF-IDF features; seeded for reproducibility.
# This rebinds trainingData/testData from the previous section.
trainingData, testData = processed_tf.randomSplit([0.7, 0.3], seed=100)

print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 83967
Test Dataset Count: 36033


In [None]:
# Fit the `lr` estimator defined in an earlier cell on the TF-IDF training split.
lrModel_tf = lr.fit(trainingData)
predictions_tf = lrModel_tf.transform(testData)

# Preview ten test rows sorted by the `probability` column, descending.
(
    predictions_tf
    .select("description", "category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|" The major stock indexes e...|Business|[0.9985997794719548,5.73025...|  0.0|       0.0|
|The board overseeing MFS In...|Business|[0.9969443330650403,0.00241...|  0.0|       0.0|
|Former Enron Corp. executiv...|Business|[0.9964788037536277,0.00133...|  0.0|       0.0|
| Disappointing retail sales...|   World|[0.9961471989168428,0.00187...|  3.0|       0.0|
|" U.S. stocks ended lower o...|Business|[0.9950907314585086,0.00274...|  0.0|       0.0|
|" Mid priced clothing retai...|Business|[0.9938272821165663,0.00197...|  0.0|       0.0|
| Consumer prices rose by a ...|   World|[0.993651814252647,0.002007...|  3.0|       0.0|
| Oil prices reached a new h...|   World|[0.9930688447636813,7.14155...|  3.0|       0.0|
|" Sears, 

In [None]:
# Collect label and prediction in ONE Spark action so both columns come from
# the same rows in the same order (row order is not guaranteed across separate
# actions) and the prediction job runs only once.
tfidf_eval_pdf = predictions_tf.select("label", "prediction").toPandas()

y_true = tfidf_eval_pdf[["label"]]
y_pred = tfidf_eval_pdf[["prediction"]]

# Per-class metrics followed by overall accuracy for the TF-IDF logistic regression.
print(classification_report(y_true.label, y_pred.prediction))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85      9131
         1.0       0.85      0.86      0.86      9091
         2.0       0.93      0.96      0.95      8880
         3.0       0.90      0.87      0.89      8931

    accuracy                           0.89     36033
   macro avg       0.89      0.89      0.89     36033
weighted avg       0.89      0.89      0.89     36033

0.8855771098715067


### Random Forest with TFIDF

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on the TF-IDF features. A fixed seed is passed so the forest is
# reproducible between runs, matching the seeded train/test splits used
# throughout this notebook.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=4,
    maxBins=32,
    seed=100,
)

# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions_rf = rfModel.transform(testData)


In [None]:
# Preview ten random-forest predictions sorted by the `probability` column, descending.
rf_preview_columns = ["description", "category", "probability", "label", "prediction"]
(
    predictions_rf
    .select(rf_preview_columns)
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
| Japan's Nikkei average ros...|Business|[0.387888600873362,0.221694...|  0.0|       0.0|
|" Amazon.com Inc. &lt;A HRE...|Business|[0.38669064250683755,0.2342...|  0.0|       0.0|
|US stocks opened higher on ...|Business|[0.3847050771462904,0.22817...|  0.0|       0.0|
|" U.S. stocks ended lower o...|Business|[0.3817499960596381,0.23254...|  0.0|       0.0|
| Investors bid stocks highe...|   World|[0.3773107429507354,0.24689...|  3.0|       0.0|
| Best Buy Company Inc., the...|Business|[0.3762196991895761,0.25226...|  0.0|       0.0|
|Nut and snack-food company ...|Business|[0.3746249487873475,0.25013...|  0.0|       0.0|
|Crude oil prices hovered ne...|Business|[0.3738391266127697,0.21327...|  0.0|       0.0|
|" Blue ch

In [None]:
# Collect label and prediction in ONE Spark action so both columns come from
# the same rows in the same order (row order is not guaranteed across separate
# actions) and the prediction job runs only once.
rf_eval_pdf = predictions_rf.select("label", "prediction").toPandas()

y_true = rf_eval_pdf[["label"]]
y_pred = rf_eval_pdf[["prediction"]]

# Per-class metrics followed by overall accuracy for the random forest.
print(classification_report(y_true.label, y_pred.prediction))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.76      0.67      0.71      9131
         1.0       0.81      0.59      0.68      9091
         2.0       0.66      0.92      0.77      8880
         3.0       0.75      0.76      0.76      8931

    accuracy                           0.73     36033
   macro avg       0.75      0.73      0.73     36033
weighted avg       0.75      0.73      0.73     36033

0.7329392501318236


## LogReg with Spark NLP GloVe Word Embeddings

In [None]:
# --- Spark NLP annotators: text -> document -> tokens -> normalized -> no stop words ---
document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
    .setInputCols("normalized")\
    .setOutputCol("cleanTokens")\
    .setCaseSensitive(False)

# Load the pretrained GloVe model by explicit name and language. `pretrained`
# is called on the class (no throwaway instance is constructed), and naming
# "glove_100d" (the model the original call downloaded) keeps the notebook
# stable if the library's default model ever changes.
glove_embeddings = WordEmbeddingsModel.pretrained("glove_100d", "en") \
    .setInputCols(["document",'cleanTokens'])\
    .setOutputCol("embeddings")\
    .setCaseSensitive(False)

# Pool the word vectors of each document into one vector using the AVERAGE strategy.
embeddingsSentence = SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

# Convert the embedding annotations into Spark ML vectors; the annotation
# columns are kept (cleanAnnotations=False).
embeddings_finisher = EmbeddingsFinisher() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCols(["finished_sentence_embeddings"]) \
    .setOutputAsVector(True)\
    .setCleanAnnotations(False)

# The finisher emits an array of vectors per row; EXPLODE unpacks it into a
# single vector column named `features`, keeping all other columns.
explodeVectors = SQLTransformer(statement=
    "SELECT EXPLODE(finished_sentence_embeddings) AS features, * FROM __THIS__")

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")


nlp_pipeline_w2v = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        glove_embeddings,
        embeddingsSentence,
        embeddings_finisher,
        explodeVectors,
        label_stringIdx
])

# Fit on the full data set, then apply the fitted pipeline to the same data.
nlp_model_w2v = nlp_pipeline_w2v.fit(newsDF)

processed_w2v = nlp_model_w2v.transform(newsDF)

# The count is an action, so it forces the transformation to run.
processed_w2v.count()


glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


120000

In [None]:
# List the output columns; `features` comes first because the SQLTransformer selects it before `*`.
processed_w2v.columns

['features',
 'category',
 'description',
 'document',
 'token',
 'normalized',
 'cleanTokens',
 'embeddings',
 'sentence_embeddings',
 'finished_sentence_embeddings',
 'label']

In [None]:
# Preview the first five rows across all columns.
processed_w2v.show(5)

+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+-----+
|            features|category|         description|            document|               token|          normalized|         cleanTokens|          embeddings| sentence_embeddings|finished_sentence_embeddings|label|
+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+-----+
|[-0.1556767076253...|Business| Short sellers, W...|[{document, 0, 84...|[{token, 1, 5, Sh...|[{token, 1, 5, Sh...|[{token, 1, 5, Sh...|[{word_embeddings...|[{sentence_embedd...|        [[-0.155676707625...|  0.0|
|[-0.0144653050228...|Business| Private investme...|[{document, 0, 20...|[{token, 1, 7, Pr...|[{token, 1, 7, Pr...|[{token, 1, 7, Pr...|[{word_e

In [None]:
# The finisher output for the first row: a list holding one dense vector.
processed_w2v.select("finished_sentence_embeddings").head(1)

[Row(finished_sentence_embeddings=[DenseVector([-0.1557, 0.196, 0.1099, -0.3089, 0.16, 0.1672, -0.4649, -0.1101, -0.053, -0.1551, 0.0327, 0.0772, 0.1494, -0.1865, 0.1155, -0.0597, 0.0234, -0.0451, 0.2361, -0.0089, 0.3358, 0.0444, 0.0088, -0.1453, 0.2289, 0.0914, -0.1665, -0.3726, 0.1892, 0.121, 0.1993, -0.0239, -0.1346, 0.1159, 0.2086, 0.1285, 0.068, 0.1372, 0.3153, -0.1934, 0.0257, -0.226, -0.0984, 0.1139, 0.1413, -0.3743, 0.072, 0.1403, 0.251, -0.3106, 0.1709, -0.0697, -0.0554, 0.5123, -0.1873, -1.7784, 0.0295, 0.1014, 0.9268, 0.2129, -0.1354, 0.5739, -0.0679, 0.461, 0.4216, 0.0225, 0.4456, -0.2462, 0.1411, -0.3258, 0.0025, 0.0114, -0.3895, -0.1106, -0.261, 0.0147, 0.0781, 0.1268, -0.2042, -0.2278, 0.5096, 0.1539, -0.3515, -0.0102, -0.7003, -0.3872, -0.1668, -0.2405, -0.0766, 0.1396, -0.0592, -0.1568, -0.1606, -0.1371, -0.684, -0.2549, -0.1541, 0.1536, 0.2715, 0.3342])])]

In [None]:
# If SQLTransformer is not used inside the pipeline, the vectors can be exploded outside it instead.
from pyspark.sql.functions import explode

# Alternative, left disabled because the pipeline above already creates the `features` column:
# processed_w2v= processed_w2v.withColumn("features", explode(processed_w2v.finished_sentence_embeddings))

In [None]:
# The exploded `features` value of the first row: a single dense vector.
processed_w2v.select("features").head(1)

[Row(features=DenseVector([-0.1557, 0.196, 0.1099, -0.3089, 0.16, 0.1672, -0.4649, -0.1101, -0.053, -0.1551, 0.0327, 0.0772, 0.1494, -0.1865, 0.1155, -0.0597, 0.0234, -0.0451, 0.2361, -0.0089, 0.3358, 0.0444, 0.0088, -0.1453, 0.2289, 0.0914, -0.1665, -0.3726, 0.1892, 0.121, 0.1993, -0.0239, -0.1346, 0.1159, 0.2086, 0.1285, 0.068, 0.1372, 0.3153, -0.1934, 0.0257, -0.226, -0.0984, 0.1139, 0.1413, -0.3743, 0.072, 0.1403, 0.251, -0.3106, 0.1709, -0.0697, -0.0554, 0.5123, -0.1873, -1.7784, 0.0295, 0.1014, 0.9268, 0.2129, -0.1354, 0.5739, -0.0679, 0.461, 0.4216, 0.0225, 0.4456, -0.2462, 0.1411, -0.3258, 0.0025, 0.0114, -0.3895, -0.1106, -0.261, 0.0147, 0.0781, 0.1268, -0.2042, -0.2278, 0.5096, 0.1539, -0.3515, -0.0102, -0.7003, -0.3872, -0.1668, -0.2405, -0.0766, 0.1396, -0.0592, -0.1568, -0.1606, -0.1371, -0.684, -0.2549, -0.1541, 0.1536, 0.2715, 0.3342]))]

In [None]:
# NOTE(review): this cell repeats the previous one verbatim; it can probably be dropped.
processed_w2v.select("features").take(1)

[Row(features=DenseVector([-0.1557, 0.196, 0.1099, -0.3089, 0.16, 0.1672, -0.4649, -0.1101, -0.053, -0.1551, 0.0327, 0.0772, 0.1494, -0.1865, 0.1155, -0.0597, 0.0234, -0.0451, 0.2361, -0.0089, 0.3358, 0.0444, 0.0088, -0.1453, 0.2289, 0.0914, -0.1665, -0.3726, 0.1892, 0.121, 0.1993, -0.0239, -0.1346, 0.1159, 0.2086, 0.1285, 0.068, 0.1372, 0.3153, -0.1934, 0.0257, -0.226, -0.0984, 0.1139, 0.1413, -0.3743, 0.072, 0.1403, 0.251, -0.3106, 0.1709, -0.0697, -0.0554, 0.5123, -0.1873, -1.7784, 0.0295, 0.1014, 0.9268, 0.2129, -0.1354, 0.5739, -0.0679, 0.461, 0.4216, 0.0225, 0.4456, -0.2462, 0.1411, -0.3258, 0.0025, 0.0114, -0.3895, -0.1106, -0.261, 0.0147, 0.0781, 0.1268, -0.2042, -0.2278, 0.5096, 0.1539, -0.3515, -0.0102, -0.7003, -0.3872, -0.1668, -0.2405, -0.0766, 0.1396, -0.0592, -0.1568, -0.1606, -0.1371, -0.684, -0.2549, -0.1541, 0.1536, 0.2715, 0.3342]))]

In [None]:
# Text, averaged-embedding feature vector and numeric label side by side.
processed_w2v.select("description", "features", "label").show()


+--------------------+--------------------+-----+
|         description|            features|label|
+--------------------+--------------------+-----+
| Short sellers, W...|[-0.1556767076253...|  0.0|
| Private investme...|[-0.0144653050228...|  0.0|
| Soaring crude pr...|[0.10348732769489...|  0.0|
| Authorities have...|[-0.0355810523033...|  0.0|
| Tearaway world o...|[0.00647281948477...|  0.0|
| Stocks ended sli...|[0.20069395005702...|  0.0|
| Assets of the na...|[0.38012433052062...|  0.0|
| Retail sales bou...|[0.20352847874164...|  0.0|
|" After earning a...|[0.13536226749420...|  0.0|
| Short sellers, W...|[-0.1556767076253...|  0.0|
| Soaring crude pr...|[0.10348732769489...|  0.0|
| OPEC can do noth...|[0.20307321846485...|  0.0|
| Non OPEC oil exp...|[0.09010648727416...|  0.0|
| WASHINGTON/NEW Y...|[0.10887209326028...|  0.0|
| The dollar tumbl...|[0.05723679438233...|  0.0|
|If you think you ...|[0.11463439464569...|  0.0|
|The purchasing po...|[0.05890964344143...|  0.0|


In [None]:
# 70/30 split of the GloVe features; seeded for reproducibility.
# This rebinds trainingData/testData from the previous section.
trainingData, testData = processed_w2v.randomSplit([0.7, 0.3], seed=100)

print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 83967
Test Dataset Count: 36033


In [None]:
from pyspark.sql.functions import udf


@udf(returnType="long")
def num_nonzeros(v):
    """Return the number of non-zero entries of vector `v`, using its `numNonzeros()` method."""
    return v.numNonzeros()


# Keep only the test rows whose feature vector has at least one non-zero entry.
testData = testData.filter(num_nonzeros("features") != 0)

In [None]:
# Fit the `lr` estimator defined in an earlier cell on the GloVe training split.
lrModel_w2v = lr.fit(trainingData)

In [None]:
# Score the filtered test split with the GloVe-based model.
predictions_w2v = lrModel_w2v.transform(testData)

# Preview ten predictions sorted by the `probability` column, descending.
(
    predictions_w2v
    .select("description", "category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|THE stock market is rising,...|Business|[0.9897492276739599,0.00522...|  0.0|       0.0|
| Stocks fell on Tuesday as ...|Business|[0.9886070356102349,0.00579...|  0.0|       0.0|
|Swiss banking giant UBS AG ...|Business|[0.9885750808549393,0.00622...|  0.0|       0.0|
|Tokyo stocks plunged Tuesda...|Business|[0.9843279839213964,0.00453...|  0.0|       0.0|
| Financial services company...|Business|[0.9841911568755125,0.00924...|  0.0|       0.0|
|Hutchison Whampoa said it w...|Business|[0.9815622819151237,0.01703...|  0.0|       0.0|
| Citigroup Inc.  on Thursda...|Business|[0.9814307059494389,0.01481...|  0.0|       0.0|
|" Stocks fell sharply on We...|Business|[0.9806525515660117,0.00859...|  0.0|       0.0|
|Newmont M

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd

# Collect label and prediction in ONE Spark action so both columns come from
# the same rows in the same order (row order is not guaranteed across separate
# actions) and the prediction job runs only once.
w2v_eval_pdf = predictions_w2v.select("label", "prediction").toPandas()

y_true = w2v_eval_pdf[["label"]]
y_pred = w2v_eval_pdf[["prediction"]]

# Per-class metrics followed by overall accuracy for the GloVe logistic regression.
print(classification_report(y_true.label, y_pred.prediction))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.82      0.82      0.82      8976
         1.0       0.83      0.81      0.82      9034
         2.0       0.93      0.96      0.94      8972
         3.0       0.88      0.87      0.87      9050

    accuracy                           0.87     36032
   macro avg       0.86      0.87      0.87     36032
weighted avg       0.86      0.87      0.87     36032

0.8655916962699822


In [None]:
# Compare the raw text with the tokens left after normalization and stop-word removal.
processed_w2v.select("description", "cleanTokens.result").show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                       description|                                            result|
+--------------------------------------------------+--------------------------------------------------+
| Short sellers, Wall Street's dwindling band of...|[Short, sellers, Wall, Streets, dwindling, band...|
| Private investment firm Carlyle Group, which h...|[Private, investment, firm, Carlyle, Group, rep...|
| Soaring crude prices plus worries about the ec...|[Soaring, crude, prices, plus, worries, economy...|
| Authorities have halted oil export flows from ...|[Authorities, halted, oil, export, flows, main,...|
| Tearaway world oil prices, toppling records an...|[Tearaway, world, oil, prices, toppling, record...|
| Stocks ended slightly higher on Friday but sta...|[Stocks, ended, slightly, higher, Friday, staye...|
| Assets of the nation's retail money market mut...|[Assets, nat

## LogReg with Spark NLP BERT Embeddings

In [None]:
# --- Spark NLP annotators: text -> document -> tokens -> normalized -> no stop words ---
document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
    .setInputCols("normalized")\
    .setOutputCol("cleanTokens")\
    .setCaseSensitive(False)

# Pretrained BERT token embeddings.
# The chain no longer ends with a dangling line-continuation backslash, which
# only worked because a blank line happened to follow it.
# NOTE(review): a *cased* model is combined with setCaseSensitive(False);
# confirm that ignoring case is intended here.
bert_embeddings = BertEmbeddings.pretrained('bert_base_cased', 'en') \
    .setInputCols(["document",'cleanTokens'])\
    .setOutputCol("bert")\
    .setCaseSensitive(False)

# Pool the token vectors of each document into one vector using the AVERAGE strategy.
embeddingsSentence = SentenceEmbeddings() \
    .setInputCols(["document", "bert"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

# Convert the embedding annotations into Spark ML vectors; the annotation
# columns are kept (cleanAnnotations=False).
embeddings_finisher = EmbeddingsFinisher() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCols(["finished_sentence_embeddings"]) \
    .setOutputAsVector(True)\
    .setCleanAnnotations(False)

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")


# Unlike the GloVe pipeline, no SQLTransformer stage is included here, so the
# `features` column has to be created after the transform.
nlp_pipeline_bert = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        bert_embeddings,
        embeddingsSentence,
        embeddings_finisher,
        label_stringIdx
])

bert_base_cased download started this may take some time.
Approximate size to download 384.9 MB
[OK!]


In [None]:
%%time
# Work on a 10,000-row subset of the data for the BERT pipeline.
# NOTE(review): limit() is applied without an explicit ordering — confirm the subset is stable across runs.
limited_df = newsDF.limit(10000)

# Fit the BERT pipeline on the subset, then apply it to the same subset.
nlp_model_bert = nlp_pipeline_bert.fit(limited_df)

processed_bert = nlp_model_bert.transform(limited_df)

# The count is an action, so it forces the transformation to run.
processed_bert.count()

CPU times: user 3.75 s, sys: 503 ms, total: 4.25 s
Wall time: 11min 53s


10000

In [None]:
from pyspark.sql.functions import explode

# Unpack the finisher's array of vectors into a single vector column named `features`.
sentence_vectors = processed_bert["finished_sentence_embeddings"]
processed_bert = processed_bert.withColumn("features", explode(sentence_vectors))

# Text, BERT feature vector and numeric label side by side.
processed_bert.select("description", "features", "label").show()


+--------------------+--------------------+-----+
|         description|            features|label|
+--------------------+--------------------+-----+
| Short sellers, W...|[-0.0012148499954...|  2.0|
| Private investme...|[0.13144001364707...|  2.0|
| Soaring crude pr...|[-0.1905521452426...|  2.0|
| Authorities have...|[0.06882464885711...|  2.0|
| Tearaway world o...|[-0.1174714937806...|  2.0|
| Stocks ended sli...|[-0.0321817547082...|  2.0|
| Assets of the na...|[-0.2906664609909...|  2.0|
| Retail sales bou...|[-0.0385284274816...|  2.0|
|" After earning a...|[-0.0362812764942...|  2.0|
| Short sellers, W...|[-0.0012148499954...|  2.0|
| Soaring crude pr...|[-0.1905521452426...|  2.0|
| OPEC can do noth...|[-0.1431128382682...|  2.0|
| Non OPEC oil exp...|[0.01600184850394...|  2.0|
| WASHINGTON/NEW Y...|[0.14494352042675...|  2.0|
| The dollar tumbl...|[-0.1958881467580...|  2.0|
|If you think you ...|[0.27292791008949...|  2.0|
|The purchasing po...|[0.00386757543310...|  2.0|


In [None]:
# 70/30 split of the BERT features; seeded for reproducibility.
# This rebinds trainingData/testData from the previous section.
trainingData, testData = processed_bert.randomSplit([0.7, 0.3], seed=100)

print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 7033
Test Dataset Count: 2967


In [None]:
from pyspark.ml.classification import LogisticRegression

# L2-regularised logistic regression (elasticNetParam=0). The feature and label
# columns are spelled out; they are the estimator's default column names.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)

lrModel = lr.fit(trainingData)


In [None]:
from pyspark.sql.functions import udf


@udf(returnType="long")
def num_nonzeros(v):
    """Return ``v.numNonzeros()``, the count of non-zero entries of vector ``v``."""
    return v.numNonzeros()


# Drop test rows whose feature vector consists of zeros only.
testData = testData.filter(num_nonzeros("features") != 0)

In [None]:
predictions = lrModel.transform(testData)

# Show the 10 rows that sort first when ordering by "probability" descending.
(
    predictions
    .select("description", "category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|Wise Solutions has released...|Sci/Tech|[0.9972450751236089,2.61175...|  0.0|       0.0|
|Microsoft has a massive pat...|Sci/Tech|[0.9969462971935305,0.00172...|  0.0|       0.0|
|Microsoft Corp. has publish...|Sci/Tech|[0.996902163500599,4.952027...|  0.0|       0.0|
|A worm that has the capabil...|Sci/Tech|[0.9962258931382776,9.54592...|  0.0|       0.0|
|Microsoft has made availabl...|Sci/Tech|[0.9960325952969382,5.09907...|  0.0|       0.0|
|Release makes use of techno...|Sci/Tech|[0.9960054388386421,3.32463...|  0.0|       0.0|
|Macromedia hopes to boost u...|Sci/Tech|[0.9957959616421305,0.00123...|  0.0|       0.0|
|Software giant releases jus...|Sci/Tech|[0.9954924923514694,3.43349...|  0.0|       0.0|
| Sleepyca

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd

# Bring the scoring columns to the driver as a pandas frame.
df = (
    predictions
    .select("description", "category", "label", "prediction")
    .toPandas()
)

print(classification_report(df["label"], df["prediction"]))
print(accuracy_score(df["label"], df["prediction"]))

              precision    recall  f1-score   support

         0.0       0.84      0.84      0.84       796
         1.0       0.87      0.83      0.85       726
         2.0       0.81      0.80      0.81       738
         3.0       0.89      0.95      0.92       707

    accuracy                           0.85      2967
   macro avg       0.85      0.85      0.85      2967
weighted avg       0.85      0.85      0.85      2967

0.8533872598584429


## LogReg with ELMo Embeddings

In [None]:
%%time

# Raw text -> document annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
)

# Document -> tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Tokens -> normalized tokens.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

# Normalized tokens -> tokens without stop words (case-insensitive match).
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols("normalized")
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Pretrained ELMo model, using its "word_emb" pooling layer.
elmo_embeddings = (
    ElmoEmbeddings.pretrained()
    .setPoolingLayer("word_emb")
    .setInputCols(["document", "cleanTokens"])
    .setOutputCol("elmo")
)

# Average the token embeddings into one embedding per document.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "elmo"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Expose the embeddings as vectors, keeping the annotation columns.
embeddings_finisher = (
    EmbeddingsFinisher()
    .setInputCols(["sentence_embeddings"])
    .setOutputCols(["finished_sentence_embeddings"])
    .setOutputAsVector(True)
    .setCleanAnnotations(False)
)

# Map the string category to a numeric "label" column.
label_stringIdx = StringIndexer(inputCol="category", outputCol="label")

nlp_pipeline_elmo = Pipeline(stages=[
    document_assembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    elmo_embeddings,
    embeddingsSentence,
    embeddings_finisher,
    label_stringIdx,
])

# Fit on the full training frame, transform it, and report the row count.
nlp_model_elmo = nlp_pipeline_elmo.fit(newsDF)
processed_elmo = nlp_model_elmo.transform(newsDF)

processed_elmo.count()


elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]
CPU times: user 323 ms, sys: 53.5 ms, total: 377 ms
Wall time: 54.8 s


120000

In [None]:
# Split the raw frame 70/30 with a fixed seed; features are computed per split below.
trainingData, testData = newsDF.randomSplit([0.7, 0.3], seed=100)

In [None]:
# Run the fitted ELMo pipeline over the training split.
processed_trainingData = nlp_model_elmo.transform(trainingData)

# Row count of the transformed training split (displayed as the cell result).
processed_trainingData.count()

83967

In [None]:
# Run the fitted ELMo pipeline over the test split.
processed_testData = nlp_model_elmo.transform(testData)

# Row count of the transformed test split (displayed as the cell result).
processed_testData.count()

36033

In [None]:
processed_trainingData.columns

['category',
 'description',
 'document',
 'token',
 'normalized',
 'cleanTokens',
 'elmo',
 'sentence_embeddings',
 'finished_sentence_embeddings',
 'label']

In [None]:
from pyspark.sql.functions import explode

# Unpack the finished embeddings array into a "features" vector column,
# for the test split and for the training split.
processed_testData = processed_testData.withColumn(
    "features",
    explode(processed_testData["finished_sentence_embeddings"]),
)

processed_trainingData = processed_trainingData.withColumn(
    "features",
    explode(processed_trainingData["finished_sentence_embeddings"]),
)

In [None]:
from pyspark.sql.functions import udf


@udf(returnType="long")
def num_nonzeros(v):
    """Return ``v.numNonzeros()``, the count of non-zero entries of vector ``v``."""
    return v.numNonzeros()


# Drop test rows whose feature vector consists of zeros only.
processed_testData = processed_testData.filter(num_nonzeros("features") != 0)

In [None]:
%%time

from pyspark.ml.classification import LogisticRegression

# Same estimator settings as the other LogReg sections; the feature and label
# columns are spelled out (they are the estimator's default column names).
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)

lrModel = lr.fit(processed_trainingData)


CPU times: user 7.38 s, sys: 869 ms, total: 8.25 s
Wall time: 19min 12s


In [None]:
processed_trainingData.columns

['category',
 'description',
 'document',
 'token',
 'normalized',
 'cleanTokens',
 'elmo',
 'sentence_embeddings',
 'finished_sentence_embeddings',
 'label',
 'features']

In [None]:
predictions = lrModel.transform(processed_testData)

# Show the 10 rows that sort first when ordering by "probability" descending.
(
    predictions
    .select("description", "category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(10, truncate=30)
)

+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|" Costco Wholesale Corp. &l...|Business|[0.995254029013999,0.003926...|  0.0|       0.0|
|" Exxon Mobil Corp. &lt;A H...|Business|[0.9935440409389442,0.00405...|  0.0|       0.0|
|" Exxon Mobil Corp. &lt;A H...|Business|[0.9935440409389442,0.00405...|  0.0|       0.0|
| Falling oil prices and str...|   World|[0.9930267682263293,0.00645...|  3.0|       0.0|
| ChevronTexaco Corp., the N...|Business|[0.9928294172661469,0.00535...|  0.0|       0.0|
| U.S. blue chips fell on We...|Business|[0.9919443128183406,0.00567...|  0.0|       0.0|
| Kmart Holding Corporation,...|Business|[0.9914027294702593,0.00687...|  0.0|       0.0|
|" Halliburton Co. &lt;A HRE...|Business|[0.991119081571484,0.004886...|  0.0|       0.0|
|" Hallibu

In [None]:
# Collect the scoring columns into pandas for the sklearn metrics below.
df = (
    predictions
    .select("description", "category", "label", "prediction")
    .toPandas()
)

In [None]:
df.shape

(36033, 4)

In [None]:
df.head()

Unnamed: 0,description,category,label,prediction
0,A drop in oil prices and upbeat outlooks fro...,Business,0.0,0.0
1,Air Canada creditors including a General Ele...,Business,0.0,0.0
2,AirTran (AAI) said late Thursday it regrets ...,Business,0.0,3.0
3,Although published reports yesterday claimed...,Business,0.0,0.0
4,Americans paid their credit card bills on ti...,Business,0.0,0.0


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Per-class precision/recall/F1, then overall accuracy.
print(classification_report(df["label"], df["prediction"]))
print(accuracy_score(df["label"], df["prediction"]))

              precision    recall  f1-score   support

         0.0       0.83      0.83      0.83      9131
         1.0       0.84      0.83      0.83      9091
         2.0       0.93      0.96      0.94      8880
         3.0       0.88      0.87      0.87      8931

    accuracy                           0.87     36033
   macro avg       0.87      0.87      0.87     36033
weighted avg       0.87      0.87      0.87     36033

0.870008048178059


## LogReg with Universal Sentence Encoder

In [None]:
# Default pretrained Universal Sentence Encoder, reading "document" annotations.
useEmbeddings = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols("document")
    .setOutputCol("use_embeddings")
)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [None]:
# Raw text -> document annotation.
document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")

# Reuse the encoder obtained with `UniversalSentenceEncoder.pretrained()` in the
# previous cell. The earlier version re-loaded it from a hard-coded cache
# directory under /root, which ties the notebook to one machine layout and to
# one exact model build.
loaded_useEmbeddings = useEmbeddings \
    .setInputCols("document") \
    .setOutputCol("use_embeddings")

# Expose the sentence embeddings as vectors, keeping the annotation columns.
embeddings_finisher = EmbeddingsFinisher() \
    .setInputCols(["use_embeddings"]) \
    .setOutputCols(["finished_use_embeddings"]) \
    .setOutputAsVector(True)\
    .setCleanAnnotations(False)

# Map the string category to a numeric "label" column.
label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")

use_pipeline = Pipeline(
    stages=[
        document_assembler,
        loaded_useEmbeddings,
        embeddings_finisher,
        label_stringIdx
])

# Fit on the full training frame and compute its USE features.
use_df = use_pipeline.fit(newsDF).transform(newsDF)

In [None]:
use_df.select('finished_use_embeddings').show(3)

+-----------------------+
|finished_use_embeddings|
+-----------------------+
|   [[0.0441501699388...|
|   [[0.0844451263546...|
|   [[0.0426647029817...|
+-----------------------+
only showing top 3 rows



In [None]:
from pyspark.sql.functions import explode

# Unpack the finished USE embeddings array into a "features" vector column.
use_df = use_df.withColumn(
    "features",
    explode(use_df["finished_use_embeddings"]),
)

In [None]:
use_df.show(2)

+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
|category|         description|            document|      use_embeddings|finished_use_embeddings|label|            features|
+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
|Business| Short sellers, W...|[{document, 0, 84...|[{sentence_embedd...|   [[0.0441501699388...|  0.0|[0.04415016993880...|
|Business| Private investme...|[{document, 0, 20...|[{sentence_embedd...|   [[0.0844451263546...|  0.0|[0.08444512635469...|
+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
only showing top 2 rows



In [None]:
# 70/30 train/test split; the fixed seed keeps the split reproducible.
trainingData, testData = use_df.randomSplit([0.7, 0.3], seed=100)

train_rows = trainingData.count()
test_rows = testData.count()
print("Training Dataset Count: " + str(train_rows))
print("Test Dataset Count: " + str(test_rows))

Training Dataset Count: 83967
Test Dataset Count: 36033


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd

from pyspark.ml.classification import LogisticRegression

# Same estimator settings as the other LogReg sections; the feature and label
# columns are spelled out (they are the estimator's default column names).
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

# Among rows predicted as class 0, show the 10 that sort first when ordering
# by "probability" descending.
(
    predictions
    .where(predictions["prediction"] == 0)
    .select("description", "category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|Amid talk of a possible liq...|Business|[0.9838418958725589,0.00641...|  0.0|       0.0|
|Albertsons Inc., one of the...|Business|[0.9828145512629816,0.00860...|  0.0|       0.0|
|Prudential Financial Inc., ...|Business|[0.9821409995425612,0.01061...|  0.0|       0.0|
|Fannie Mae, the largest US ...|Business|[0.9820438961267436,0.00970...|  0.0|       0.0|
|" Stocks were seen opening ...|Business|[0.9820075348032326,0.00546...|  0.0|       0.0|
| U.S. blue chip stocks fell...|Business|[0.9810072366995876,0.01046...|  0.0|       0.0|
|Financial services company ...|Business|[0.9806877870270011,0.00947...|  0.0|       0.0|
|ExxonMobil Corp. and Royal ...|Business|[0.9806061001659467,0.00650...|  0.0|       0.0|
|Finance f

In [None]:
# Collect the scoring columns into pandas. No 'result' column is selected here,
# so nothing needs to be unwrapped before computing the metrics.
df = predictions.select('description','category','label','prediction').toPandas()

In [None]:
df.head()

Unnamed: 0,description,category,label,prediction
0,A drop in oil prices and upbeat outlooks fro...,Business,0.0,0.0
1,Air Canada creditors including a General Ele...,Business,0.0,0.0
2,AirTran (AAI) said late Thursday it regrets ...,Business,0.0,0.0
3,Although published reports yesterday claimed...,Business,0.0,0.0
4,Americans paid their credit card bills on ti...,Business,0.0,0.0


In [None]:
# Per-class precision/recall/F1, then overall accuracy.
print(classification_report(df["label"], df["prediction"]))
print(accuracy_score(df["label"], df["prediction"]))

              precision    recall  f1-score   support

         0.0       0.83      0.84      0.83      9131
         1.0       0.85      0.84      0.85      9091
         2.0       0.95      0.97      0.96      8880
         3.0       0.89      0.88      0.89      8931

    accuracy                           0.88     36033
   macro avg       0.88      0.88      0.88     36033
weighted avg       0.88      0.88      0.88     36033

0.8814142591513335


### Train on the entire dataset

In [None]:
from pyspark.ml.classification import LogisticRegression

# Same estimator settings as before, now fitted on every row of `use_df`.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)

lrModel = lr.fit(use_df)

In [None]:
# Held-out test file downloaded at the top of the notebook (first row is the header).
test_df = (
    spark.read
    .option("header", True)
    .csv("./news_category_test.csv")
)

In [None]:
from pyspark.ml import PipelineModel

# Do not re-fit the pipeline on the test set: its last stage is a StringIndexer,
# so the category -> label mapping would be learned from the test data instead
# of from the data the classifier was trained on. Fit only the indexer, on the
# training frame, and reuse the remaining stages unchanged.
*feature_stages, label_indexer = use_pipeline.getStages()
label_indexer_model = label_indexer.fit(newsDF)

test_df = PipelineModel(stages=[*feature_stages, label_indexer_model]).transform(test_df)

In [None]:
# Unpack the finished USE embeddings array into a "features" vector column.
test_df = test_df.withColumn(
    "features",
    explode(test_df["finished_use_embeddings"]),
)

In [None]:
test_df.show(2)

+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
|category|         description|            document|      use_embeddings|finished_use_embeddings|label|            features|
+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
|Business|Unions representi...|[{document, 0, 12...|[{sentence_embedd...|   [[0.0129975201562...|  0.0|[0.01299752015620...|
|Sci/Tech| TORONTO, Canada ...|[{document, 0, 22...|[{sentence_embedd...|   [[0.0019998527131...|  1.0|[0.00199985271319...|
+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
only showing top 2 rows



In [None]:
from pyspark.sql.functions import col

# Rows per (category, label) pair, largest group first.
(
    test_df
    .groupBy("category", "label")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+--------+-----+-----+
|category|label|count|
+--------+-----+-----+
|Sci/Tech|  1.0| 1900|
|  Sports|  2.0| 1900|
|   World|  3.0| 1900|
|Business|  0.0| 1900|
+--------+-----+-----+



In [None]:
# Score the held-out test file with the model fitted on the whole of `use_df`.
predictions = lrModel.transform(test_df)

In [None]:
# Collect the scoring columns into pandas for the sklearn metrics below.
df = (
    predictions
    .select("description", "category", "label", "prediction")
    .toPandas()
)

In [None]:
# Overwrite "label" with a fixed category -> index mapping.
df["label"] = df["category"].replace(
    {
        "Business": 0.0,
        "Sci/Tech": 1.0,
        "Sports": 2.0,
        "World": 3.0,
    }
)

In [None]:
df.head()

Unnamed: 0,description,category,label,prediction
0,Unions representing workers at Turner Newall...,Business,0.0,0.0
1,"TORONTO, Canada A second team of rocketeer...",Sci/Tech,1.0,1.0
2,A company founded by a chemistry researcher a...,Sci/Tech,1.0,1.0
3,It's barely dawn when Mike Fitzpatrick starts...,Sci/Tech,1.0,1.0
4,Southern California's smog fighting agency we...,Sci/Tech,1.0,0.0


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Per-class precision/recall/F1, then overall accuracy, on the held-out file.
print(classification_report(df["label"], df["prediction"]))
print(accuracy_score(df["label"], df["prediction"]))

              precision    recall  f1-score   support

         0.0       0.83      0.83      0.83      1900
         1.0       0.84      0.85      0.85      1900
         2.0       0.95      0.97      0.96      1900
         3.0       0.90      0.87      0.89      1900

    accuracy                           0.88      7600
   macro avg       0.88      0.88      0.88      7600
weighted avg       0.88      0.88      0.88      7600

0.8802631578947369


# ClassifierDL

In [None]:
# actual content is inside description column
document = DocumentAssembler()\
    .setInputCol("description")\
    .setOutputCol("document")

# Resolve the encoder through `pretrained()` rather than a hard-coded cache
# directory under /root: the absolute path only exists on hosts with that exact
# home directory and model build. The download log earlier in this notebook
# shows the default pretrained model is `tfhub_use`.
use = UniversalSentenceEncoder.pretrained()\
    .setInputCols("document")\
    .setOutputCol("sentence_embeddings")

# the classes/labels/categories are in category column
classsifierdl = ClassifierDLApproach()\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("class")\
    .setLabelColumn("category")\
    .setMaxEpochs(5)\
    .setEnableOutputLogs(True)

pipeline = Pipeline(
    stages = [
        document,
        use,
        classsifierdl
])

In [None]:
# 70/30 train/test split of the raw frame; the fixed seed keeps it reproducible.
trainingData, testData = newsDF.randomSplit([0.7, 0.3], seed=100)

train_rows = trainingData.count()
test_rows = testData.count()
print("Training Dataset Count: " + str(train_rows))
print("Test Dataset Count: " + str(test_rows))

Training Dataset Count: 83967
Test Dataset Count: 36033


In [None]:
# Train the ClassifierDL pipeline on the training split.
pipelineModel = pipeline.fit(trainingData)

In [None]:
# Persist only the trained classifier, which is the last of the three pipeline
# stages, replacing any previous copy on disk.
trained_classifier = pipelineModel.stages[-1]
trained_classifier.write().overwrite().save('classifierDL_model_5e')

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Predict on the test split and collect the outcome into pandas.
df = (
    pipelineModel
    .transform(testData)
    .select("category", "description", "class.result")
    .toPandas()
)

# "result" holds a collection per row; keep its first entry as the predicted label.
df["result"] = df["result"].map(lambda labels: labels[0])

print(classification_report(df["category"], df["result"]))
print(accuracy_score(df["category"], df["result"]))

              precision    recall  f1-score   support

    Business       0.84      0.87      0.85      9131
    Sci/Tech       0.86      0.87      0.86      9091
      Sports       0.95      0.98      0.96      8880
       World       0.93      0.86      0.89      8931

    accuracy                           0.89     36033
   macro avg       0.89      0.89      0.89     36033
weighted avg       0.89      0.89      0.89     36033

0.8929037271390114


## Loading the trained classifier from disk

In [None]:
# Show where the sparknlp package is installed (environment sanity check).
import sparknlp
sparknlp.__path__

['/usr/local/lib/python3.10/dist-packages/sparknlp']

In [None]:
# Re-read the training CSV (first row is the header).
trainDataset = spark.read.csv("news_category_train.csv", header=True)

In [None]:
trainDataset.count()

120000

In [None]:
trainingData.count()

83967

In [None]:
# Raw text -> document annotation.
document = DocumentAssembler()\
    .setInputCol("description")\
    .setOutputCol("document")

# Resolve the encoder through `pretrained()` rather than a hard-coded cache
# directory under /root: the absolute path only exists on hosts with that exact
# home directory and model build.
use = UniversalSentenceEncoder.pretrained()\
    .setInputCols("document")\
    .setOutputCol("sentence_embeddings")

# Trained classifier saved earlier in this notebook.
classsifierdlmodel = ClassifierDLModel.load('classifierDL_model_5e')

pipeline = Pipeline(
    stages = [
        document,
        use,
        classsifierdlmodel
])

In [None]:
# Run the reloaded classifier on 10 test rows and compare with the true category.
(
    pipeline
    .fit(testData.limit(10))
    .transform(testData.limit(10))
    .select("category", "description", "class.result")
    .show(10, truncate=50)
)

+--------+--------------------------------------------------+----------+
|category|                                       description|    result|
+--------+--------------------------------------------------+----------+
|Business|  A drop in oil prices and upbeat outlooks from...|[Business]|
|Business|  Air Canada creditors including a General Elec...|[Business]|
|Business|  AirTran (AAI) said late Thursday it regrets t...|[Business]|
|Business|  Although published reports yesterday claimed ...|[Business]|
|Business|  Americans paid their credit card bills on tim...|[Business]|
|Business|  As Apple Computer shares trade near their hig...|[Sci/Tech]|
|Business|  Asian stocks closed mainly higher Tuesday, le...|[Business]|
|Business|  Bank of America chief executive Ken Lewis wil...|[Business]|
|Business|  Big gains in technology stocks helped the Nas...|[Business]|
|Business|  Cargo operations at the nation's largest port...|[Business]|
+--------+-----------------------------------------

In [None]:
# Fit on a one-row placeholder frame to obtain a PipelineModel that
# LightPipeline can wrap, then classify a single raw string.
# NOTE(review): the placeholder column is named "text" while the
# DocumentAssembler reads "description" — presumably harmless because no stage
# of this pipeline is trained here; confirm.
lm = LightPipeline(pipeline.fit(spark.createDataFrame([[""]]).toDF("text")))
lm.annotate('In its first two years, the UK dedicated card companies have surge')

{'document': ['In its first two years, the UK dedicated card companies have surge'],
 'sentence_embeddings': ['In its first two years, the UK dedicated card companies have surge'],
 'class': ['Sci/Tech']}

In [None]:
text='''
Fearing the fate of Italy, the centre-right government has threatened to be merciless with those who flout tough restrictions. As of Wednesday it will also include all shops being closed across Greece, with the exception of supermarkets. Banks, pharmacies, pet-stores, mobile phone stores, opticians, bakers, mini-markets, couriers and food delivery outlets are among the few that will also be allowed to remain open.
'''

In [None]:
# Rebuild the LightPipeline exactly as in the earlier LightPipeline cell
# (fit on a one-row placeholder frame) and classify the multi-sentence `text`.
lm = LightPipeline(pipeline.fit(spark.createDataFrame([[""]]).toDF("text")))

lm.annotate(text)

{'document': ['\nFearing the fate of Italy, the centre-right government has threatened to be merciless with those who flout tough restrictions. As of Wednesday it will also include all shops being closed across Greece, with the exception of supermarkets. Banks, pharmacies, pet-stores, mobile phone stores, opticians, bakers, mini-markets, couriers and food delivery outlets are among the few that will also be allowed to remain open.\n'],
 'sentence_embeddings': ['\nFearing the fate of Italy, the centre-right government has threatened to be merciless with those who flout tough restrictions. As of Wednesday it will also include all shops being closed across Greece, with the exception of supermarkets. Banks, pharmacies, pet-stores, mobile phone stores, opticians, bakers, mini-markets, couriers and food delivery outlets are among the few that will also be allowed to remain open.\n'],
 'class': ['Business']}

# ClassifierDL + GloVe + Basic Text Processing

In [None]:
# Raw text -> document annotation.
document = DocumentAssembler()\
    .setInputCol("description")\
    .setOutputCol("document")

# Document -> tokens.
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# Tokens -> lemmas, using the pretrained 'lemma_antbnc' model.
lemma = LemmatizerModel.pretrained('lemma_antbnc') \
    .setInputCols(["token"]) \
    .setOutputCol("lemma")

# Default pretrained word embeddings (the download log reports glove_100d).
# `pretrained()` is called on the class: instantiating a throwaway
# WordEmbeddingsModel first only creates an object that is discarded.
glove_embeddings = WordEmbeddingsModel.pretrained() \
    .setInputCols(["document",'lemma'])\
    .setOutputCol("embeddings")\
    .setCaseSensitive(False)

lemma_pipeline = Pipeline(
    stages=[
        document,
        tokenizer,
        lemma,
        glove_embeddings
])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
# Preview the lemma/embedding annotations on 1,000 training rows.
(
    lemma_pipeline
    .fit(trainingData.limit(1000))
    .transform(trainingData.limit(1000))
    .show(truncate=30)
)

+--------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+
|category|                   description|                      document|                         token|                         lemma|                    embeddings|
+--------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+
|Business|  A  $120 million fine levi...|[{document, 0, 278,   A  $1...|[{token, 2, 2, A, {sentence...|[{token, 2, 2, A, {sentence...|[{word_embeddings, 2, 2, A,...|
|Business|  A Colorado assistant stor...|[{document, 0, 144,   A Col...|[{token, 2, 2, A, {sentence...|[{token, 2, 2, A, {sentence...|[{word_embeddings, 2, 2, A,...|
|Business|  A Pennsylvania brewery is...|[{document, 0, 132,   A Pen...|[{token, 2, 2, A, {sentence...|[{token, 2, 2, A, {sentence...|[{word_embeddings, 2, 2, A,...|
|Bus

In [None]:
# Raw text -> document annotation.
document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")

# Document -> tokens.
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# Tokens -> normalized tokens.
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

# Normalized tokens -> tokens without stop words (case-insensitive match).
stopwords_cleaner = StopWordsCleaner()\
    .setInputCols("normalized")\
    .setOutputCol("cleanTokens")\
    .setCaseSensitive(False)

# Cleaned tokens -> lemmas, using the pretrained 'lemma_antbnc' model.
lemma = LemmatizerModel.pretrained('lemma_antbnc') \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

# Default pretrained word embeddings (the download log reports glove_100d).
# `pretrained()` is called on the class: instantiating a throwaway
# WordEmbeddingsModel first only creates an object that is discarded.
glove_embeddings = WordEmbeddingsModel.pretrained() \
    .setInputCols(["document",'lemma'])\
    .setOutputCol("embeddings")\
    .setCaseSensitive(False)

# Average the token embeddings into one embedding per document.
embeddingsSentence = SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

# Classifier trained on the "category" column for 5 epochs.
classsifierdl = ClassifierDLApproach()\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("class")\
    .setLabelColumn("category")\
    .setMaxEpochs(5)\
    .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        lemma,
        glove_embeddings,
        embeddingsSentence,
        classsifierdl
])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
!rm -rf classifier_dl_pipeline_glove

In [None]:
# Persist the (unfitted) pipeline definition. Writing with overwrite() keeps
# this cell re-runnable on its own: a plain save() fails when the target
# directory already exists.
clf_pipeline.write().overwrite().save('classifier_dl_pipeline_glove')

In [None]:
# Train the GloVe + ClassifierDL pipeline on the training split.
clf_pipelineModel = clf_pipeline.fit(trainingData)

In [None]:
# Predict on the test split and collect the outcome into pandas.
df = (
    clf_pipelineModel
    .transform(testData)
    .select("category", "description", "class.result")
    .toPandas()
)

df.head()

Unnamed: 0,category,description,result
0,Business,A drop in oil prices and upbeat outlooks fro...,[Business]
1,Business,Air Canada creditors including a General Ele...,[Business]
2,Business,AirTran (AAI) said late Thursday it regrets ...,[Business]
3,Business,Although published reports yesterday claimed...,[Business]
4,Business,Americans paid their credit card bills on ti...,[Business]


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# "result" holds a collection per row; keep its first entry as the predicted
# label. `df` is created in an earlier cell, so values that are already plain
# strings are left untouched: re-running this cell would otherwise index into
# the label itself and reduce e.g. "Business" to "B".
df['result'] = df['result'].apply(lambda x: x if isinstance(x, str) else x[0])

print(classification_report(df.category, df.result))

print(accuracy_score(df.category, df.result))

              precision    recall  f1-score   support

    Business       0.84      0.85      0.84      9131
    Sci/Tech       0.84      0.86      0.85      9091
      Sports       0.94      0.96      0.95      8880
       World       0.91      0.86      0.89      8931

    accuracy                           0.88     36033
   macro avg       0.88      0.88      0.88     36033
weighted avg       0.88      0.88      0.88     36033

0.8832736658063441


In [None]:
import pandas as pd

In [None]:
# Collect the whole training frame into a pandas DataFrame.
news_df = newsDF.toPandas()

In [None]:
news_df.head()

Unnamed: 0,category,description
0,Business,"Short sellers, Wall Street's dwindling band o..."
1,Business,"Private investment firm Carlyle Group, which ..."
2,Business,Soaring crude prices plus worries about the e...
3,Business,Authorities have halted oil export flows from...
4,Business,"Tearaway world oil prices, toppling records a..."


In [None]:
# Write the pandas copy to a local CSV without the index column.
news_df.to_csv('news_dataset.csv', index=False)

In [None]:
# Raw text -> document annotation.
document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")

# Document -> tokens.
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# Tokens -> normalized tokens.
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

# Normalized tokens -> tokens without stop words (case-insensitive match).
stopwords_cleaner = StopWordsCleaner()\
    .setInputCols("normalized")\
    .setOutputCol("cleanTokens")\
    .setCaseSensitive(False)

# Cleaned tokens -> lemmas, using the pretrained 'lemma_antbnc' model.
lemma = LemmatizerModel.pretrained('lemma_antbnc') \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

# Default pretrained word embeddings (the download log reports glove_100d).
# `pretrained()` is called on the class: instantiating a throwaway
# WordEmbeddingsModel first only creates an object that is discarded.
glove_embeddings = WordEmbeddingsModel.pretrained() \
    .setInputCols(["document",'lemma'])\
    .setOutputCol("embeddings")\
    .setCaseSensitive(False)

# Declared here as well, with the same settings as in the classifier pipeline
# cell, so this cell no longer depends on a variable left over from that cell.
embeddingsSentence = SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

txt_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        lemma,
        glove_embeddings,
        embeddingsSentence
])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
# Fit the text-preparation pipeline on a single test row to get a PipelineModel.
txt_pipelineModel = txt_pipeline.fit(testData.limit(1))

In [None]:
# Persist the fitted text-preparation pipeline. Writing with overwrite() keeps
# this cell re-runnable: a plain save() fails when the target directory already
# exists from a previous run.
txt_pipelineModel.write().overwrite().save('text_prep_pipeline_glove')