![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)



[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/5.1_Text_classification_examples_in_SparkML_SparkNLP.ipynb)

# Text Classification with Spark NLP

In [None]:
! pip install -q pyspark==3.2.0 spark-nlp


<b>  if you want to work with Spark 2.3 </b>
```
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://archive.apache.org/dist/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz

!tar xf spark-2.3.0-bin-hadoop2.7.tgz
!pip install -q findspark

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
os.environ["SPARK_HOME"] = "/content/spark-2.3.0-bin-hadoop2.7"
! java -version

import findspark
findspark.init()
from pyspark.sql import SparkSession

! pip install --ignore-installed -q spark-nlp==2.7.5

import sparknlp

spark = sparknlp.start(spark23=True)
```

In [1]:
import os
import sys

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

import pandas as pd


In [2]:
import sparknlp
spark = sparknlp.start(spark32 = True)

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

spark

Spark NLP version:  3.4.0
Apache Spark version:  3.2.0


In [None]:
! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_train.csv
! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_test.csv

In [8]:
# newsDF = spark.read.parquet("data/news_category.parquet") >> if it is a parquet

newsDF = spark.read \
      .option("header", True) \
      .csv("news_category_train.csv")

newsDF.show(truncate=50)

+--------+--------------------------------------------------+
|category|                                       description|
+--------+--------------------------------------------------+
|Business| Short sellers, Wall Street's dwindling band of...|
|Business| Private investment firm Carlyle Group, which h...|
|Business| Soaring crude prices plus worries about the ec...|
|Business| Authorities have halted oil export flows from ...|
|Business| Tearaway world oil prices, toppling records an...|
|Business| Stocks ended slightly higher on Friday but sta...|
|Business| Assets of the nation's retail money market mut...|
|Business| Retail sales bounced back a bit in July, and n...|
|Business|" After earning a PH.D. in Sociology, Danny Baz...|
|Business| Short sellers, Wall Street's dwindling  band o...|
|Business| Soaring crude prices plus worries  about the e...|
|Business| OPEC can do nothing to douse scorching  oil pr...|
|Business| Non OPEC oil exporters should consider  increa...|
|Busines

In [4]:
newsDF.take(2)

[Row(category='Business', description=" Short sellers, Wall Street's dwindling band of ultra cynics, are seeing green again."),
 Row(category='Business', description=' Private investment firm Carlyle Group, which has a reputation for making well timed and occasionally controversial plays in the defense industry, has quietly placed its bets on another part of the market.')]

In [9]:
from pyspark.sql.functions import col

newsDF.groupBy("category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------+-----+
|category|count|
+--------+-----+
|   World|30000|
|Sci/Tech|30000|
|  Sports|30000|
|Business|30000|
+--------+-----+



## Building Classification Pipeline

### LogReg with CountVectorizer

Tokenizer: Tokenization 

stopwordsRemover: Remove Stop Words

countVectors: Count vectors (“document-term vectors”)

In [6]:
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, OneHotEncoder, StringIndexer, VectorAssembler, SQLTransformer

In [None]:
%%time

document_assembler = DocumentAssembler() \
      .setInputCol("description") \
      .setOutputCol("document")
    
tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")
      
normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

stemmer = Stemmer() \
      .setInputCols(["cleanTokens"]) \
      .setOutputCol("stem")

finisher = Finisher() \
      .setInputCols(["stem"]) \
      .setOutputCols(["token_features"]) \
      .setOutputAsArray(True) \
      .setCleanAnnotations(False)

countVectors = CountVectorizer(inputCol="token_features", outputCol="features", vocabSize=10000, minDF=5)

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")

nlp_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
            countVectors,
            label_stringIdx])

nlp_model = nlp_pipeline.fit(newsDF)

processed = nlp_model.transform(newsDF)

processed.count()

CPU times: user 605 ms, sys: 90.5 ms, total: 696 ms
Wall time: 1min 27s


In [None]:
processed.select('description','token_features').show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                       description|                                    token_features|
+--------------------------------------------------+--------------------------------------------------+
| Short sellers, Wall Street's dwindling band of...|[short, seller, wall, street, dwindl, band, ult...|
| Private investment firm Carlyle Group, which h...|[privat, invest, firm, carlyl, group, reput, ma...|
| Soaring crude prices plus worries about the ec...|[soar, crude, price, plu, worri, economi, outlo...|
| Authorities have halted oil export flows from ...|[author, halt, oil, export, flow, main, pipelin...|
| Tearaway world oil prices, toppling records an...|[tearawai, world, oil, price, toppl, record, st...|
| Stocks ended slightly higher on Friday but sta...|[stock, end, slightli, higher, fridai, staye, n...|
| Assets of the nation's retail money market mut...|[asset, nati

In [None]:
processed.select('token_features').take(2)

[Row(token_features=['short', 'seller', 'wall', 'street', 'dwindl', 'band', 'ultra', 'cynic', 'see', 'green']),
 Row(token_features=['privat', 'invest', 'firm', 'carlyl', 'group', 'reput', 'make', 'well', 'time', 'occasion', 'controversi', 'plai', 'defens', 'industri', 'quietli', 'place', 'bet', 'anoth', 'part', 'market'])]

In [None]:
processed.select('features').take(2)

[Row(features=SparseVector(10000, {241: 1.0, 384: 1.0, 467: 1.0, 743: 1.0, 837: 1.0, 2233: 1.0, 3690: 1.0, 6224: 1.0, 6295: 1.0})),
 Row(features=SparseVector(10000, {26: 1.0, 38: 1.0, 46: 1.0, 68: 1.0, 117: 1.0, 155: 1.0, 182: 1.0, 197: 1.0, 245: 1.0, 304: 1.0, 320: 1.0, 407: 1.0, 427: 1.0, 621: 1.0, 867: 1.0, 2362: 1.0, 2834: 1.0, 2861: 1.0, 6877: 1.0}))]

In [None]:
processed.select('description','features','label').show()

+--------------------+--------------------+-----+
|         description|            features|label|
+--------------------+--------------------+-----+
| Short sellers, W...|(10000,[241,384,4...|  0.0|
| Private investme...|(10000,[26,38,46,...|  0.0|
| Soaring crude pr...|(10000,[15,28,46,...|  0.0|
| Authorities have...|(10000,[0,32,35,4...|  0.0|
| Tearaway world o...|(10000,[1,2,11,28...|  0.0|
| Stocks ended sli...|(10000,[3,13,14,2...|  0.0|
| Assets of the na...|(10000,[0,4,10,15...|  0.0|
| Retail sales bou...|(10000,[0,1,10,15...|  0.0|
|" After earning a...|(10000,[98,99,125...|  0.0|
| Short sellers, W...|(10000,[241,384,4...|  0.0|
| Soaring crude pr...|(10000,[15,28,46,...|  0.0|
| OPEC can do noth...|(10000,[0,24,28,2...|  0.0|
| Non OPEC oil exp...|(10000,[0,21,28,3...|  0.0|
| WASHINGTON/NEW Y...|(10000,[2,4,13,14...|  0.0|
| The dollar tumbl...|(10000,[2,14,72,1...|  0.0|
|If you think you ...|(10000,[74,77,143...|  0.0|
|The purchasing po...|(10000,[46,54,167...|  0.0|


In [None]:
# set seed for reproducibility
(trainingData, testData) = processed.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 84003
Test Dataset Count: 35997


In [None]:
trainingData.printSchema()

root
 |-- category: string (nullable = true)
 |-- description: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |   

In [None]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)

lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0) \
    .select("description","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|" U.S. stocks were slightly...|Business|[0.9994155510323468,1.76045...|  0.0|       0.0|
|" U.S. blue chips declined ...|Business|[0.9973677889517072,9.95949...|  0.0|       0.0|
|Attorney General Thomas F. ...|Business|[0.9967004676713661,0.00113...|  0.0|       0.0|
|" Stocks fell sharply  on W...|Business|[0.9956763796130114,0.00121...|  0.0|       0.0|
|The airline sector, clouded...|Business|[0.994569929878784,0.002572...|  0.0|       0.0|
|" Shares of Ford Motor Co. ...|Business|[0.9937297313930828,0.00210...|  0.0|       0.0|
|" Stocks slipped on Tuesday...|Business|[0.9937243262893779,0.00210...|  0.0|       0.0|
|" Mid priced clothing retai...|Business|[0.993188337679912,0.003267...|  0.0|       0.0|
| OPEC oil

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")

evaluator.evaluate(predictions)

0.9012594146000878

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
y_true = predictions.select("label")
y_true = y_true.toPandas()

y_pred = predictions.select("prediction")
y_pred = y_pred.toPandas()

In [None]:
y_pred.prediction.value_counts()

2.0    9378
1.0    9079
0.0    9015
3.0    8525
Name: prediction, dtype: int64

In [None]:
cnf_matrix = confusion_matrix(list(y_true.label.astype(int)), list(y_pred.prediction.astype(int)))
cnf_matrix

array([[7794,  791,  105,  287],
       [ 702, 7898,   85,  298],
       [  47,   76, 8905,   88],
       [ 472,  314,  283, 7852]])

In [None]:
print(classification_report(y_true.label, y_pred.prediction))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.86      0.87      0.87      8977
         1.0       0.87      0.88      0.87      8983
         2.0       0.95      0.98      0.96      9116
         3.0       0.92      0.88      0.90      8921

    accuracy                           0.90     35997
   macro avg       0.90      0.90      0.90     35997
weighted avg       0.90      0.90      0.90     35997

0.9014362307970109


### LogReg with TFIDF

In [None]:
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol="token_features", outputCol="rawFeatures", numFeatures=10000)

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

nlp_pipeline_tf = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
            hashingTF,
            idf,
            label_stringIdx])

nlp_model_tf = nlp_pipeline_tf.fit(newsDF)

processed_tf = nlp_model_tf.transform(newsDF)

processed_tf.count()


120000

In [None]:
# set seed for reproducibility
processed_tf.select('description','features','label').show()

+--------------------+--------------------+-----+
|         description|            features|label|
+--------------------+--------------------+-----+
| Short sellers, W...|(10000,[551,621,6...|  0.0|
| Private investme...|(10000,[157,831,9...|  0.0|
| Soaring crude pr...|(10000,[793,1738,...|  0.0|
| Authorities have...|(10000,[1548,1611...|  0.0|
| Tearaway world o...|(10000,[323,585,1...|  0.0|
| Stocks ended sli...|(10000,[453,609,6...|  0.0|
| Assets of the na...|(10000,[258,444,1...|  0.0|
| Retail sales bou...|(10000,[14,585,19...|  0.0|
|" After earning a...|(10000,[114,796,1...|  0.0|
| Short sellers, W...|(10000,[551,621,6...|  0.0|
| Soaring crude pr...|(10000,[793,1738,...|  0.0|
| OPEC can do noth...|(10000,[298,616,9...|  0.0|
| Non OPEC oil exp...|(10000,[616,1063,...|  0.0|
| WASHINGTON/NEW Y...|(10000,[360,832,1...|  0.0|
| The dollar tumbl...|(10000,[419,949,1...|  0.0|
|If you think you ...|(10000,[1041,2059...|  0.0|
|The purchasing po...|(10000,[901,2198,...|  0.0|


In [None]:
(trainingData, testData) = processed_tf.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 84003
Test Dataset Count: 35997


In [None]:
lrModel_tf = lr.fit(trainingData)

predictions_tf = lrModel_tf.transform(testData)

predictions_tf.select("description","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|" U.S. stocks were slightly...|Business|[0.9980625229523017,0.00110...|  0.0|       0.0|
|Attorney General Thomas F. ...|Business|[0.9954633492899814,0.00198...|  0.0|       0.0|
|" Stocks fell sharply  on W...|Business|[0.9952042633903164,0.00168...|  0.0|       0.0|
|" U.S. regulators asked aut...|Business|[0.994793606580869,0.002037...|  0.0|       0.0|
|Former Enron Corp. executiv...|Business|[0.9934328912836528,0.00284...|  0.0|       0.0|
|" Mid priced clothing retai...|Business|[0.9934062625525654,0.00240...|  0.0|       0.0|
|" Stocks slipped on Tuesday...|Business|[0.992721031666177,0.002093...|  0.0|       0.0|
|In NEW YORK, the major stoc...|Business|[0.9922994062426923,0.00273...|  0.0|       0.0|
| Interest

In [None]:
y_true = predictions_tf.select("label")
y_true = y_true.toPandas()

y_pred = predictions_tf.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.85      0.86      0.85      8977
         1.0       0.86      0.86      0.86      8983
         2.0       0.94      0.96      0.95      9116
         3.0       0.91      0.87      0.89      8921

    accuracy                           0.89     35997
   macro avg       0.89      0.89      0.89     35997
weighted avg       0.89      0.89      0.89     35997

0.889018529321888


### Random Forest with TFIDF

In [None]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions_rf = rfModel.transform(testData)


In [None]:
predictions_rf.select("description","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|Wall Street stocks moved mo...|Business|[0.41147192022464146,0.2263...|  0.0|       0.0|
|" Stocks fell sharply  on W...|Business|[0.3948031123505214,0.21503...|  0.0|       0.0|
|Amazon.com Inc (AMZN.O: Quo...|Business|[0.3851454394676118,0.24276...|  0.0|       0.0|
|" U.S. stocks opened lower ...|Business|[0.3851324881642944,0.22016...|  0.0|       0.0|
| Stocks opened higher on Tu...|Business|[0.3843000856059863,0.21971...|  0.0|       0.0|
| Investors sent stocks shar...|   World|[0.3834645051102415,0.22326...|  3.0|       0.0|
| U.S. stocks jumped higher ...|Business|[0.3824948841679754,0.22218...|  0.0|       0.0|
|" U.S. stocks were slightly...|Business|[0.3817594575600658,0.23123...|  0.0|       0.0|
|" U.S. st

In [None]:
y_true = predictions_rf.select("label")
y_true = y_true.toPandas()

y_pred = predictions_rf.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.76      0.69      0.72      8977
         1.0       0.78      0.67      0.72      8983
         2.0       0.72      0.90      0.80      9116
         3.0       0.78      0.77      0.77      8921

    accuracy                           0.76     35997
   macro avg       0.76      0.76      0.75     35997
weighted avg       0.76      0.76      0.75     35997

0.7570075284051448


## LogReg with Spark NLP Glove Word Embeddings

In [None]:
document_assembler = DocumentAssembler() \
      .setInputCol("description") \
      .setOutputCol("document")
    
tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")
    
normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

glove_embeddings = WordEmbeddingsModel().pretrained() \
      .setInputCols(["document",'cleanTokens'])\
      .setOutputCol("embeddings")\
      .setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "embeddings"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")
    
embeddings_finisher = EmbeddingsFinisher() \
      .setInputCols(["sentence_embeddings"]) \
      .setOutputCols(["finished_sentence_embeddings"]) \
      .setOutputAsVector(True)\
      .setCleanAnnotations(False)

explodeVectors = SQLTransformer(statement=
      "SELECT EXPLODE(finished_sentence_embeddings) AS features, * FROM __THIS__")

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")


nlp_pipeline_w2v = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            glove_embeddings,
            embeddingsSentence,
            embeddings_finisher,
            explodeVectors,
            label_stringIdx])

nlp_model_w2v = nlp_pipeline_w2v.fit(newsDF)

processed_w2v = nlp_model_w2v.transform(newsDF)

processed_w2v.count()


glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


120000

In [None]:
processed_w2v.columns

['features',
 'category',
 'description',
 'document',
 'token',
 'normalized',
 'cleanTokens',
 'embeddings',
 'sentence_embeddings',
 'finished_sentence_embeddings',
 'label']

In [None]:
processed_w2v.show(5)

+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+-----+
|            features|category|         description|            document|               token|          normalized|         cleanTokens|          embeddings| sentence_embeddings|finished_sentence_embeddings|label|
+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+-----+
|[-0.1556767076253...|Business| Short sellers, W...|[{document, 0, 84...|[{token, 1, 5, Sh...|[{token, 1, 5, Sh...|[{token, 1, 5, Sh...|[{word_embeddings...|[{sentence_embedd...|        [[-0.155676707625...|  0.0|
|[-0.0144653050228...|Business| Private investme...|[{document, 0, 20...|[{token, 1, 7, Pr...|[{token, 1, 7, Pr...|[{token, 1, 7, Pr...|[{word_e

In [None]:
processed_w2v.select('finished_sentence_embeddings').take(1)

[Row(finished_sentence_embeddings=[DenseVector([-0.1557, 0.196, 0.1099, -0.3089, 0.16, 0.1672, -0.4649, -0.1101, -0.053, -0.1551, 0.0327, 0.0772, 0.1494, -0.1865, 0.1155, -0.0597, 0.0234, -0.0451, 0.2361, -0.0089, 0.3358, 0.0444, 0.0088, -0.1453, 0.2289, 0.0914, -0.1665, -0.3726, 0.1892, 0.121, 0.1993, -0.0239, -0.1346, 0.1159, 0.2086, 0.1285, 0.068, 0.1372, 0.3153, -0.1934, 0.0257, -0.226, -0.0984, 0.1139, 0.1413, -0.3743, 0.072, 0.1403, 0.251, -0.3106, 0.1709, -0.0697, -0.0554, 0.5123, -0.1873, -1.7784, 0.0295, 0.1014, 0.9268, 0.2129, -0.1354, 0.5739, -0.0679, 0.461, 0.4216, 0.0225, 0.4456, -0.2462, 0.1411, -0.3258, 0.0025, 0.0114, -0.3895, -0.1106, -0.261, 0.0147, 0.0781, 0.1268, -0.2042, -0.2278, 0.5096, 0.1539, -0.3515, -0.0102, -0.7003, -0.3872, -0.1668, -0.2405, -0.0766, 0.1396, -0.0592, -0.1568, -0.1606, -0.1371, -0.684, -0.2549, -0.1541, 0.1536, 0.2715, 0.3342])])]

In [None]:
# IF SQLTransformer IS NOT USED INSIDE THE PIPELINE, WE CAN EXPLODE OUTSIDE
from pyspark.sql.functions import explode

# processed_w2v= processed_w2v.withColumn("features", explode(processed_w2v.finished_sentence_embeddings))

In [None]:
processed_w2v.select("features").take(1)

[Row(features=DenseVector([-0.1557, 0.196, 0.1099, -0.3089, 0.16, 0.1672, -0.4649, -0.1101, -0.053, -0.1551, 0.0327, 0.0772, 0.1494, -0.1865, 0.1155, -0.0597, 0.0234, -0.0451, 0.2361, -0.0089, 0.3358, 0.0444, 0.0088, -0.1453, 0.2289, 0.0914, -0.1665, -0.3726, 0.1892, 0.121, 0.1993, -0.0239, -0.1346, 0.1159, 0.2086, 0.1285, 0.068, 0.1372, 0.3153, -0.1934, 0.0257, -0.226, -0.0984, 0.1139, 0.1413, -0.3743, 0.072, 0.1403, 0.251, -0.3106, 0.1709, -0.0697, -0.0554, 0.5123, -0.1873, -1.7784, 0.0295, 0.1014, 0.9268, 0.2129, -0.1354, 0.5739, -0.0679, 0.461, 0.4216, 0.0225, 0.4456, -0.2462, 0.1411, -0.3258, 0.0025, 0.0114, -0.3895, -0.1106, -0.261, 0.0147, 0.0781, 0.1268, -0.2042, -0.2278, 0.5096, 0.1539, -0.3515, -0.0102, -0.7003, -0.3872, -0.1668, -0.2405, -0.0766, 0.1396, -0.0592, -0.1568, -0.1606, -0.1371, -0.684, -0.2549, -0.1541, 0.1536, 0.2715, 0.3342]))]

In [None]:
processed_w2v.select("features").take(1)

[Row(features=DenseVector([-0.1557, 0.196, 0.1099, -0.3089, 0.16, 0.1672, -0.4649, -0.1101, -0.053, -0.1551, 0.0327, 0.0772, 0.1494, -0.1865, 0.1155, -0.0597, 0.0234, -0.0451, 0.2361, -0.0089, 0.3358, 0.0444, 0.0088, -0.1453, 0.2289, 0.0914, -0.1665, -0.3726, 0.1892, 0.121, 0.1993, -0.0239, -0.1346, 0.1159, 0.2086, 0.1285, 0.068, 0.1372, 0.3153, -0.1934, 0.0257, -0.226, -0.0984, 0.1139, 0.1413, -0.3743, 0.072, 0.1403, 0.251, -0.3106, 0.1709, -0.0697, -0.0554, 0.5123, -0.1873, -1.7784, 0.0295, 0.1014, 0.9268, 0.2129, -0.1354, 0.5739, -0.0679, 0.461, 0.4216, 0.0225, 0.4456, -0.2462, 0.1411, -0.3258, 0.0025, 0.0114, -0.3895, -0.1106, -0.261, 0.0147, 0.0781, 0.1268, -0.2042, -0.2278, 0.5096, 0.1539, -0.3515, -0.0102, -0.7003, -0.3872, -0.1668, -0.2405, -0.0766, 0.1396, -0.0592, -0.1568, -0.1606, -0.1371, -0.684, -0.2549, -0.1541, 0.1536, 0.2715, 0.3342]))]

In [None]:
processed_w2v.select('description','features','label').show()


+--------------------+--------------------+-----+
|         description|            features|label|
+--------------------+--------------------+-----+
| Short sellers, W...|[-0.1556767076253...|  0.0|
| Private investme...|[-0.0144653050228...|  0.0|
| Soaring crude pr...|[0.10348732769489...|  0.0|
| Authorities have...|[-0.0355810523033...|  0.0|
| Tearaway world o...|[0.00647281948477...|  0.0|
| Stocks ended sli...|[0.20069395005702...|  0.0|
| Assets of the na...|[0.38012433052062...|  0.0|
| Retail sales bou...|[0.20352847874164...|  0.0|
|" After earning a...|[0.13536226749420...|  0.0|
| Short sellers, W...|[-0.1556767076253...|  0.0|
| Soaring crude pr...|[0.10348732769489...|  0.0|
| OPEC can do noth...|[0.20307321846485...|  0.0|
| Non OPEC oil exp...|[0.09010648727416...|  0.0|
| WASHINGTON/NEW Y...|[0.10887209326028...|  0.0|
| The dollar tumbl...|[0.05723679438233...|  0.0|
|If you think you ...|[0.11463439464569...|  0.0|
|The purchasing po...|[0.05890964344143...|  0.0|


In [None]:
# set seed for reproducibility
(trainingData, testData) = processed_w2v.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 84003
Test Dataset Count: 35997


In [None]:
from pyspark.sql.functions import udf

@udf("long")
def num_nonzeros(v):
    return v.numNonzeros()

testData = testData.where(num_nonzeros("features") != 0)

In [None]:
lrModel_w2v = lr.fit(trainingData)

In [None]:
predictions_w2v = lrModel_w2v.transform(testData)

predictions_w2v.select("description","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|THE stock market is rising,...|Business|[0.9891063193483841,0.00550...|  0.0|       0.0|
|Wachovia Corp. said Friday ...|Business|[0.9885060326689832,0.00899...|  0.0|       0.0|
| Stocks fell on Monday, wit...|Business|[0.9853643008568156,0.00998...|  0.0|       0.0|
|Japan #39;s Nikkei 225 Stoc...|Business|[0.9836332401598876,0.00733...|  0.0|       0.0|
| ChevronTexaco Corp., the N...|Business|[0.9819751792508788,0.01274...|  0.0|       0.0|
| US investment banking gian...|Business|[0.9817478639668558,0.01379...|  0.0|       0.0|
|  Shares of rival retailers...|Business|[0.9812756635225758,0.01221...|  0.0|       0.0|
| Goldman Sachs Group Inc. o...|Business|[0.9812715031901479,0.01081...|  0.0|       0.0|
| Tokyo st

In [None]:
y_true = predictions_w2v.select("label")
y_true = y_true.toPandas()

y_pred = predictions_w2v.select("prediction")
y_pred = y_pred.toPandas()

print(classification_report(y_true.label, y_pred.prediction))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.83      0.82      0.82      9051
         1.0       0.82      0.81      0.82      9057
         2.0       0.93      0.96      0.94      8972
         3.0       0.88      0.87      0.87      8917

    accuracy                           0.86     35997
   macro avg       0.86      0.86      0.86     35997
weighted avg       0.86      0.86      0.86     35997

0.8642942467427841


In [None]:
processed_w2v.select('description','cleanTokens.result').show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                       description|                                            result|
+--------------------------------------------------+--------------------------------------------------+
| Short sellers, Wall Street's dwindling band of...|[Short, sellers, Wall, Streets, dwindling, band...|
| Private investment firm Carlyle Group, which h...|[Private, investment, firm, Carlyle, Group, rep...|
| Soaring crude prices plus worries about the ec...|[Soaring, crude, prices, plus, worries, economy...|
| Authorities have halted oil export flows from ...|[Authorities, halted, oil, export, flows, main,...|
| Tearaway world oil prices, toppling records an...|[Tearaway, world, oil, prices, toppling, record...|
| Stocks ended slightly higher on Friday but sta...|[Stocks, ended, slightly, higher, Friday, staye...|
| Assets of the nation's retail money market mut...|[Assets, nat

## LogReg with Spark NLP Bert Embeddings

In [None]:
document_assembler = DocumentAssembler() \
      .setInputCol("description") \
      .setOutputCol("document")
    
tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")
    
normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

bert_embeddings = BertEmbeddings.pretrained('bert_base_cased', 'en') \
      .setInputCols(["document",'cleanTokens'])\
      .setOutputCol("bert")\
      .setCaseSensitive(False)\

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "bert"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")
    
embeddings_finisher = EmbeddingsFinisher() \
      .setInputCols(["sentence_embeddings"]) \
      .setOutputCols(["finished_sentence_embeddings"]) \
      .setOutputAsVector(True)\
      .setCleanAnnotations(False)

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")


nlp_pipeline_bert = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            bert_embeddings,
            embeddingsSentence,
            embeddings_finisher,
            label_stringIdx])

nlp_model_bert = nlp_pipeline_bert.fit(newsDF)

processed_bert = nlp_model_bert.transform(newsDF)

processed_bert.count()


bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [None]:
from pyspark.sql.functions import explode

processed_bert= processed_bert.withColumn("features", explode(processed_bert.finished_sentence_embeddings))

processed_bert.select('description','features','label').show()


+--------------------+--------------------+-----+
|         description|            features|label|
+--------------------+--------------------+-----+
|Srinagar, Nov 6 (...|[-0.0763546451926...|  2.0|
|France's presiden...|[0.01601043716073...|  2.0|
|President  Bush s...|[0.11258428543806...|  2.0|
|Established Shiit...|[0.09958435595035...|  2.0|
|While Democrats p...|[-0.3666543066501...|  2.0|
|Rural and deprive...|[0.08482994884252...|  1.0|
| Terrell Owens is...|[-0.1571628898382...|  3.0|
|" Gov. Ed Rendell...|[-0.0437468327581...|  3.0|
| A month after a ...|[-0.1684152632951...|  3.0|
| No Diana Taurasi...|[-0.0047841807827...|  3.0|
| An upbeat Presid...|[0.15349867939949...|  2.0|
| Gay and lesbian ...|[0.17594610154628...|  2.0|
| Twenty three peo...|[-0.0070635229349...|  2.0|
|  Connecticut Att...|[0.13604542613029...|  0.0|
|A new report on g...|[0.07444920390844...|  1.0|
|That Michael Siew...|[0.23243072628974...|  1.0|
|Vice chairman of ...|[-0.2215369194746...|  1.0|


In [None]:
# set seed for reproducibility
(trainingData, testData) = processed_bert.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 84045
Test Dataset Count: 35955


In [None]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

lrModel = lr.fit(trainingData)


In [None]:
from pyspark.sql.functions import udf

@udf("long")
def num_nonzeros(v):
    return v.numNonzeros()

testData = testData.where(num_nonzeros("features") != 0)

In [None]:
predictions = lrModel.transform(testData)

predictions.select("description","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|The Securities and Exchange...|Business|[0.9967407593636138,0.00300...|  0.0|       0.0|
|Stocks opened higher today,...|Business|[0.9928207319563264,0.00469...|  0.0|       0.0|
| Retailer Payless ShoeSourc...|Business|[0.9926546087578139,0.00674...|  0.0|       0.0|
|The insurance brokerage rep...|Business|[0.9917833732987117,0.00754...|  0.0|       0.0|
|Shell outlined a profit str...|Business|[0.9916303454148256,0.00808...|  0.0|       0.0|
| Countrywide Financial Corp...|Business|[0.9916172364634749,0.00514...|  0.0|       0.0|
|PITTSBURGH Mellon Financial...|Business|[0.9915578428166462,0.00799...|  0.0|       0.0|
|  Grocery wholesaler Flemin...|Business|[0.9915445608575104,0.00766...|  0.0|       0.0|
|Mark Head

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd

df = predictions.select('description','category','label','prediction').toPandas()

print(classification_report(df.label, df.prediction))
print(accuracy_score(df.label, df.prediction))

              precision    recall  f1-score   support

         0.0       0.82      0.79      0.80      8911
         1.0       0.81      0.80      0.81      8972
         2.0       0.84      0.86      0.85      9008
         3.0       0.90      0.94      0.92      9063

    accuracy                           0.85     35954
   macro avg       0.84      0.85      0.84     35954
weighted avg       0.84      0.85      0.85     35954

0.8459142237303221


## LogReg with ELMO Embeddings

In [None]:
%%time

document_assembler = DocumentAssembler() \
      .setInputCol("description") \
      .setOutputCol("document")
    
tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")
    
normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

elmo_embeddings = ElmoEmbeddings.pretrained()\
      .setPoolingLayer("word_emb")\
      .setInputCols(["document",'cleanTokens'])\
      .setOutputCol("elmo")

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "elmo"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")
    
embeddings_finisher = EmbeddingsFinisher() \
      .setInputCols(["sentence_embeddings"]) \
      .setOutputCols(["finished_sentence_embeddings"]) \
      .setOutputAsVector(True)\
      .setCleanAnnotations(False)

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")


nlp_pipeline_elmo = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            elmo_embeddings,
            embeddingsSentence,
            embeddings_finisher,
            label_stringIdx])

nlp_model_elmo = nlp_pipeline_elmo.fit(newsDF)

processed_elmo = nlp_model_elmo.transform(newsDF)

processed_elmo.count()


elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]
CPU times: user 406 ms, sys: 119 ms, total: 525 ms
Wall time: 24 s


In [6]:
(trainingData, testData) = newsDF.randomSplit([0.7, 0.3], seed = 100)

In [None]:
processed_trainingData = nlp_model_elmo.transform(trainingData)

processed_trainingData.count()

84003

In [None]:
processed_testData = nlp_model_elmo.transform(testData)

processed_testData.count()

35997

In [None]:
processed_trainingData.columns

['category',
 'description',
 'document',
 'token',
 'normalized',
 'cleanTokens',
 'elmo',
 'sentence_embeddings',
 'finished_sentence_embeddings',
 'label']

In [None]:
from pyspark.sql.functions import explode

processed_testData= processed_testData.withColumn("features", explode(processed_testData.finished_sentence_embeddings))

processed_trainingData= processed_trainingData.withColumn("features", explode(processed_trainingData.finished_sentence_embeddings))


In [None]:
from pyspark.sql.functions import udf

@udf("long")
def num_nonzeros(v):
    return v.numNonzeros()

processed_testData = processed_testData.where(num_nonzeros("features") != 0)

In [None]:
%%time

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

lrModel = lr.fit(processed_trainingData)


CPU times: user 36.3 s, sys: 4.92 s, total: 41.2 s
Wall time: 1h 36min 4s


In [None]:
processed_trainingData.columns

['category',
 'description',
 'document',
 'token',
 'normalized',
 'cleanTokens',
 'elmo',
 'sentence_embeddings',
 'finished_sentence_embeddings',
 'label',
 'features']

In [None]:
predictions = lrModel.transform(processed_testData)

predictions.select("description","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|" Exxon Mobil Corp. &lt;A H...|Business|[0.9938693762832211,0.00383...|  0.0|       0.0|
|" Exxon Mobil Corp. &lt;A H...|Business|[0.9938693762832211,0.00383...|  0.0|       0.0|
|The No. 2 U.S. oil company ...|Business|[0.9932208044764538,0.00497...|  0.0|       0.0|
| Falling oil prices and str...|   World|[0.9928751967512169,0.00663...|  3.0|       0.0|
|" Stocks fell sharply  on W...|Business|[0.9922198146618901,0.00458...|  0.0|       0.0|
| Kmart Holding Corporation,...|Business|[0.9912332080126981,0.00698...|  0.0|       0.0|
|Wachovia Corp. said Friday ...|Business|[0.9912287561757839,0.00696...|  0.0|       0.0|
|Lennar Corp., the No. 3 US ...|Business|[0.9907540917815972,0.00788...|  0.0|       0.0|
|US stocks

In [None]:
df = predictions.select('description','category','label','prediction').toPandas()

In [None]:
df.shape

(35997, 4)

In [None]:
df.head()

Unnamed: 0,description,category,label,prediction
0,10/22/04 The board of the Walt Disney Com...,Business,0.0,0.0
1,A closely watched barometer of future econom...,Business,0.0,0.0
2,A consortium led by Sony Corp. of America an...,Business,0.0,0.0
3,A federal appeals court agreed Tuesday to te...,Business,0.0,0.0
4,A federal bankruptcy judge ruled against Uni...,Business,0.0,0.0


In [None]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(df.label, df.prediction))
print(accuracy_score(df.label, df.prediction))

              precision    recall  f1-score   support

         0.0       0.83      0.83      0.83      8977
         1.0       0.84      0.82      0.83      8983
         2.0       0.94      0.96      0.95      9116
         3.0       0.87      0.87      0.87      8921

    accuracy                           0.87     35997
   macro avg       0.87      0.87      0.87     35997
weighted avg       0.87      0.87      0.87     35997

0.8701002861349557


## LogReg with Universal Sentence Encoder

In [7]:
useEmbeddings = UniversalSentenceEncoder.pretrained()\
      .setInputCols("document")\
      .setOutputCol("use_embeddings")

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [8]:
document_assembler = DocumentAssembler() \
      .setInputCol("description") \
      .setOutputCol("document")

loaded_useEmbeddings = UniversalSentenceEncoder.load('/root/cache_pretrained/tfhub_use_en_2.4.0_2.4_1587136330099')\
      .setInputCols("document")\
      .setOutputCol("use_embeddings")

embeddings_finisher = EmbeddingsFinisher() \
      .setInputCols(["use_embeddings"]) \
      .setOutputCols(["finished_use_embeddings"]) \
      .setOutputAsVector(True)\
      .setCleanAnnotations(False)

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")

use_pipeline = Pipeline(
      stages=[
        document_assembler,
        loaded_useEmbeddings,
        embeddings_finisher,
        label_stringIdx]
      )

use_df = use_pipeline.fit(newsDF).transform(newsDF)

In [9]:
use_df.select('finished_use_embeddings').show(3)

+-----------------------+
|finished_use_embeddings|
+-----------------------+
|   [[0.0441501587629...|
|   [[0.0844451710581...|
|   [[0.0426647365093...|
+-----------------------+
only showing top 3 rows



In [10]:
from pyspark.sql.functions import explode

use_df= use_df.withColumn("features", explode(use_df.finished_use_embeddings))

In [11]:
use_df.show(2)

+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
|category|         description|            document|      use_embeddings|finished_use_embeddings|label|            features|
+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
|Business| Short sellers, W...|[{document, 0, 84...|[{sentence_embedd...|   [[0.0441501587629...|  0.0|[0.04415015876293...|
|Business| Private investme...|[{document, 0, 20...|[{sentence_embedd...|   [[0.0844451710581...|  0.0|[0.08444517105817...|
+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
only showing top 2 rows



In [12]:
# set seed for reproducibility
(trainingData, testData) = use_df.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 84003
Test Dataset Count: 35997


In [13]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0) \
    .select("description","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|Diversified manufacturer an...|Business|[0.9865484610819308,0.00789...|  0.0|       0.0|
|" U.S. stocks were slightly...|Business|[0.9850439113216918,0.00675...|  0.0|       0.0|
|Kroger Co., the largest US ...|Business|[0.9831770850020914,0.00924...|  0.0|       0.0|
| Kroger Co. , the largest U...|Business|[0.9831481473594404,0.00940...|  0.0|       0.0|
| Safeway Inc. , the third l...|Business|[0.982848913095213,0.009522...|  0.0|       0.0|
|" U.S. stocks opened lower ...|Business|[0.9825013211776168,0.00568...|  0.0|       0.0|
|Fannie Mae, the largest US ...|Business|[0.9819475216368533,0.00984...|  0.0|       0.0|
|Prudential Financial Inc., ...|Business|[0.9816735479911963,0.01117...|  0.0|       0.0|
|Costco Wh

In [14]:
df = predictions.select('description','category','label','prediction').toPandas()
#df['result'] = df['result'].apply(lambda x: x[0])

In [15]:
df.head()

Unnamed: 0,description,category,label,prediction
0,10/22/04 The board of the Walt Disney Com...,Business,0.0,0.0
1,A closely watched barometer of future econom...,Business,0.0,0.0
2,A consortium led by Sony Corp. of America an...,Business,0.0,0.0
3,A federal appeals court agreed Tuesday to te...,Business,0.0,0.0
4,A federal bankruptcy judge ruled against Uni...,Business,0.0,0.0


In [16]:
print(classification_report(df.label, df.prediction))
print(accuracy_score(df.label, df.prediction))

              precision    recall  f1-score   support

         0.0       0.84      0.84      0.84      8977
         1.0       0.85      0.85      0.85      8983
         2.0       0.96      0.97      0.96      9116
         3.0       0.90      0.88      0.89      8921

    accuracy                           0.88     35997
   macro avg       0.88      0.88      0.88     35997
weighted avg       0.88      0.88      0.88     35997

0.8847126149401339


### train on entire dataset

In [17]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

lrModel = lr.fit(use_df)

In [18]:
test_df = spark.read.csv("/content/news_category_test.csv",header=True)

In [19]:
test_df = use_pipeline.fit(test_df).transform(test_df)

In [20]:
test_df= test_df.withColumn("features", explode(test_df.finished_use_embeddings))

In [21]:
test_df.show(2)

+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
|category|         description|            document|      use_embeddings|finished_use_embeddings|label|            features|
+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
|Business|Unions representi...|[{document, 0, 12...|[{sentence_embedd...|   [[0.0129975387826...|  0.0|[0.01299753878265...|
|Sci/Tech| TORONTO, Canada ...|[{document, 0, 22...|[{sentence_embedd...|   [[0.0019999046344...|  1.0|[0.00199990463443...|
+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
only showing top 2 rows



In [22]:
from pyspark.sql.functions import col

test_df.groupBy("category","label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------+-----+-----+
|category|label|count|
+--------+-----+-----+
|Sci/Tech|  1.0| 1900|
|  Sports|  2.0| 1900|
|   World|  3.0| 1900|
|Business|  0.0| 1900|
+--------+-----+-----+



In [23]:
predictions = lrModel.transform(test_df)

In [24]:
df = predictions.select('description','category','label','prediction').toPandas()

In [25]:
df['label'] = df.category.replace({'World':3.0,
                    'Sports':2.0,
                    'Business':0.0,
                    'Sci/Tech':1.0})

In [26]:
df.head()

Unnamed: 0,description,category,label,prediction
0,Unions representing workers at Turner Newall...,Business,0.0,0.0
1,"TORONTO, Canada A second team of rocketeer...",Sci/Tech,1.0,1.0
2,A company founded by a chemistry researcher a...,Sci/Tech,1.0,1.0
3,It's barely dawn when Mike Fitzpatrick starts...,Sci/Tech,1.0,1.0
4,Southern California's smog fighting agency we...,Sci/Tech,1.0,0.0


In [27]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(df.label, df.prediction))
print(accuracy_score(df.label, df.prediction))

              precision    recall  f1-score   support

         0.0       0.83      0.83      0.83      1900
         1.0       0.84      0.85      0.85      1900
         2.0       0.95      0.97      0.96      1900
         3.0       0.90      0.87      0.89      1900

    accuracy                           0.88      7600
   macro avg       0.88      0.88      0.88      7600
weighted avg       0.88      0.88      0.88      7600

0.8801315789473684


# ClassifierDL

In [4]:
# actual content is inside description column
document = DocumentAssembler()\
    .setInputCol("description")\
    .setOutputCol("document")

use = UniversalSentenceEncoder.load('/root/cache_pretrained/tfhub_use_en_2.4.0_2.4_1587136330099')\
      .setInputCols("document")\
      .setOutputCol("sentence_embeddings")

# the classes/labels/categories are in category column
classsifierdl = ClassifierDLApproach()\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("class")\
    .setLabelColumn("category")\
    .setMaxEpochs(5)\
    .setEnableOutputLogs(True)

pipeline = Pipeline(
    stages = [
        document,
        use,
        classsifierdl
    ])

In [10]:
# set seed for reproducibility
(trainingData, testData) = newsDF.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 84003
Test Dataset Count: 35997


In [8]:
pipelineModel = pipeline.fit(trainingData)

In [9]:
pipelineModel.stages[2].write().overwrite().save('classifierDL_model_5e')

In [10]:
from sklearn.metrics import classification_report, accuracy_score

df = pipelineModel.transform(testData).select('category','description',"class.result").toPandas()

df['result'] = df['result'].apply(lambda x: x[0])

print(classification_report(df.category, df.result))
print(accuracy_score(df.category, df.result))

              precision    recall  f1-score   support

    Business       0.85      0.85      0.85      8977
    Sci/Tech       0.87      0.86      0.86      8983
      Sports       0.95      0.98      0.97      9116
       World       0.90      0.89      0.90      8921

    accuracy                           0.90     35997
   macro avg       0.90      0.90      0.90     35997
weighted avg       0.90      0.90      0.90     35997

0.8956857515904103


## Loading the trained classifier from disk

In [11]:
import sparknlp
sparknlp.__path__

['/usr/local/lib/python3.7/dist-packages/sparknlp']

In [4]:
trainDataset = spark.read \
      .option("header", True) \
      .csv("news_category_train.csv")

In [13]:
trainDataset.count()

120000

In [14]:
trainingData.count()

84003

In [15]:
document = DocumentAssembler()\
    .setInputCol("description")\
    .setOutputCol("document")

use = UniversalSentenceEncoder.load('/root/cache_pretrained/tfhub_use_en_2.4.0_2.4_1587136330099')\
      .setInputCols("document")\
      .setOutputCol("sentence_embeddings")

classsifierdlmodel = ClassifierDLModel.load('classifierDL_model_5e')

pipeline = Pipeline(
    stages = [
        document,
        use,
        classsifierdlmodel
    ])

In [16]:
pipeline.fit(testData.limit(10)).transform(testData.limit(10)).select('category','description',"class.result").show(10, truncate=50)

+--------+--------------------------------------------------+----------+
|category|                                       description|    result|
+--------+--------------------------------------------------+----------+
|Business|  10/22/04    The board of the Walt Disney Comp...|[Business]|
|Business|  A closely watched barometer of future economi...|[Business]|
|Business|  A consortium led by Sony Corp. of America ann...|[Business]|
|Business|  A federal appeals court agreed Tuesday to tem...|[Business]|
|Business|  A federal bankruptcy judge ruled against Unit...|[Business]|
|Business|  A federal judge has scheduled jury selection ...|[Business]|
|Business|  A federal judge today ordered Martha Stewart ...|[Business]|
|Business|  A new  $50 bill with touches of red, blue and...|[Business]|
|Business|  A roaring jobs market in Canada and disappoin...|[Business]|
|Business|  A top networking gear producer plans to offer...|[Sci/Tech]|
+--------+-----------------------------------------

In [17]:
lm = LightPipeline(pipeline.fit(spark.createDataFrame([[""]]).toDF("text")))
lm.annotate('In its first two years, the UK dedicated card companies have surge')

{'class': ['Sci/Tech'],
 'document': ['In its first two years, the UK dedicated card companies have surge'],
 'sentence_embeddings': ['In its first two years, the UK dedicated card companies have surge']}

In [18]:
text='''
Fearing the fate of Italy, the centre-right government has threatened to be merciless with those who flout tough restrictions. As of Wednesday it will also include all shops being closed across Greece, with the exception of supermarkets. Banks, pharmacies, pet-stores, mobile phone stores, opticians, bakers, mini-markets, couriers and food delivery outlets are among the few that will also be allowed to remain open.
'''

In [19]:
lm = LightPipeline(pipeline.fit(spark.createDataFrame([[""]]).toDF("text")))

lm.annotate(text)

{'class': ['World'],
 'document': ['\nFearing the fate of Italy, the centre-right government has threatened to be merciless with those who flout tough restrictions. As of Wednesday it will also include all shops being closed across Greece, with the exception of supermarkets. Banks, pharmacies, pet-stores, mobile phone stores, opticians, bakers, mini-markets, couriers and food delivery outlets are among the few that will also be allowed to remain open.\n'],
 'sentence_embeddings': ['\nFearing the fate of Italy, the centre-right government has threatened to be merciless with those who flout tough restrictions. As of Wednesday it will also include all shops being closed across Greece, with the exception of supermarkets. Banks, pharmacies, pet-stores, mobile phone stores, opticians, bakers, mini-markets, couriers and food delivery outlets are among the few that will also be allowed to remain open.\n']}

# Classifier DL + Glove + Basic text processing

In [3]:
document = DocumentAssembler()\
    .setInputCol("description")\
    .setOutputCol("document")

tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
      .setInputCols(["token"]) \
      .setOutputCol("lemma")
      
glove_embeddings = WordEmbeddingsModel().pretrained() \
      .setInputCols(["document",'lemma'])\
      .setOutputCol("embeddings")\
      .setCaseSensitive(False)

lemma_pipeline = Pipeline(
    stages=[document, 
            tokenizer,
            lemma,
            glove_embeddings])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [11]:
lemma_pipeline.fit(trainingData.limit(1000)).transform(trainingData.limit(1000)).show(truncate=30)

+--------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+
|category|                   description|                      document|                         token|                         lemma|                    embeddings|
+--------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+
|Business|    The credit rating of th...|[{document, 0, 164,     The...|[{token, 4, 6, The, {senten...|[{token, 4, 6, The, {senten...|[{word_embeddings, 4, 6, Th...|
|Business|   Jeans maker Levi Strauss...|[{document, 0, 186,    Jean...|[{token, 3, 7, Jeans, {sent...|[{token, 3, 7, Jeans, {sent...|[{word_embeddings, 3, 7, Je...|
|Business|  ''The Oprah Winfrey Show ...|[{document, 0, 131,   ''The...|[{token, 2, 3, '', {sentenc...|[{token, 2, 3, '', {sentenc...|[{word_embeddings, 2, 3, ''...|
|Bus

In [12]:
document_assembler = DocumentAssembler() \
      .setInputCol("description") \
      .setOutputCol("document")
    
tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")
    
normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
      .setInputCols(["cleanTokens"]) \
      .setOutputCol("lemma")

glove_embeddings = WordEmbeddingsModel().pretrained() \
      .setInputCols(["document",'lemma'])\
      .setOutputCol("embeddings")\
      .setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "embeddings"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")

classsifierdl = ClassifierDLApproach()\
      .setInputCols(["sentence_embeddings"])\
      .setOutputCol("class")\
      .setLabelColumn("category")\
      .setMaxEpochs(5)\
      .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemma, 
            glove_embeddings,
            embeddingsSentence,
            classsifierdl])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [13]:
!rm -rf classifier_dl_pipeline_glove

In [14]:
clf_pipeline.save('classifier_dl_pipeline_glove')

In [15]:
clf_pipelineModel = clf_pipeline.fit(trainingData)

In [26]:
df = clf_pipelineModel.transform(testData).select('category','description',"class.result").toPandas()

df.head()

Unnamed: 0,category,description,result
0,Business,10/22/04 The board of the Walt Disney Com...,[Business]
1,Business,A closely watched barometer of future econom...,[Business]
2,Business,A consortium led by Sony Corp. of America an...,[Business]
3,Business,A federal appeals court agreed Tuesday to te...,[Business]
4,Business,A federal bankruptcy judge ruled against Uni...,[Business]


In [27]:
from sklearn.metrics import classification_report, accuracy_score

df['result'] = df['result'].apply(lambda x: x[0])

print(classification_report(df.category, df.result))

print(accuracy_score(df.category, df.result))

              precision    recall  f1-score   support

    Business       0.84      0.85      0.84      8977
    Sci/Tech       0.85      0.85      0.85      8983
      Sports       0.94      0.97      0.96      9116
       World       0.91      0.87      0.89      8921

    accuracy                           0.89     35997
   macro avg       0.89      0.89      0.89     35997
weighted avg       0.89      0.89      0.89     35997

0.886129399672195


In [None]:
!cd data && ls -l

In [19]:
import pandas as pd

In [20]:
news_df = newsDF.toPandas()

In [21]:
news_df.head()

Unnamed: 0,category,description
0,Business,"Short sellers, Wall Street's dwindling band o..."
1,Business,"Private investment firm Carlyle Group, which ..."
2,Business,Soaring crude prices plus worries about the e...
3,Business,Authorities have halted oil export flows from...
4,Business,"Tearaway world oil prices, toppling records a..."


In [22]:
news_df.to_csv('news_dataset.csv', index=False)

In [23]:
document_assembler = DocumentAssembler() \
      .setInputCol("description") \
      .setOutputCol("document")
      
tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")
      
normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
      .setInputCols(["cleanTokens"]) \
      .setOutputCol("lemma")

glove_embeddings = WordEmbeddingsModel().pretrained() \
      .setInputCols(["document",'lemma'])\
      .setOutputCol("embeddings")\
      .setCaseSensitive(False)

txt_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemma, 
            glove_embeddings,
            embeddingsSentence])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [24]:
txt_pipelineModel = txt_pipeline.fit(testData.limit(1))

In [25]:
txt_pipelineModel.save('text_prep_pipeline_glove')