![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)



[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/5.1_Text_classification_examples_in_SparkML_SparkNLP.ipynb)

# Text Classification with Spark NLP

In [None]:
import os
import sys

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

import pandas as pd


In [None]:

import sparknlp
# Session object returned by Spark NLP's start helper; later cells use it as `spark`.
spark = sparknlp.start()

# Report the library versions this run executes against.
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

In [None]:
! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_train.csv
! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_test.csv

In [None]:
# newsDF = spark.read.parquet("data/news_category.parquet") >> if it is a parquet

# Read the training CSV, using its first line as the column header.
newsDF = (
    spark.read
    .option("header", True)
    .csv("news_category_train.csv")
)

newsDF.show(truncate=50)

In [None]:
# Preview the loaded rows, truncating each cell to 50 characters.
newsDF.show(truncate=50)

+--------+--------------------------------------------------+
|category|                                       description|
+--------+--------------------------------------------------+
|   World|Srinagar, Nov 6 (UNI) Two militants and a Borde...|
|   World|France's president orders his forces to destroy...|
|   World|President  Bush says he will reach out to allie...|
|   World|Established Shiite parties and powerful upstart...|
|   World|While Democrats placed their emphasis on the st...|
|Sci/Tech|Rural and deprived communities worldwide are be...|
|  Sports| Terrell Owens isn't the only once dancing in P...|
|  Sports|" Gov. Ed Rendell gets anxious watching contest...|
|  Sports| A month after a hotel worker accused NBA star ...|
|  Sports| No Diana Taurasi, no Alana Beard. Nicole Ohlde...|
|   World| An upbeat President Bush set forth an aggressi...|
|   World| Gay and lesbian advocates have been doing some...|
|   World| Twenty three people died and three others were...|
|Busines

In [None]:
# Fetch the first two rows as Row objects to see the untruncated text.
newsDF.take(2)

[Row(category='Business', description=" Short sellers, Wall Street's dwindling band of ultra cynics, are seeing green again."),
 Row(category='Business', description=' Private investment firm Carlyle Group, which has a reputation for making well timed and occasionally controversial plays in the defense industry, has quietly placed its bets on another part of the market.')]

In [None]:
from pyspark.sql.functions import col

# Number of rows per category, largest count first.
(
    newsDF
    .groupBy("category")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+--------+-----+
|category|count|
+--------+-----+
|Business|30000|
|  Sports|30000|
|   World|30000|
|Sci/Tech|30000|
+--------+-----+



## Building Classification Pipeline

### LogReg with CountVectorizer

Tokenizer: tokenization (splits each document into tokens)

stopwordsRemover: removes stop words

countVectors: builds count vectors (“document-term vectors”)

In [None]:
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, OneHotEncoder, StringIndexer, VectorAssembler, SQLTransformer


In [None]:
%%time

document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")
    
tokenizer = Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")
    
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

stemmer = Stemmer() \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("stem")

finisher = Finisher() \
    .setInputCols(["stem"]) \
    .setOutputCols(["token_features"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(False)

countVectors = CountVectorizer(inputCol="token_features", outputCol="features", vocabSize=10000, minDF=5)

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")

nlp_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
           countVectors,
           label_stringIdx])

nlp_model = nlp_pipeline.fit(newsDF)

processed = nlp_model.transform(newsDF)

processed.count()

CPU times: user 145 ms, sys: 32.8 ms, total: 178 ms
Wall time: 19 s


120000

In [None]:
# Compare each description with the Finisher output (token_features) derived from it.
processed.select('description','token_features').show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                       description|                                    token_features|
+--------------------------------------------------+--------------------------------------------------+
| Short sellers, Wall Street's dwindling band of...|[short, seller, wall, street, dwindl, band, ult...|
| Private investment firm Carlyle Group, which h...|[privat, invest, firm, carlyl, group, reput, ma...|
| Soaring crude prices plus worries about the ec...|[soar, crude, price, plu, worri, economi, outlo...|
| Authorities have halted oil export flows from ...|[author, halt, oil, export, flow, main, pipelin...|
| Tearaway world oil prices, toppling records an...|[tearawai, world, oil, price, toppl, record, st...|
| Stocks ended slightly higher on Friday but sta...|[stock, end, slightli, higher, fridai, staye, n...|
| Assets of the nation's retail money market mut...|[asset, nati

In [None]:
# Untruncated token_features arrays for the first two rows.
processed.select('token_features').take(2)

[Row(token_features=['short', 'seller', 'wall', 'street', 'dwindl', 'band', 'ultra', 'cynic', 'see', 'green']),
 Row(token_features=['privat', 'invest', 'firm', 'carlyl', 'group', 'reput', 'make', 'well', 'time', 'occasion', 'controversi', 'plai', 'defens', 'industri', 'quietli', 'place', 'bet', 'anoth', 'part', 'market'])]

In [None]:
# CountVectorizer output ("features") for the first two rows.
processed.select('features').take(2)

[Row(features=SparseVector(10000, {241: 1.0, 384: 1.0, 467: 1.0, 744: 1.0, 837: 1.0, 2230: 1.0, 3677: 1.0, 6130: 1.0, 6283: 1.0})),
 Row(features=SparseVector(10000, {26: 1.0, 38: 1.0, 46: 1.0, 68: 1.0, 117: 1.0, 155: 1.0, 182: 1.0, 197: 1.0, 245: 1.0, 304: 1.0, 320: 1.0, 407: 1.0, 428: 1.0, 621: 1.0, 867: 1.0, 2364: 1.0, 2835: 1.0, 2861: 1.0, 6849: 1.0}))]

In [None]:
# The columns used for modelling: feature vector and StringIndexer label.
processed.select('description','features','label').show()

+--------------------+--------------------+-----+
|         description|            features|label|
+--------------------+--------------------+-----+
| Short sellers, W...|(10000,[241,384,4...|  0.0|
| Private investme...|(10000,[26,38,46,...|  0.0|
| Soaring crude pr...|(10000,[15,28,46,...|  0.0|
| Authorities have...|(10000,[0,32,35,4...|  0.0|
| Tearaway world o...|(10000,[1,2,11,28...|  0.0|
| Stocks ended sli...|(10000,[3,13,14,2...|  0.0|
| Assets of the na...|(10000,[0,4,10,15...|  0.0|
| Retail sales bou...|(10000,[0,1,10,15...|  0.0|
|" After earning a...|(10000,[98,99,125...|  0.0|
| Short sellers, W...|(10000,[241,384,4...|  0.0|
| Soaring crude pr...|(10000,[15,28,46,...|  0.0|
| OPEC can do noth...|(10000,[0,24,28,2...|  0.0|
| Non OPEC oil exp...|(10000,[0,21,28,3...|  0.0|
| WASHINGTON/NEW Y...|(10000,[2,4,13,14...|  0.0|
| The dollar tumbl...|(10000,[2,14,72,1...|  0.0|
|If you think you ...|(10000,[74,77,143...|  0.0|
|The purchasing po...|(10000,[46,54,167...|  0.0|


In [None]:
# set seed for reproducibility
trainingData, testData = processed.randomSplit([0.7, 0.3], seed=100)
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 84076
Test Dataset Count: 35924


In [None]:
# Inspect all columns carried by the training split (annotation columns included).
trainingData.printSchema()

root
 |-- category: string (nullable = true)
 |-- description: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |   

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the "features"/"label" columns of the current split.
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

# Rows predicted as class 0, sorted by the probability column in descending order.
(
    predictions
    .filter(predictions["prediction"] == 0)
    .select("description", "category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|  Sean Harrigan was ousted ...|   World|[0.9999051949476987,1.63436...|  2.0|       0.0|
|" U.S. stocks were slightly...|Business|[0.9995755945659964,1.58573...|  0.0|       0.0|
|" Financial services regula...|Business|[0.9985004285450796,5.81732...|  0.0|       0.0|
|Attorney General Thomas F. ...|Business|[0.9982656121521135,6.00818...|  0.0|       0.0|
|" U.S. blue chips declined ...|Business|[0.9976974351003786,9.61589...|  0.0|       0.0|
|   A federal judge approved...|   World|[0.9976493450596536,0.00193...|  2.0|       0.0|
|" Stocks were little change...|Business|[0.9974858308723592,0.00143...|  0.0|       0.0|
| Consumer prices rose by a ...|   World|[0.9952234500731161,0.00124...|  2.0|       0.0|
|" Stocks 

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# labelCol and metricName are written out explicitly; both are the evaluator's
# documented defaults, so the reported score is unchanged (F1, not accuracy).
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="f1",
)

evaluator.evaluate(predictions)

0.8991035336378451

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Evaluate the CountVectorizer + LogReg predictions produced above.
# (`predictions_tf` does not exist yet at this point in the notebook; it is only
# created in the TF-IDF section, so referencing it here fails on a fresh kernel.)
# Both columns are collected in a single pass so labels and predictions stay
# row-aligned and the Spark job runs once.
eval_pdf = predictions.select("label", "prediction").toPandas()

# Kept as one-column pandas DataFrames: later cells use y_true.label / y_pred.prediction.
y_true = eval_pdf[["label"]]
y_pred = eval_pdf[["prediction"]]


In [None]:
# How many test rows were assigned to each predicted label index.
y_pred.prediction.value_counts()

3.0    9254
1.0    9196
0.0    8875
2.0    8599
Name: prediction, dtype: int64

In [None]:
# Confusion matrix over integer label indices (first argument: true labels,
# second argument: predicted labels).
cnf_matrix = confusion_matrix(
    list(y_true.label.astype(int)),
    list(y_pred.prediction.astype(int)),
)
cnf_matrix

array([[7692,  864,  269,   90],
       [ 675, 7933,  301,  110],
       [ 453,  325, 7918,  292],
       [  55,   74,  111, 8762]])

In [None]:

# Per-class precision/recall/F1 followed by overall accuracy.
print(classification_report(y_true.label, y_pred.prediction))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86      8915
           1       0.86      0.88      0.87      9019
           2       0.92      0.88      0.90      8988
           3       0.95      0.97      0.96      9002

    accuracy                           0.90     35924
   macro avg       0.90      0.90      0.90     35924
weighted avg       0.90      0.90      0.90     35924

0.8992595479345284


### LogReg with TFIDF

In [None]:
from pyspark.ml.feature import HashingTF, IDF

# Hash the finished tokens into a fixed-size term-frequency vector ...
hashingTF = HashingTF(
    inputCol="token_features",
    outputCol="rawFeatures",
    numFeatures=10000,
)

# ... then rescale it with IDF. minDocFreq: remove sparse terms
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

# Same Spark NLP stages as the CountVectorizer pipeline, with HashingTF + IDF
# producing the "features" column instead.
nlp_pipeline_tf = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        stemmer,
        finisher,
        hashingTF,
        idf,
        label_stringIdx,
    ]
)

nlp_model_tf = nlp_pipeline_tf.fit(newsDF)

processed_tf = nlp_model_tf.transform(newsDF)

processed_tf.count()


120000

In [None]:
# Preview the IDF-weighted "features" column next to each description and label.
processed_tf.select('description','features','label').show()

+--------------------+--------------------+-----+
|         description|            features|label|
+--------------------+--------------------+-----+
| Short sellers, W...|(10000,[25,625,66...|  0.0|
| Private investme...|(10000,[82,111,15...|  0.0|
| Soaring crude pr...|(10000,[410,1097,...|  0.0|
| Authorities have...|(10000,[1611,1637...|  0.0|
| Tearaway world o...|(10000,[1150,1427...|  0.0|
| Stocks ended sli...|(10000,[332,410,6...|  0.0|
| Assets of the na...|(10000,[1442,1788...|  0.0|
| Retail sales bou...|(10000,[25,117,97...|  0.0|
|" After earning a...|(10000,[114,643,7...|  0.0|
| Short sellers, W...|(10000,[25,625,66...|  0.0|
| Soaring crude pr...|(10000,[410,1097,...|  0.0|
| OPEC can do noth...|(10000,[616,904,1...|  0.0|
| Non OPEC oil exp...|(10000,[616,2224,...|  0.0|
| WASHINGTON/NEW Y...|(10000,[351,360,3...|  0.0|
| The dollar tumbl...|(10000,[359,456,9...|  0.0|
|If you think you ...|(10000,[1041,1564...|  0.0|
|The purchasing po...|(10000,[2198,4091...|  0.0|


In [None]:

# set seed for reproducibility
trainingData, testData = processed_tf.randomSplit([0.7, 0.3], seed=100)
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 84076
Test Dataset Count: 35924


In [None]:
# Re-use the LogisticRegression estimator `lr` on the TF-IDF split.
lrModel_tf = lr.fit(trainingData)
predictions_tf = lrModel_tf.transform(testData)

# Test rows sorted by the probability column in descending order.
(
    predictions_tf
    .select("description", "category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|" U.S. stocks were slightly...|Business|[0.996387898390177,0.001153...|  0.0|       0.0|
|   A federal judge approved...|   World|[0.995561655194328,0.002267...|  2.0|       0.0|
| Disappointing retail sales...|   World|[0.994511777283951,0.002542...|  2.0|       0.0|
|" Financial services regula...|Business|[0.9933079442733128,0.00309...|  0.0|       0.0|
|Marsh  amp; McLennan Cos. ,...|Business|[0.9932418919849173,0.00258...|  0.0|       0.0|
|  Sean Harrigan was ousted ...|   World|[0.9927684868502167,0.00212...|  2.0|       0.0|
|Banknorth Group's largest s...|Business|[0.9925446790520405,0.00361...|  0.0|       0.0|
| The dollar rose slightly i...|Business|[0.9921019348676328,0.00294...|  0.0|       0.0|
|US aerosp

In [None]:
# Collect label and prediction together: one Spark job instead of two, and the
# two columns are guaranteed to describe the same rows in the same order.
eval_pdf = predictions_tf.select("label", "prediction").toPandas()

# Kept as one-column pandas DataFrames so y_true.label / y_pred.prediction still work.
y_true = eval_pdf[["label"]]
y_pred = eval_pdf[["prediction"]]

print(classification_report(y_true.label, y_pred.prediction))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

           0       0.86      0.85      0.85      8915
           1       0.85      0.86      0.86      9019
           2       0.91      0.87      0.89      8988
           3       0.93      0.96      0.95      9002

    accuracy                           0.89     35924
   macro avg       0.89      0.89      0.89     35924
weighted avg       0.89      0.89      0.89     35924

0.8865939204988309


### Random Forest with TFIDF

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on the same "features"/"label" columns as the LogReg above.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=4,
    maxBins=32,
)

# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions_rf = rfModel.transform(testData)


In [None]:

# Random-forest predictions sorted by the probability column in descending order.
(
    predictions_rf
    .select("description", "category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
| Best Buy Co. Inc. , the to...|Business|[0.38911855182695426,0.2277...|  0.0|       0.0|
| U.S. blue chip stocks fell...|Business|[0.3721192871952727,0.22265...|  0.0|       0.0|
| Staples Inc. (SPLS.O), the...|Business|[0.3669125286881084,0.22854...|  0.0|       0.0|
|United Parcel Service Inc. ...|Business|[0.36527694301773866,0.2392...|  0.0|       0.0|
| Disappointing retail sales...|   World|[0.36394253389697,0.2290257...|  2.0|       0.0|
|The Australian Gas Light Co...|Business|[0.3620963407047025,0.22832...|  0.0|       0.0|
| Kroger Co. , the top U.S. ...|Business|[0.36201015346984633,0.2232...|  0.0|       0.0|
| Texas Instruments Inc., th...|Business|[0.36191504725012813,0.2561...|  0.0|       0.0|
| Cosmetic

In [None]:
# Collect label and prediction together: one Spark job instead of two, and the
# two columns are guaranteed to describe the same rows in the same order.
eval_pdf = predictions_rf.select("label", "prediction").toPandas()

# Kept as one-column pandas DataFrames so y_true.label / y_pred.prediction still work.
y_true = eval_pdf[["label"]]
y_pred = eval_pdf[["prediction"]]

print(classification_report(y_true.label, y_pred.prediction))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.73      0.73      0.73      8915
         1.0       0.79      0.61      0.69      9019
         2.0       0.81      0.73      0.77      8988
         3.0       0.69      0.92      0.79      9002

    accuracy                           0.75     35924
   macro avg       0.76      0.75      0.74     35924
weighted avg       0.76      0.75      0.74     35924

0.7454626433582007


## LogReg with Spark NLP GloVe Word Embeddings

In [None]:
# Text preparation: same stages as before, but without stemming, so the
# cleaned tokens are looked up in the embeddings as whole words.
document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner() \
    .setInputCols("normalized") \
    .setOutputCol("cleanTokens") \
    .setCaseSensitive(False)

# Load the pretrained model through the class-level loader (no throwaway
# instance) and name the model explicitly: "glove_100d" is what the bare
# pretrained() call downloaded, so pinning it keeps the run reproducible.
glove_embeddings = WordEmbeddingsModel.pretrained("glove_100d", "en") \
    .setInputCols(["document", 'cleanTokens']) \
    .setOutputCol("embeddings") \
    .setCaseSensitive(False)

# Pool the token vectors of each document into one vector by averaging.
embeddingsSentence = SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

# Expose the pooled embeddings as Spark ML vectors in "finished_sentence_embeddings".
embeddings_finisher = EmbeddingsFinisher() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCols(["finished_sentence_embeddings"]) \
    .setOutputAsVector(True) \
    .setCleanAnnotations(False)

# finished_sentence_embeddings is an array of vectors; EXPLODE unpacks it into
# a single-vector "features" column that the classifier can consume.
explodeVectors = SQLTransformer(statement=
      "SELECT EXPLODE(finished_sentence_embeddings) AS features, * FROM __THIS__")

label_stringIdx = StringIndexer(inputCol="category", outputCol="label")


nlp_pipeline_w2v = Pipeline(
    stages=[document_assembler,
            tokenizer,
            normalizer,
            stopwords_cleaner,
            glove_embeddings,
            embeddingsSentence,
            embeddings_finisher,
            explodeVectors,
            label_stringIdx])

nlp_model_w2v = nlp_pipeline_w2v.fit(newsDF)

processed_w2v = nlp_model_w2v.transform(newsDF)

processed_w2v.count()


glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


120000

In [None]:
# The EmbeddingsFinisher in this pipeline writes to "finished_sentence_embeddings";
# no stage produces a "finished_embeddings" column, so selecting it would fail.
processed_w2v.select('finished_sentence_embeddings').take(1)

[Row(finished_embeddings=[" Short sellers, Wall Street's dwindling band of ultra cynics, are seeing green again."])]

In [None]:
# Tabular view of the finisher output; the column is named
# "finished_sentence_embeddings" (there is no "finished_embeddings" column).
processed_w2v.select("finished_sentence_embeddings").show(1)

+--------------------+
| finished_embeddings|
+--------------------+
|[ Short sellers, ...|
+--------------------+
only showing top 1 row



In [None]:
# EmbeddingsFinisher output for the first row: an array holding the sentence vector.
processed_w2v.select('finished_sentence_embeddings').take(1)

[Row(finished_sentence_embeddings=[DenseVector([-0.3996, -0.1811, -0.0817, -0.1405, -0.1474, 0.3627, 0.022, 0.2872, -0.0373, 0.3018, 0.2693, -0.0109, 0.4497, 0.3508, 0.1056, 0.0195, -0.0367, -0.3834, -0.6125, 0.0647, 0.8026, -0.2711, 0.1414, 0.2475, -0.125, -0.0847, -0.0201, -0.091, 0.059, 0.0927, -0.0997, -0.0921, -0.0728, 0.0223, 0.2052, -0.3765, -0.0727, 0.166, 0.1965, 0.2817, -0.3694, 0.101, 0.4664, 0.1242, 0.3345, 0.1135, -0.1376, 0.1322, -0.1108, -0.0528, 0.0252, -0.0141, -0.0596, 0.7911, -0.1946, -1.0185, -0.1145, -0.2511, 1.0126, 0.4601, -0.1696, 0.3242, 0.016, 0.1296, 0.1604, 0.2689, 0.0643, -0.0202, 0.0389, 0.3999, -0.0188, -0.5588, -0.2263, -0.2904, 0.0528, 0.2262, 0.3411, 0.2108, -0.6014, 0.0235, 0.496, -0.1189, 0.0201, -0.0828, -0.3899, 0.0564, 0.1625, 0.2013, -0.0906, 0.2355, 0.2936, 0.1562, -0.2338, 0.0981, -0.6123, 0.4507, 0.1272, 0.2078, 0.0752, 0.061])])]

In [None]:
# If the SQLTransformer stage is not used inside the pipeline, the vectors can
# be exploded outside of it instead (left commented out because the pipeline
# above already creates the "features" column).
from pyspark.sql.functions import explode

# processed_w2v= processed_w2v.withColumn("features", explode(processed_w2v.finished_sentence_embeddings))

In [None]:
# "features" is the exploded column created by the SQLTransformer stage: one vector per row.
processed_w2v.select("features").take(1)

[Row(features=DenseVector([-0.3996, -0.1811, -0.0817, -0.1405, -0.1474, 0.3627, 0.022, 0.2872, -0.0373, 0.3018, 0.2693, -0.0109, 0.4497, 0.3508, 0.1056, 0.0195, -0.0367, -0.3834, -0.6125, 0.0647, 0.8026, -0.2711, 0.1414, 0.2475, -0.125, -0.0847, -0.0201, -0.091, 0.059, 0.0927, -0.0997, -0.0921, -0.0728, 0.0223, 0.2052, -0.3765, -0.0727, 0.166, 0.1965, 0.2817, -0.3694, 0.101, 0.4664, 0.1242, 0.3345, 0.1135, -0.1376, 0.1322, -0.1108, -0.0528, 0.0252, -0.0141, -0.0596, 0.7911, -0.1946, -1.0185, -0.1145, -0.2511, 1.0126, 0.4601, -0.1696, 0.3242, 0.016, 0.1296, 0.1604, 0.2689, 0.0643, -0.0202, 0.0389, 0.3999, -0.0188, -0.5588, -0.2263, -0.2904, 0.0528, 0.2262, 0.3411, 0.2108, -0.6014, 0.0235, 0.496, -0.1189, 0.0201, -0.0828, -0.3899, 0.0564, 0.1625, 0.2013, -0.0906, 0.2355, 0.2936, 0.1562, -0.2338, 0.0981, -0.6123, 0.4507, 0.1272, 0.2078, 0.0752, 0.061]))]

In [None]:
# Repeats the features lookup for the first row.
processed_w2v.select("features").take(1)

[Row(features=DenseVector([-0.3996, -0.1811, -0.0817, -0.1405, -0.1474, 0.3627, 0.022, 0.2872, -0.0373, 0.3018, 0.2693, -0.0109, 0.4497, 0.3508, 0.1056, 0.0195, -0.0367, -0.3834, -0.6125, 0.0647, 0.8026, -0.2711, 0.1414, 0.2475, -0.125, -0.0847, -0.0201, -0.091, 0.059, 0.0927, -0.0997, -0.0921, -0.0728, 0.0223, 0.2052, -0.3765, -0.0727, 0.166, 0.1965, 0.2817, -0.3694, 0.101, 0.4664, 0.1242, 0.3345, 0.1135, -0.1376, 0.1322, -0.1108, -0.0528, 0.0252, -0.0141, -0.0596, 0.7911, -0.1946, -1.0185, -0.1145, -0.2511, 1.0126, 0.4601, -0.1696, 0.3242, 0.016, 0.1296, 0.1604, 0.2689, 0.0643, -0.0202, 0.0389, 0.3999, -0.0188, -0.5588, -0.2263, -0.2904, 0.0528, 0.2262, 0.3411, 0.2108, -0.6014, 0.0235, 0.496, -0.1189, 0.0201, -0.0828, -0.3899, 0.0564, 0.1625, 0.2013, -0.0906, 0.2355, 0.2936, 0.1562, -0.2338, 0.0981, -0.6123, 0.4507, 0.1272, 0.2078, 0.0752, 0.061]))]

In [None]:
# The columns used for modelling: sentence-embedding vector and StringIndexer label.
processed_w2v.select('description','features','label').show()


+--------------------+--------------------+-----+
|         description|            features|label|
+--------------------+--------------------+-----+
| Short sellers, W...|[-0.1556767076253...|  0.0|
| Private investme...|[-0.0144653050228...|  0.0|
| Soaring crude pr...|[0.10348732769489...|  0.0|
| Authorities have...|[-0.0355810523033...|  0.0|
| Tearaway world o...|[0.00647281948477...|  0.0|
| Stocks ended sli...|[0.20069395005702...|  0.0|
| Assets of the na...|[0.38012433052062...|  0.0|
| Retail sales bou...|[0.20352847874164...|  0.0|
|" After earning a...|[0.13536226749420...|  0.0|
| Short sellers, W...|[-0.1556767076253...|  0.0|
| Soaring crude pr...|[0.10348732769489...|  0.0|
| OPEC can do noth...|[0.20307321846485...|  0.0|
| Non OPEC oil exp...|[0.09010648727416...|  0.0|
| WASHINGTON/NEW Y...|[0.10887209326028...|  0.0|
| The dollar tumbl...|[0.05723679438233...|  0.0|
|If you think you ...|[0.11463439464569...|  0.0|
|The purchasing po...|[0.05890964344143...|  0.0|


In [None]:
# set seed for reproducibility
trainingData, testData = processed_w2v.randomSplit([0.7, 0.3], seed=100)
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 84076
Test Dataset Count: 35924


In [None]:
from pyspark.sql.functions import udf

@udf("long")
def num_nonzeros(v):
    """Return ``v.numNonzeros()``, the number of non-zero entries of vector ``v``.

    Registered as a Spark UDF with a ``long`` return type so it can be applied
    to a vector column inside a DataFrame expression.
    """
    return v.numNonzeros()

# Keep only test rows whose feature vector has at least one non-zero entry.
# Only testData is filtered; trainingData is left as it is.
# NOTE(review): presumably these are texts with no tokens found in the
# embeddings vocabulary - confirm.
testData = testData.where(num_nonzeros("features") != 0)

In [None]:
# Test-set size after dropping rows with all-zero feature vectors.
testData.count()

35922

In [None]:
# Fit the existing LogisticRegression estimator `lr` on the embedding-based training split.
lrModel_w2v = lr.fit(trainingData)

In [None]:
# Score the filtered test split with the embedding-based model.
predictions_w2v = lrModel_w2v.transform(testData)

# Test rows sorted by the probability column in descending order.
(
    predictions_w2v
    .select("description", "category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|Wachovia Corp. said Friday ...|Business|[0.9881375332949472,0.00933...|  0.0|       0.0|
| Stocks fell on Tuesday as ...|Business|[0.9875395796430402,0.00657...|  0.0|       0.0|
|Tokyo stocks plunged Tuesda...|Business|[0.9836560997852465,0.00471...|  0.0|       0.0|
| Financial services company...|Business|[0.9829326426472043,0.01029...|  0.0|       0.0|
|Goldman Sachs Group Inc. on...|Business|[0.9824775002947608,0.01053...|  0.0|       0.0|
|The steel tubing company re...|Business|[0.9814626385223189,0.01505...|  0.0|       0.0|
| Brokerage firm A.G. Edward...|Business|[0.9806832485814365,0.01494...|  0.0|       0.0|
|" Stocks fell sharply on We...|Business|[0.9806621577952687,0.00794...|  0.0|       0.0|
|Tokyo sto

+--------+--------------------+----------------------------+-----+--------------------+
|category|         description|finished_sentence_embeddings|label|            features|
+--------+--------------------+----------------------------+-----+--------------------+
|Sci/Tech|             "\""Now|                        [[]]|  1.0|                  []|
|Sci/Tech|&lt;strong&gt;Let...|        [[0.0,0.0,0.0,0.0...|  1.0|[0.0,0.0,0.0,0.0,...|
+--------+--------------------+----------------------------+-----+--------------------+



In [None]:
# Collect label and prediction together: one Spark job instead of two, and the
# two columns are guaranteed to describe the same rows in the same order.
eval_pdf = predictions_w2v.select("label", "prediction").toPandas()

# Kept as one-column pandas DataFrames so y_true.label / y_pred.prediction still work.
y_true = eval_pdf[["label"]]
y_pred = eval_pdf[["prediction"]]

print(classification_report(y_true.label, y_pred.prediction))
print(accuracy_score(y_true.label, y_pred.prediction))

              precision    recall  f1-score   support

         0.0       0.83      0.83      0.83      8915
         1.0       0.83      0.82      0.83      9017
         2.0       0.87      0.87      0.87      8988
         3.0       0.93      0.96      0.94      9002

    accuracy                           0.87     35922
   macro avg       0.87      0.87      0.87     35922
weighted avg       0.87      0.87      0.87     35922

0.8675185123322755


In [None]:
# Compare each description with the tokens left after normalization and stop-word removal.
processed_w2v.select('description','cleanTokens.result').show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                       description|                                            result|
+--------------------------------------------------+--------------------------------------------------+
| Short sellers, Wall Street's dwindling band of...|[Short, sellers, Wall, Streets, dwindling, band...|
| Private investment firm Carlyle Group, which h...|[Private, investment, firm, Carlyle, Group, rep...|
| Soaring crude prices plus worries about the ec...|[Soaring, crude, prices, plus, worries, economy...|
| Authorities have halted oil export flows from ...|[Authorities, halted, oil, export, flows, main,...|
| Tearaway world oil prices, toppling records an...|[Tearaway, world, oil, prices, toppling, record...|
| Stocks ended slightly higher on Friday but sta...|[Stocks, ended, slightly, higher, Friday, staye...|
| Assets of the nation's retail money market mut...|[Assets, nat

## LogReg with Spark NLP BERT Embeddings

In [None]:
# Text preparation stages (same configuration as in the word-embeddings pipeline).
document_assembler = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols("normalized")
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Token-level BERT embeddings computed on the cleaned tokens.
bert_embeddings = (
    BertEmbeddings
    .pretrained('bert_base_cased', 'en')
    .setInputCols(["document", 'cleanTokens'])
    .setOutputCol("bert")
    .setCaseSensitive(False)
    .setPoolingLayer(0)
)

# Average the token vectors into one vector per document.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "bert"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Expose the pooled embeddings as Spark ML vectors.
embeddings_finisher = (
    EmbeddingsFinisher()
    .setInputCols(["sentence_embeddings"])
    .setOutputCols(["finished_sentence_embeddings"])
    .setOutputAsVector(True)
    .setCleanAnnotations(False)
)

label_stringIdx = StringIndexer(inputCol="category", outputCol="label")


# No explode stage in this pipeline: "features" is created in the next cell.
nlp_pipeline_bert = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        bert_embeddings,
        embeddingsSentence,
        embeddings_finisher,
        label_stringIdx,
    ]
)

nlp_model_bert = nlp_pipeline_bert.fit(newsDF)

processed_bert = nlp_model_bert.transform(newsDF)

processed_bert.count()


bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


120000

In [None]:
from pyspark.sql.functions import explode

# Unpack the array in finished_sentence_embeddings into a single-vector
# "features" column (done here because the pipeline has no explode stage).
processed_bert = processed_bert.withColumn(
    "features",
    explode(processed_bert["finished_sentence_embeddings"]),
)

processed_bert.select('description', 'features', 'label').show()


+--------------------+--------------------+-----+
|         description|            features|label|
+--------------------+--------------------+-----+
|Srinagar, Nov 6 (...|[-0.0763546451926...|  2.0|
|France's presiden...|[0.01601043716073...|  2.0|
|President  Bush s...|[0.11258428543806...|  2.0|
|Established Shiit...|[0.09958435595035...|  2.0|
|While Democrats p...|[-0.3666543066501...|  2.0|
|Rural and deprive...|[0.08482994884252...|  1.0|
| Terrell Owens is...|[-0.1571628898382...|  3.0|
|" Gov. Ed Rendell...|[-0.0437468327581...|  3.0|
| A month after a ...|[-0.1684152632951...|  3.0|
| No Diana Taurasi...|[-0.0047841807827...|  3.0|
| An upbeat Presid...|[0.15349867939949...|  2.0|
| Gay and lesbian ...|[0.17594610154628...|  2.0|
| Twenty three peo...|[-0.0070635229349...|  2.0|
|  Connecticut Att...|[0.13604542613029...|  0.0|
|A new report on g...|[0.07444920390844...|  1.0|
|That Michael Siew...|[0.23243072628974...|  1.0|
|Vice chairman of ...|[-0.2215369194746...|  1.0|


In [None]:
# set seed for reproducibility
trainingData, testData = processed_bert.randomSplit([0.7, 0.3], seed=100)
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 84045
Test Dataset Count: 35955


In [None]:
from pyspark.ml.classification import LogisticRegression

# Rebinds `lr` with maxIter=20 (the earlier estimator used maxIter=10).
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

# Rebinds `lrModel`: from here on it is the model fit on the BERT training split.
lrModel = lr.fit(trainingData)


In [None]:
from pyspark.sql.functions import udf

# Same definition as the num_nonzeros UDF in the word-embeddings section; repeated here.
@udf("long")
def num_nonzeros(v):
    """Return ``v.numNonzeros()``, the number of non-zero entries of vector ``v``."""
    return v.numNonzeros()

# Keep only test rows whose feature vector has at least one non-zero entry.
# Only testData is filtered; the model above was fit on the unfiltered trainingData.
testData = testData.where(num_nonzeros("features") != 0)

In [None]:
predictions = lrModel.transform(testData)

# Show ten rows, sorted descending on the probability column.
(
    predictions
    .select("description", "category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|The Securities and Exchange...|Business|[0.9967407593636138,0.00300...|  0.0|       0.0|
|Stocks opened higher today,...|Business|[0.9928207319563264,0.00469...|  0.0|       0.0|
| Retailer Payless ShoeSourc...|Business|[0.9926546087578139,0.00674...|  0.0|       0.0|
|The insurance brokerage rep...|Business|[0.9917833732987117,0.00754...|  0.0|       0.0|
|Shell outlined a profit str...|Business|[0.9916303454148256,0.00808...|  0.0|       0.0|
| Countrywide Financial Corp...|Business|[0.9916172364634749,0.00514...|  0.0|       0.0|
|PITTSBURGH Mellon Financial...|Business|[0.9915578428166462,0.00799...|  0.0|       0.0|
|  Grocery wholesaler Flemin...|Business|[0.9915445608575104,0.00766...|  0.0|       0.0|
|Mark Head

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd

# Bring the evaluation columns to the driver so scikit-learn can score them.
df = predictions.select(
    'description', 'category', 'label', 'prediction'
).toPandas()

print(classification_report(df["label"], df["prediction"]))
print(accuracy_score(df["label"], df["prediction"]))

              precision    recall  f1-score   support

         0.0       0.82      0.79      0.80      8911
         1.0       0.81      0.80      0.81      8972
         2.0       0.84      0.86      0.85      9008
         3.0       0.90      0.94      0.92      9063

    accuracy                           0.85     35954
   macro avg       0.84      0.85      0.84     35954
weighted avg       0.84      0.85      0.85     35954

0.8459142237303221


## LogReg with ELMO Embeddings

In [None]:
# Text-preparation stages: raw text -> document -> tokens -> normalized tokens
# -> tokens with stop words removed.
document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")

normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

# Download the pretrained ELMo model instead of loading it from a path that
# only exists on the original author's machine, so the cell runs anywhere.
elmo_embeddings = ElmoEmbeddings.pretrained('elmo', 'en')\
      .setPoolingLayer("word_emb")\
      .setInputCols(["document",'cleanTokens'])\
      .setOutputCol("elmo")

# Average the token embeddings into one embedding per document.
embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "elmo"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")

# Convert the annotation column into Spark ML vectors; the annotation columns
# are kept (setCleanAnnotations(False)).
embeddings_finisher = EmbeddingsFinisher() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCols(["finished_sentence_embeddings"]) \
    .setOutputAsVector(True)\
    .setCleanAnnotations(False)

# Encode the string category as a numeric "label" column.
label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")


nlp_pipeline_elmo = Pipeline(
    stages=[document_assembler,
            tokenizer,
            normalizer,
            stopwords_cleaner,
            elmo_embeddings,
            embeddingsSentence,
            embeddings_finisher,
           label_stringIdx])

nlp_model_elmo = nlp_pipeline_elmo.fit(newsDF)

processed_elmo = nlp_model_elmo.transform(newsDF)

processed_elmo.count()


120000

In [None]:
(trainingData, testData) = newsDF.randomSplit([0.7, 0.3], seed = 100)

In [None]:
processed_trainingData = nlp_model_elmo.transform(trainingData)

processed_trainingData.count()

84045

In [None]:
processed_testData = nlp_model_elmo.transform(testData)

processed_testData.count()

35955

In [None]:
processed_trainingData.columns

['category',
 'description',
 'document',
 'token',
 'normalized',
 'cleanTokens',
 'elmo',
 'sentence_embeddings',
 'finished_sentence_embeddings',
 'label']

In [None]:


# Unwrap the finished embeddings array into a "features" vector column on both
# splits so they can be passed to Spark ML.
processed_testData = processed_testData.withColumn(
    "features",
    explode(processed_testData["finished_sentence_embeddings"]),
)

processed_trainingData = processed_trainingData.withColumn(
    "features",
    explode(processed_trainingData["finished_sentence_embeddings"]),
)


In [None]:
from pyspark.sql.functions import udf


@udf("long")
def num_nonzeros(v):
    """Count the non-zero entries of the vector ``v``."""
    return v.numNonzeros()


# Discard test rows whose feature vector is entirely zero.
processed_testData = processed_testData.filter(num_nonzeros("features") != 0)

In [None]:
%%time

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

lrModel = lr.fit(processed_trainingData)


CPU times: user 97.4 ms, sys: 79.6 ms, total: 177 ms
Wall time: 15min 35s


In [None]:
processed_trainingData.columns

['category',
 'description',
 'document',
 'token',
 'normalized',
 'cleanTokens',
 'elmo',
 'sentence_embeddings',
 'finished_sentence_embeddings',
 'label',
 'features']

In [None]:

predictions = lrModel.transform(processed_testData)

# Show ten rows, sorted descending on the probability column.
(
    predictions
    .select("description", "category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|" Occidental Petroleum Corp...|Business|[0.9939238563124918,0.00317...|  0.0|       0.0|
|" Exxon Mobil Corp. &lt;A H...|Business|[0.9936105240300768,0.00421...|  0.0|       0.0|
| Falling oil prices and str...|   World|[0.9924582561860766,0.00703...|  2.0|       0.0|
|" U.S. Bancorp &lt;A HREF="...|Business|[0.9918036025020147,0.00491...|  0.0|       0.0|
|The convenience store chain...|Business|[0.98995176008916,0.0084863...|  0.0|       0.0|
|  Shares of Freddie Mac fel...|Business|[0.9899066353145851,0.00808...|  0.0|       0.0|
|Goldman Sachs Group Inc. on...|Business|[0.9887823628906515,0.00816...|  0.0|       0.0|
|index futures declined. Dow...|Business|[0.9887494817093566,0.00661...|  0.0|       0.0|
| Countryw

In [None]:
df = predictions.select('description','category','label','prediction').toPandas()

In [None]:
df.shape

(35954, 4)

In [None]:
df.head()

Unnamed: 0,description,category,label,prediction
0,A federal judge on Monday stayed his own rul...,Business,0.0,0.0
1,"A half dozen executives of Yukos, the embatt...",Business,0.0,0.0
2,A labor dispute may sideline professional ho...,Business,0.0,3.0
3,A ruling from the World Trade Organization c...,Business,0.0,0.0
4,American Airlines has unveiled a new simplif...,Business,0.0,0.0


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Per-class precision/recall/F1 followed by overall accuracy.
print(classification_report(df["label"], df["prediction"]))
print(accuracy_score(df["label"], df["prediction"]))

              precision    recall  f1-score   support

         0.0       0.83      0.82      0.83      8911
         1.0       0.83      0.82      0.83      8972
         2.0       0.87      0.88      0.87      9008
         3.0       0.94      0.96      0.95      9063

    accuracy                           0.87     35954
   macro avg       0.87      0.87      0.87     35954
weighted avg       0.87      0.87      0.87     35954

0.8694164766090003


## LogReg with Universal Sentence Encoder

In [None]:
document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")

In [None]:

# Download the pretrained Universal Sentence Encoder rather than loading it from
# a path that only exists on the original author's machine.
useEmbeddings = UniversalSentenceEncoder.pretrained('tfhub_use', 'en')\
      .setInputCols("document")\
      .setOutputCol("use_embeddings")

In [None]:

# Turn the USE annotations into Spark ML vectors, keeping the annotation columns.
embeddings_finisher = (
    EmbeddingsFinisher()
    .setInputCols(["use_embeddings"])
    .setOutputCols(["finished_use_embeddings"])
    .setOutputAsVector(True)
    .setCleanAnnotations(False)
)

# Encode the string category as a numeric "label" column.
label_stringIdx = StringIndexer(inputCol="category", outputCol="label")

use_stages = [
    document_assembler,
    useEmbeddings,
    embeddings_finisher,
    label_stringIdx,
]
use_pipeline = Pipeline(stages=use_stages)

In [None]:
use_df = use_pipeline.fit(newsDF).transform(newsDF)

In [None]:
use_df.select('finished_use_embeddings').show(3)

+-----------------------+
|finished_use_embeddings|
+-----------------------+
|   [[-0.029556609690...|
|   [[0.0133671779185...|
|   [[0.0291389804333...|
+-----------------------+
only showing top 3 rows



In [None]:
from pyspark.sql.functions import explode

# Unwrap the finished USE embeddings array into a "features" vector column.
use_df = use_df.withColumn(
    "features",
    explode(use_df["finished_use_embeddings"]),
)

In [None]:
use_df.show(2)

+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
|category|         description|            document|      use_embeddings|finished_use_embeddings|label|            features|
+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
|   World|Srinagar, Nov 6 (...|[[document, 0, 17...|[[sentence_embedd...|   [[-0.029556609690...|  2.0|[-0.0295566096901...|
|   World|France's presiden...|[[document, 0, 11...|[[sentence_embedd...|   [[0.0133671779185...|  2.0|[0.01336717791855...|
+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
only showing top 2 rows



In [None]:
# A fixed seed keeps the 70/30 split reproducible between runs.
trainingData, testData = use_df.randomSplit([0.7, 0.3], seed=100)
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd

from pyspark.ml.classification import LogisticRegression

# Train on the USE features and score the held-out split.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

# Preview only the rows predicted as class 0, sorted descending on probability.
(
    predictions
    .where(predictions["prediction"] == 0)
    .select("description", "category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)


+------------------------------+--------+------------------------------+-----+----------+
|                   description|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|ORRVILLE, Ohio - JM Smucker...|Business|[0.9846360253598295,0.01010...|  0.0|       0.0|
| Kroger Co. , the largest U...|Business|[0.9832865375880339,0.00933...|  0.0|       0.0|
|Fannie Mae, the largest US ...|Business|[0.9819043632044295,0.01021...|  0.0|       0.0|
|Financial services company ...|Business|[0.9815806438622043,0.00948...|  0.0|       0.0|
| The U.S. Securities and Ex...|Business|[0.9803398845434486,0.00881...|  0.0|       0.0|
|Albertsons Inc., the No. 2 ...|Business|[0.9801615716078368,0.01156...|  0.0|       0.0|
|NEW YORK : A better-than-ex...|Business|[0.980066826179653,0.008197...|  0.0|       0.0|
|" U.S. stocks ended lower o...|Business|[0.97990084173127,0.0139531...|  0.0|       0.0|
|SunTrust 

In [None]:
# Bring the evaluation columns to the driver for scoring with scikit-learn.
df = predictions.select(
    'description', 'category', 'label', 'prediction'
).toPandas()


In [None]:
df.head()

Unnamed: 0,description,category,label,prediction
0,A federal judge on Monday stayed his own rul...,Business,0.0,0.0
1,"A half dozen executives of Yukos, the embatt...",Business,0.0,0.0
2,A labor dispute may sideline professional ho...,Business,0.0,3.0
3,A ruling from the World Trade Organization c...,Business,0.0,0.0
4,American Airlines has unveiled a new simplif...,Business,0.0,0.0


In [None]:

print(classification_report(df.label, df.prediction))
print(accuracy_score(df.label, df.prediction))

              precision    recall  f1-score   support

         0.0       0.83      0.83      0.83      8911
         1.0       0.85      0.84      0.84      8973
         2.0       0.89      0.89      0.89      9008
         3.0       0.95      0.97      0.96      9063

    accuracy                           0.88     35955
   macro avg       0.88      0.88      0.88     35955
weighted avg       0.88      0.88      0.88     35955

0.8831038798498123


### Train on the entire dataset

In [None]:
# Refit with the same hyper-parameters, this time on every row of use_df.
lr = (
    LogisticRegression()
    .setMaxIter(20)
    .setRegParam(0.3)
    .setElasticNetParam(0)
)
lrModel = lr.fit(use_df)

In [None]:

# Read the held-out test set from the CSV downloaded at the top of the notebook.
# The parquet copy used previously is never created by this notebook.
# NOTE(review): assumes the test CSV has the same header row as the training
# CSV (category, description) -- confirm.
test_df = spark.read \
      .option("header", True) \
      .csv("news_category_test.csv")


In [None]:
# Fit the pipeline on the TRAINING data and only transform the test data.
# Fitting on test_df would refit the StringIndexer and could assign different
# numeric ids to the categories than the ones the classifier was trained on.
test_df = use_pipeline.fit(newsDF).transform(test_df)

In [None]:
# Unwrap the finished USE embeddings array into a "features" vector column.
test_df = test_df.withColumn(
    "features",
    explode(test_df["finished_use_embeddings"]),
)

In [None]:
test_df.show(2)

+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
|category|         description|            document|      use_embeddings|finished_use_embeddings|label|            features|
+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
|Business|Unions representi...|[[document, 0, 12...|[[sentence_embedd...|   [[0.0129975397139...|  1.0|[0.01299753971397...|
|Sci/Tech| TORONTO, Canada ...|[[document, 0, 22...|[[sentence_embedd...|   [[0.0019999044016...|  0.0|[0.00199990440160...|
+--------+--------------------+--------------------+--------------------+-----------------------+-----+--------------------+
only showing top 2 rows



In [None]:
predictions = lrModel.transform(test_df)

In [None]:
df = predictions.select('description','category','label','prediction').toPandas()

In [None]:
# Overwrite the label column with ids derived from the category name.
category_to_label = {
    'Business': 0.0,
    'Sci/Tech': 1.0,
    'World': 2.0,
    'Sports': 3.0,
}
df['label'] = df["category"].replace(category_to_label)

In [None]:
df.head()

Unnamed: 0,description,category,label,prediction
0,Unions representing workers at Turner Newall...,Business,0.0,0.0
1,"TORONTO, Canada A second team of rocketeer...",Sci/Tech,1.0,1.0
2,A company founded by a chemistry researcher a...,Sci/Tech,1.0,1.0
3,It's barely dawn when Mike Fitzpatrick starts...,Sci/Tech,1.0,1.0
4,Southern California's smog fighting agency we...,Sci/Tech,1.0,0.0


In [None]:
print(classification_report(df.label, df.prediction))
print(accuracy_score(df.label, df.prediction))

              precision    recall  f1-score   support

         0.0       0.83      0.83      0.83      1900
         1.0       0.84      0.85      0.85      1900
         2.0       0.90      0.87      0.89      1900
         3.0       0.95      0.97      0.96      1900

    accuracy                           0.88      7600
   macro avg       0.88      0.88      0.88      7600
weighted avg       0.88      0.88      0.88      7600

0.8798684210526316


## Spark NLP Licensed DocClassifier

In [None]:
from sparknlp_jsl.annotator import *

In [None]:
# A fixed seed keeps the 70/30 split of the raw data reproducible between runs.
trainingData, testData = newsDF.randomSplit([0.7, 0.3], seed=100)
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 84076
Test Dataset Count: 35924


In [None]:

# Text-preparation stages: raw text -> document -> tokens -> normalized tokens
# -> stop words removed -> stems.
document_assembler = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols("normalized")
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

stemmer = (
    Stemmer()
    .setInputCols(["cleanTokens"])
    .setOutputCol("stem")
)

# Classifier trained directly on the stems, with the string category as label.
logreg = (
    DocumentLogRegClassifierApproach()
    .setInputCols(["stem"])
    .setLabelCol("category")
    .setOutputCol("prediction")
)

nlp_stages = [
    document_assembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    stemmer,
    logreg,
]
nlp_pipeline = Pipeline(stages=nlp_stages)

# Fit on the training split only, then score the held-out split.
nlp_model = nlp_pipeline.fit(trainingData)
processed = nlp_model.transform(testData)

processed.count()

35923

In [None]:
processed.select('description','category','prediction.result').show(truncate=50)

+--------------------------------------------------+--------+----------+
|                                       description|category|    result|
+--------------------------------------------------+--------+----------+
|  In a city where terror attacks and a massive ...|Business|[Business]|
|  It sure isn #39;t the Goldilocks Economy of y...|Business|[Business]|
|, 8/30/2004. With 90 nanometer chips now on the...|Business|[Sci/Tech]|
|National Grid Transco, the Britain-based delive...|Business|[Business]|
| quot;A person who has been cheated is left in ...|Sci/Tech|[Sci/Tech]|
|" In its ongoing war with SCO over Linux and Un...|Sci/Tech|[Sci/Tech]|
|A bacteria-eating virus is the star of a new vi...|Sci/Tech|[Sci/Tech]|
|Birdman of Belair Mathew Tekulsky waxes on the ...|Sci/Tech|[Sci/Tech]|
|Computer maker sees to recover \$8.6 million in...|Sci/Tech|[Sci/Tech]|
|Hurricane Frances spared NASA #39;s depleted sh...|Sci/Tech|[Sci/Tech]|
|In a study, the now-public search engine out-ra...

In [None]:
processed.select('description','prediction.result').show(truncate=50)

+--------------------------------------------------+----------+
|                                       description|    result|
+--------------------------------------------------+----------+
|  In a city where terror attacks and a massive ...|[Business]|
|  It sure isn #39;t the Goldilocks Economy of y...|[Business]|
|, 8/30/2004. With 90 nanometer chips now on the...|[Sci/Tech]|
|National Grid Transco, the Britain-based delive...|[Business]|
| quot;A person who has been cheated is left in ...|[Sci/Tech]|
|" In its ongoing war with SCO over Linux and Un...|[Sci/Tech]|
|A bacteria-eating virus is the star of a new vi...|[Sci/Tech]|
|Birdman of Belair Mathew Tekulsky waxes on the ...|[Sci/Tech]|
|Computer maker sees to recover \$8.6 million in...|[Sci/Tech]|
|Hurricane Frances spared NASA #39;s depleted sh...|[Sci/Tech]|
|In a study, the now-public search engine out-ra...|[Sci/Tech]|
|New York, August 31: US technology executives a...|[Sci/Tech]|
|Ordinary mice can be turned into marath

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd

In [None]:
df = processed.select('description','category','prediction.result').toPandas()

In [None]:
df.head()

Unnamed: 0,description,category,result
0,In a city where terror attacks and a massive...,Business,[Business]
1,It sure isn #39;t the Goldilocks Economy of ...,Business,[Business]
2,", 8/30/2004. With 90 nanometer chips now on th...",Business,[Sci/Tech]
3,"National Grid Transco, the Britain-based deliv...",Business,[Business]
4,quot;A person who has been cheated is left in...,Sci/Tech,[Sci/Tech]


In [None]:
df.result[0][0]

'Business'

In [None]:
# Collect the predictions, then replace each result list with its first element.
df = processed.select(
    'description', 'category', 'prediction.result'
).toPandas()
df['result'] = df['result'].apply(lambda labels: labels[0])

In [None]:
df.head()

Unnamed: 0,description,category,result
0,In a city where terror attacks and a massive...,Business,Business
1,It sure isn #39;t the Goldilocks Economy of ...,Business,Business
2,", 8/30/2004. With 90 nanometer chips now on th...",Business,Sci/Tech
3,"National Grid Transco, the Britain-based deliv...",Business,Business
4,quot;A person who has been cheated is left in...,Sci/Tech,Sci/Tech


In [None]:

# Re-collect from Spark so the cell can be re-run safely: the result column is
# flattened below, and flattening it twice would index into the label string.
df = processed.select(
    'description', 'category', 'prediction.result'
).toPandas()
df['result'] = df['result'].apply(lambda labels: labels[0])

print(classification_report(df["category"], df["result"]))
print(accuracy_score(df["category"], df["result"]))

              precision    recall  f1-score   support

    Business       0.82      0.82      0.82      8915
    Sci/Tech       0.83      0.83      0.83      9018
      Sports       0.94      0.93      0.93      9002
       World       0.86      0.86      0.86      8988

    accuracy                           0.86     35923
   macro avg       0.86      0.86      0.86     35923
weighted avg       0.86      0.86      0.86     35923

0.8612588035520419


# ClassifierDL

In [None]:
# actual content is inside description column
document = DocumentAssembler()\
    .setInputCol("description")\
    .setOutputCol("document")

# Download the pretrained Universal Sentence Encoder rather than loading it from
# a path that only exists on the original author's machine.
use = UniversalSentenceEncoder.pretrained('tfhub_use', 'en')\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

# the classes/labels/categories are in category column
classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("category")\
  .setMaxEpochs(5)\
  .setEnableOutputLogs(True)

pipeline = Pipeline(
    stages = [
        document,
        use,
        classsifierdl
    ])

In [None]:
# A fixed seed keeps the 70/30 split of the raw data reproducible between runs.
trainingData, testData = newsDF.randomSplit([0.7, 0.3], seed=100)
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 84045
Test Dataset Count: 35955


In [None]:
pipelineModel = pipeline.fit(trainingData)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Score the held-out split produced by randomSplit above. The split is named
# `testData`; `testDataset` is not defined in this notebook.
df = pipelineModel.transform(testData).select('category','description',"class.result").toPandas()

# Each result is a list; keep its first element as the predicted category.
df['result'] = df['result'].apply(lambda x: x[0])

print(classification_report(df.category, df.result))
print(accuracy_score(df.category, df.result))

              precision    recall  f1-score   support

    Business       0.85      0.84      0.85      8911
    Sci/Tech       0.85      0.87      0.86      8973
      Sports       0.95      0.98      0.97      9063
       World       0.92      0.88      0.90      9008

    accuracy                           0.89     35955
   macro avg       0.89      0.89      0.89     35955
weighted avg       0.89      0.89      0.89     35955

0.8930329578639966


## Loading the trained classifier from disk

In [None]:
classsifierdlmodel = ClassifierDLModel.load('classifierDL_model_20200317_5e')
 

In [None]:
import sparknlp
sparknlp.__path__

['/Users/vkocaman/anaconda3/lib/python3.7/site-packages/sparknlp']

In [None]:
 .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("category")\
  .setMaxEpochs(5)\
  .setEnableOutputLogs(True)

In [None]:
# Read the training CSV from the working directory, which is where the wget
# cell at the top of the notebook downloads it.
trainDataset = spark.read \
      .option("header", True) \
      .csv("news_category_train.csv")

In [None]:
trainDataset.count()

120000

In [None]:
trainingData.count()

84045

In [None]:
document = DocumentAssembler()\
    .setInputCol("description")\
    .setOutputCol("document")


# Split each document into sentences; the encoder below embeds every sentence.
sentence = SentenceDetector()\
    .setInputCols(['document'])\
    .setOutputCol('sentence')

# Download the pretrained Universal Sentence Encoder rather than loading it from
# a path that only exists on the original author's machine.
use = UniversalSentenceEncoder.pretrained('tfhub_use', 'en')\
 .setInputCols(["sentence"])\
 .setOutputCol("sentence_embeddings")

# NOTE(review): this directory is not written anywhere in the visible part of
# the notebook; presumably the trained ClassifierDL model was saved there in an
# earlier session -- confirm it exists before running.
classsifierdlmodel = ClassifierDLModel.load('classifierDL_model_20200317_5e')

pipeline = Pipeline(
    stages = [
        document,
        sentence,
        use,
        classsifierdlmodel
    ])

In [None]:
pipeline.fit(testData.limit(1)).transform(testData.limit(10)).select('category','description',"class.result").show(10, truncate=50)

+--------+--------------------------------------------------+----------+
|category|                                       description|    result|
+--------+--------------------------------------------------+----------+
|Business|  A federal judge on Monday stayed his own ruli...|[Business]|
|Business|  A half dozen executives of Yukos, the embattl...|[Business]|
|Business|  A labor dispute may sideline professional hoc...|[Business]|
|Business|  A ruling from the World Trade Organization co...|[Business]|
|Business|  American Airlines has unveiled a new simplifi...|[Business]|
|Business|  Anglo Aussie miner BHP Billiton (BHP) (UK:BLT...|[Business]|
|Business|  Another group of investors hit beleaguered mo...|[Business]|
|Business|  At a sponsors' meeting of MIT Sloan School's ...|[Business]|
|Business|  Blockbuster Inc. wants to acquire rival Holly...|[Business]|
|Business|  Bolstered by investors, Oracle Corp. appears ...|[Business]|
+--------+-----------------------------------------

In [None]:
# Every stage is already a fitted model or a transformer, so fitting on a single
# row is enough to obtain a PipelineModel. The split is named `testData`;
# `testDataset` is not defined in this notebook.
lm = LightPipeline(pipeline.fit(testData.limit(1)))
lm.annotate('In its first two years, the UK dedicated card companies have surge')

{'document': ['In its first two years, the UK dedicated card companies have surge'],
 'sentence_embeddings': ['In its first two years, the UK dedicated card companies have surge'],
 'class': ['Sci/Tech']}

In [None]:
text='''
Fearing the fate of Italy, the centre-right government has threatened to be merciless with those who flout tough restrictions. As of Wednesday it will also include all shops being closed across Greece, with the exception of supermarkets. Banks, pharmacies, pet-stores, mobile phone stores, opticians, bakers, mini-markets, couriers and food delivery outlets are among the few that will also be allowed to remain open.
'''

In [None]:
# Build the LightPipeline from a PipelineModel fitted on one row of the test
# split. The split is named `testData`; `testDataset` is not defined here.
lm = LightPipeline(pipeline.fit(testData.limit(1)))

lm.annotate(text)

{'document': ['\nFearing the fate of Italy, the centre-right government has threatened to be merciless with those who flout tough restrictions. As of Wednesday it will also include all shops being closed across Greece, with the exception of supermarkets. Banks, pharmacies, pet-stores, mobile phone stores, opticians, bakers, mini-markets, couriers and food delivery outlets are among the few that will also be allowed to remain open.\n'],
 'sentence': ['Fearing the fate of Italy, the centre-right government has threatened to be merciless with those who flout tough restrictions.',
  'As of Wednesday it will also include all shops being closed across Greece, with the exception of supermarkets.',
  'Banks, pharmacies, pet-stores, mobile phone stores, opticians, bakers, mini-markets, couriers and food delivery outlets are among the few that will also be allowed to remain open.'],
 'sentence_embeddings': ['Fearing the fate of Italy, the centre-right government has threatened to be merciless wi

# ClassifierDL + GloVe + Basic Text Processing

In [None]:
tokenizer = Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
    .setInputCols(["token"]) \
    .setOutputCol("lemma")

# Configure the embeddings in this cell so the pipeline does not depend on a
# later cell having been run first; they must read the "lemma" column.
glove_embeddings = WordEmbeddingsModel().pretrained() \
 .setInputCols(["document",'lemma'])\
 .setOutputCol("embeddings")\
 .setCaseSensitive(False)

# document -> token -> lemma -> word embeddings
lemma_pipeline = Pipeline(
    stages=[document_assembler,
            tokenizer,
            lemma,
           glove_embeddings])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [None]:
lemma_pipeline.fit(trainingData.limit(1000)).transform(trainingData.limit(1000)).show(truncate=30)

+--------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+
|category|                   description|                      document|                         token|                         lemma|                    embeddings|
+--------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+
|Business|  #39;Tis the season to buy...|[[document, 0, 141,   #39;T...|[[token, 2, 8, #39;Tis, [se...|[[token, 2, 8, #39;Tis, [se...|[[word_embeddings, 2, 8, #3...|
|Business|  A Delaware judge rejected...|[[document, 0, 161,   A Del...|[[token, 2, 2, A, [sentence...|[[token, 2, 2, A, [sentence...|[[word_embeddings, 2, 2, A,...|
|Business|  A Food and Drug Administr...|[[document, 0, 140,   A Foo...|[[token, 2, 2, A, [sentence...|[[token, 2, 2, A, [sentence...|[[word_embeddings, 2, 2, A,...|
|Bus

In [None]:
# Text-preparation stages: raw text -> document -> tokens -> normalized tokens
# -> stop words removed -> lemmas.
document_assembler = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols("normalized")
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

lemma = (
    LemmatizerModel.pretrained('lemma_antbnc')
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

# Default pretrained word embeddings, looked up case-insensitively on the lemmas.
glove_embeddings = (
    WordEmbeddingsModel().pretrained()
    .setInputCols(["document", 'lemma'])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Average the word embeddings into one embedding per document.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Classifier trained on the averaged embeddings, labelled by the category column.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setMaxEpochs(10)
    .setEnableOutputLogs(True)
)

clf_stages = [
    document_assembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    lemma,
    glove_embeddings,
    embeddingsSentence,
    classsifierdl,
]
clf_pipeline = Pipeline(stages=clf_stages)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
!rm -rf classifier_dl_pipeline_glove

In [None]:
clf_pipelineModel.save('classifier_dl_pipeline_glove')

In [None]:
clf_pipelineModel = clf_pipeline.fit(trainingData)

In [None]:
# Score the held-out split. The split is named `testData`; `testDataset` is not
# defined in this notebook.
df = clf_pipelineModel.transform(testData).select('category','description',"class.result").toPandas()

# Each result is a list; keep its first element as the predicted category.
df['result'] = df['result'].apply(lambda x: x[0])

print(classification_report(df.category, df.result))

print(accuracy_score(df.category, df.result))

              precision    recall  f1-score   support

    Business       0.85      0.82      0.83      8911
    Sci/Tech       0.81      0.89      0.85      8973
      Sports       0.95      0.97      0.96      9063
       World       0.92      0.86      0.89      9008

    accuracy                           0.88     35955
   macro avg       0.88      0.88      0.88     35955
weighted avg       0.88      0.88      0.88     35955

0.8809066889167014


In [None]:
!cd data && ls -l

total 69352
drwxrwxr-x@  7 vkocaman  staff       224 Feb 25 22:04 [34mag_news_csv[m[m
-rw-------@  1 vkocaman  staff   9328727 Feb 28 13:14 news_Category.zip
drwxr-xr-x  16 vkocaman  staff       512 Feb 25 20:28 [34mnews_category.parquet[m[m
-rw-r--r--   1 vkocaman  staff   1504408 Feb 25 22:03 news_category_test.csv
drwxr-xr-x   6 vkocaman  staff       192 Feb 25 22:06 [34mnews_category_test.parquet[m[m
-rw-r--r--@  1 vkocaman  staff  24032125 Feb 28 11:58 news_category_train.csv


In [None]:
# pandas backs the DataFrame returned by `toPandas()` in the following cell.
import pandas as pd

In [None]:
# Convert the Spark DataFrame into a pandas DataFrame (all rows are collected to the driver).
news_df = newsDF.toPandas()

In [None]:
# Preview the first rows of the pandas copy.
news_df.head()

Unnamed: 0,category,description
0,World,"Srinagar, Nov 6 (UNI) Two militants and a Bord..."
1,World,France's president orders his forces to destro...
2,World,President Bush says he will reach out to alli...
3,World,Established Shiite parties and powerful upstar...
4,World,While Democrats placed their emphasis on the s...


In [None]:
# Make sure the target directory exists; `to_csv` does not create missing folders.
os.makedirs('data', exist_ok=True)

# Write the pandas copy to CSV without the index column.
news_df.to_csv('data/news_dataset.csv', index=False)

In [None]:
# Text-preparation pipeline only (no classifier stage):
# raw text -> tokens -> normalized tokens -> stop words removed -> lemmas
# -> word embeddings -> sentence embeddings.

# Wrap the raw `description` column into Spark NLP documents.
document_assembler = DocumentAssembler() \
    .setInputCol("description") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

# Stop-word matching is case-insensitive.
stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(["normalized"]) \
    .setOutputCol("cleanTokens") \
    .setCaseSensitive(False)

lemma = LemmatizerModel.pretrained('lemma_antbnc') \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

# `pretrained` is invoked on the class itself, so no empty model instance is
# created first. With no arguments it loads the default model (glove_100d,
# per the download log printed by this cell).
glove_embeddings = WordEmbeddingsModel.pretrained() \
    .setInputCols(["document", "lemma"]) \
    .setOutputCol("embeddings") \
    .setCaseSensitive(False)

# NOTE: `embeddingsSentence` is not defined in this cell; it is reused from an
# earlier cell, which therefore has to be run before this one.
txt_pipeline = Pipeline(
    stages=[document_assembler,
            tokenizer,
            normalizer,
            stopwords_cleaner,
            lemma,
            glove_embeddings,
            embeddingsSentence])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:

# Fit the text-preparation pipeline on a single row of `testData`.
# NOTE(review): presumably one row is enough because no stage here learns from the data — confirm.
txt_pipelineModel = txt_pipeline.fit(testData.limit(1))

In [None]:
# Overwrite any earlier export so that re-running this cell does not fail
# because the target path already exists.
txt_pipelineModel.write().overwrite().save('text_prep_pipeline_glove')

In [None]:
# Inspect the first few evaluated rows: true category, text and predicted label.
df.head()

Unnamed: 0,category,description,result
0,Business,A federal judge on Monday stayed his own rul...,Business
1,Business,"A half dozen executives of Yukos, the embatt...",Business
2,Business,A labor dispute may sideline professional ho...,Sports
3,Business,A ruling from the World Trade Organization c...,Sci/Tech
4,Business,American Airlines has unveiled a new simplif...,Business
