In [0]:
# todo remove non relevant locations, locations not associated with a city or country
# remove polarities above 1 and below -1, maybe round polarities to -1 or 1?

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# import natural language tool kit to help clean text
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer

test_file_path = "/FileStore/tables/07_2020.csv"

# Define Schema
schema = StructType([StructField('created_at', TimestampType(), True),
                     StructField('file_name', StringType(), True),
                     StructField('followers', IntegerType(), True),
                     StructField('friends', IntegerType(), True),
                     StructField('group_name', StringType(), True),
                     StructField('location', StringType(), True),
                     StructField('retweet_count', IntegerType(), True),
                     StructField('screenname', StringType(), True),
                     StructField('search_query', StringType(), True),
                     StructField('text', StringType(), True),
                     StructField('twitter_id', StringType(), True),
                     StructField('username', StringType(), True),
                     StructField('polarity', StringType(), True),                  
                     StructField('partition_0', StringType(), True),      
                     StructField('partition_1', StringType(), True)])

# must read polarity as a string then cast to float later after removing non numeric rows
df = spark.read.csv(test_file_path, header=True, schema=schema)#.repartition(200)
df = df.select("created_at", "group_name", "location", "text", "polarity", "partition_0", "partition_1")

print(df.count())

# remove non float type polarities
df_filtered = df.filter(col("polarity").cast(FloatType()).isNotNull()).select("created_at", "group_name", "location", "text", col("polarity").cast(FloatType()), "partition_0", "partition_1")

# remove polarities above 1 and below -1
df_filtered = df_filtered.filter("polarity < 1 and polarity > -1")

# convert polarities to 3 classes (0, 1, 3, negative, neutral, positive)
udf_polarity_threshold = udf(lambda x: 0 if (x < -0.1) else (1 if (x <= 0.1) else 2), IntegerType())
df_filtered = df_filtered.withColumn("polarity_class", udf_polarity_threshold("polarity"))
display(df_filtered)

# remove links
df_filtered = df_filtered.withColumn('text_cleaned', regexp_replace(df_filtered.text, r'http\S+', ''))

# remove RT twitter lingo for retweet which should have no impact on sentiment
# df_filtered = df_filtered.withColumn('text_cleaned', regexp_replace(df_filtered.text_cleaned, r'RT\s+', ""))

# remove all characters except alphabetic ones
# replace ' with nothing to make sure contractions are not split
df_filtered = df_filtered.withColumn('text_cleaned', regexp_replace(df_filtered.text_cleaned, "\'", ''))
df_filtered = df_filtered.withColumn('text_cleaned', regexp_replace(df_filtered.text_cleaned, '[^a-zA-Z\s]', ' '))

# group whitespace
df_filtered = df_filtered.withColumn('text_cleaned', regexp_replace(df_filtered.text_cleaned, '\s+', ' '))

# tokenize the text into words
df_filtered = Tokenizer(inputCol='text_cleaned', outputCol='words').transform(df_filtered)

# remove stopwords and 'rt' (rt is twitter lingo for retweet, has no imapct on text sentiment)
nltk.download('stopwords')
stop_words = stopwords.words("english")
stop_words.append("rt")
udf_remove_stop = udf(lambda x: [i for i in x if not i.lower() in stop_words], ArrayType(StringType()))
df_filtered = df_filtered.withColumn("words_cleaned", udf_remove_stop("words"))

# convert words to stems
stemmer = PorterStemmer()
udf_stem = udf(lambda x: [stemmer.stem(i) for i in x], ArrayType(StringType()))
df_filtered = df_filtered.withColumn("words_stem", udf_stem("words_cleaned"))

# limit to 1000 for now until i can figure out issues with memory
df_filtered = df_filtered.limit(200000)
display(df_filtered)

df_filtered = df_filtered.select("created_at", "group_name", "location", "text", "polarity_class", "partition_0", "partition_1", "words_stem")
df_filtered.printSchema()


1242243


created_at,group_name,location,text,polarity,partition_0,partition_1,polarity_class
2020-07-12T03:45:47.000+0000,Japan,,RT @Streetcar_honda: Cr. Owner : @L2PJapanTAKERU #Honda #Civic #FD2 #MugenRR #l2pjapan #Japan https://t.co/1WNFsNvNPf,0.0,Politics,Japan,1
2020-07-12T03:44:41.000+0000,Japan,"Florida, USA",RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,0.1779,Politics,Japan,2
2020-07-12T03:44:13.000+0000,Japan,,RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,0.1779,Politics,Japan,2
2020-07-12T03:43:32.000+0000,Japan,,#since2008 #tobebeautifu #tatioactivedx #tatio #tatio #shape #shapeslimming #softgel #sofrgelcapsules #fda… https://t.co/iZz0ERHA7l,0.0,Politics,Japan,1
2020-07-12T03:43:16.000+0000,Japan,for now here,#HIROSHIMA : THE NEXT DAY https://t.co/1dsmMEILXm #atomic #bomb #nuclear #japan #history,0.0,Politics,Japan,1
2020-07-12T03:42:12.000+0000,Japan,,RT @Nuke_Info: Regulator demands #TEPCO clarify responsibilities | NHK WORLD-#JAPAN News https://t.co/PQTg4SbQ8k,0.0,Politics,Japan,1
2020-07-12T03:41:55.000+0000,Japan,Hell,RT @AlArabiya_Eng: Watch: The former #Nissan boss Carlos #Ghosn shares new details on his daring escape from #Japan while under close surve…,0.6597,Politics,Japan,2
2020-07-12T03:41:49.000+0000,Japan,,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",0.0,Politics,Japan,1
2020-07-12T03:41:47.000+0000,Japan,,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",0.0,Politics,Japan,1
2020-07-12T03:41:40.000+0000,Japan,Earth,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",0.0,Politics,Japan,1


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


created_at,group_name,location,text,polarity,partition_0,partition_1,polarity_class,text_cleaned,words,words_cleaned,words_stem
2020-07-12T03:45:47.000+0000,Japan,,RT @Streetcar_honda: Cr. Owner : @L2PJapanTAKERU #Honda #Civic #FD2 #MugenRR #l2pjapan #Japan https://t.co/1WNFsNvNPf,0.0,Politics,Japan,1,RT Streetcar honda Cr Owner L PJapanTAKERU Honda Civic FD MugenRR l pjapan Japan,"List(rt, streetcar, honda, cr, owner, l, pjapantakeru, honda, civic, fd, mugenrr, l, pjapan, japan)","List(streetcar, honda, cr, owner, l, pjapantakeru, honda, civic, fd, mugenrr, l, pjapan, japan)","List(streetcar, honda, cr, owner, l, pjapantakeru, honda, civic, fd, mugenrr, l, pjapan, japan)"
2020-07-12T03:44:41.000+0000,Japan,"Florida, USA",RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,0.1779,Politics,Japan,2,RT KennethWHarmon Available to pre order on Amazon HistoricalFiction Japan WorldWar MagicalRealism Romance,"List(rt, kennethwharmon, available, to, pre, order, on, amazon, historicalfiction, japan, worldwar, magicalrealism, romance)","List(kennethwharmon, available, pre, order, amazon, historicalfiction, japan, worldwar, magicalrealism, romance)","List(kennethwharmon, avail, pre, order, amazon, historicalfict, japan, worldwar, magicalr, romanc)"
2020-07-12T03:44:13.000+0000,Japan,,RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,0.1779,Politics,Japan,2,RT KennethWHarmon Available to pre order on Amazon HistoricalFiction Japan WorldWar MagicalRealism Romance,"List(rt, kennethwharmon, available, to, pre, order, on, amazon, historicalfiction, japan, worldwar, magicalrealism, romance)","List(kennethwharmon, available, pre, order, amazon, historicalfiction, japan, worldwar, magicalrealism, romance)","List(kennethwharmon, avail, pre, order, amazon, historicalfict, japan, worldwar, magicalr, romanc)"
2020-07-12T03:43:32.000+0000,Japan,,#since2008 #tobebeautifu #tatioactivedx #tatio #tatio #shape #shapeslimming #softgel #sofrgelcapsules #fda… https://t.co/iZz0ERHA7l,0.0,Politics,Japan,1,since tobebeautifu tatioactivedx tatio tatio shape shapeslimming softgel sofrgelcapsules fda,"List(, since, tobebeautifu, tatioactivedx, tatio, tatio, shape, shapeslimming, softgel, sofrgelcapsules, fda)","List(, since, tobebeautifu, tatioactivedx, tatio, tatio, shape, shapeslimming, softgel, sofrgelcapsules, fda)","List(, sinc, tobebeautifu, tatioactivedx, tatio, tatio, shape, shapeslim, softgel, sofrgelcapsul, fda)"
2020-07-12T03:43:16.000+0000,Japan,for now here,#HIROSHIMA : THE NEXT DAY https://t.co/1dsmMEILXm #atomic #bomb #nuclear #japan #history,0.0,Politics,Japan,1,HIROSHIMA THE NEXT DAY atomic bomb nuclear japan history,"List(, hiroshima, the, next, day, atomic, bomb, nuclear, japan, history)","List(, hiroshima, next, day, atomic, bomb, nuclear, japan, history)","List(, hiroshima, next, day, atom, bomb, nuclear, japan, histori)"
2020-07-12T03:42:12.000+0000,Japan,,RT @Nuke_Info: Regulator demands #TEPCO clarify responsibilities | NHK WORLD-#JAPAN News https://t.co/PQTg4SbQ8k,0.0,Politics,Japan,1,RT Nuke Info Regulator demands TEPCO clarify responsibilities NHK WORLD JAPAN News,"List(rt, nuke, info, regulator, demands, tepco, clarify, responsibilities, nhk, world, japan, news)","List(nuke, info, regulator, demands, tepco, clarify, responsibilities, nhk, world, japan, news)","List(nuke, info, regul, demand, tepco, clarifi, respons, nhk, world, japan, news)"
2020-07-12T03:41:55.000+0000,Japan,Hell,RT @AlArabiya_Eng: Watch: The former #Nissan boss Carlos #Ghosn shares new details on his daring escape from #Japan while under close surve…,0.6597,Politics,Japan,2,RT AlArabiya Eng Watch The former Nissan boss Carlos Ghosn shares new details on his daring escape from Japan while under close surve,"List(rt, alarabiya, eng, watch, the, former, nissan, boss, carlos, ghosn, shares, new, details, on, his, daring, escape, from, japan, while, under, close, surve)","List(alarabiya, eng, watch, former, nissan, boss, carlos, ghosn, shares, new, details, daring, escape, japan, close, surve)","List(alarabiya, eng, watch, former, nissan, boss, carlo, ghosn, share, new, detail, dare, escap, japan, close, surv)"
2020-07-12T03:41:49.000+0000,Japan,,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",0.0,Politics,Japan,1,RT nirvana core Almost of YouTube s site traffic comes from the U S According to Alexa s estimates YouTube viewers are most likely,"List(rt, nirvana, core, almost, of, youtube, s, site, traffic, comes, from, the, u, s, according, to, alexa, s, estimates, youtube, viewers, are, most, likely)","List(nirvana, core, almost, youtube, site, traffic, comes, u, according, alexa, estimates, youtube, viewers, likely)","List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"
2020-07-12T03:41:47.000+0000,Japan,,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",0.0,Politics,Japan,1,RT nirvana core Almost of YouTube s site traffic comes from the U S According to Alexa s estimates YouTube viewers are most likely,"List(rt, nirvana, core, almost, of, youtube, s, site, traffic, comes, from, the, u, s, according, to, alexa, s, estimates, youtube, viewers, are, most, likely)","List(nirvana, core, almost, youtube, site, traffic, comes, u, according, alexa, estimates, youtube, viewers, likely)","List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"
2020-07-12T03:41:40.000+0000,Japan,Earth,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",0.0,Politics,Japan,1,RT nirvana core Almost of YouTube s site traffic comes from the U S According to Alexa s estimates YouTube viewers are most likely,"List(rt, nirvana, core, almost, of, youtube, s, site, traffic, comes, from, the, u, s, according, to, alexa, s, estimates, youtube, viewers, are, most, likely)","List(nirvana, core, almost, youtube, site, traffic, comes, u, according, alexa, estimates, youtube, viewers, likely)","List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"


root
 |-- created_at: timestamp (nullable = true)
 |-- group_name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- text: string (nullable = true)
 |-- polarity_class: integer (nullable = true)
 |-- partition_0: string (nullable = true)
 |-- partition_1: string (nullable = true)
 |-- words_stem: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [0]:
# https://www.analyticsvidhya.com/blog/2021/07/performing-sentiment-analysis-with-naive-bayes-classifier/
!pip install textblob
!pip install ntlk
import nltk

nltk.download(["names","stopwords","state_union","twitter_samples","movie_reviews","averaged_perceptron_tagger","vader_lexicon","punkt", ])

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m
[31mERROR: Could not find a version that satisfies the requirement ntlk[0m
[31mERROR: No matching distribution found for ntlk[0m
You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading pa

In [0]:
# Convert text column to array of strings
#from pyspark.sql.functions import split, col
#df_filtered = df_filtered.select(split(col("text")," ").alias("text"), "polarity")
df_filtered.printSchema()
#display(df_filtered)

trainDF, testDF = df_filtered.randomSplit([.2, .8], seed=42)

print("Train row count: ", trainDF.count())
print("Test row count: ", testDF.count())

from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer()
cv.setInputCol("words_stem")
cv.setOutputCol("vectorized_text")



root
 |-- created_at: timestamp (nullable = true)
 |-- group_name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- text: string (nullable = true)
 |-- polarity_class: integer (nullable = true)
 |-- partition_0: string (nullable = true)
 |-- partition_1: string (nullable = true)
 |-- words_stem: array (nullable = true)
 |    |-- element: string (containsNull = true)

Train row count:  40132
Test row count:  159868
Out[4]: CountVectorizer_233b9a68d4bb

In [0]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(modelType="multinomial", featuresCol="vectorized_text", labelCol="polarity_class")

In [0]:

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[cv, nb])
pipelineModel = pipeline.fit(trainDF)
predDF = pipelineModel.transform(testDF)
display(predDF.select("text", "polarity_class", "prediction"))
predDF = predDF.select("text", "polarity_class", col("prediction").cast(DoubleType()))
predDF.printSchema()

text,polarity_class,prediction
RT @PeterKhalilMP: 1/2 Given that Hong Kongers & others are now subject to Beijing’s new national security laws that override the independe…,2,2.0
RT @PeterKhalilMP: 1/2 Given that Hong Kongers & others are now subject to Beijing’s new national security laws that override the independe…,2,2.0
"RT @Senator_Patrick: Given #China's assault on democracy in #HongKong, Australia should give notice to terminate our extradition agreement…",0,0.0
RT @HongKongFP: Security law: White House may target Hong Kong dollar peg as punitive response – report https://t.co/3nVEY616F8 #hongkong…,0,0.0
"RT @Senator_Patrick: Given #China's assault on democracy in #HongKong, Australia should give notice to terminate our extradition agreement…",0,0.0
"""RT @revmahoney: We are calling upon President Trump to IMMEDIATELY sign the """"Hong Kong Autonomy Act."""" This legislation passed unanimously t…""",1,2.0
"RT @simjhenderson: 1. With the passage of the #NationalSecurityLaw in #HongKong, #Australia should immediately terminate the Australia-Hong…",1,1.0
RT @PeterKhalilMP: 1/2 Given that Hong Kongers & others are now subject to Beijing’s new national security laws that override the independe…,2,2.0
"""RT @AMFChina: """"The German Chancellor is under growing pressure to cut Germany's ties with Beijing as the Hong Kong crisis triggers a backla…""",0,0.0
"RT @Senator_Patrick: Given #China's assault on democracy in #HongKong, Australia should give notice to terminate our extradition agreement…",0,0.0


root
 |-- text: string (nullable = true)
 |-- polarity_class: integer (nullable = true)
 |-- prediction: double (nullable = false)



In [0]:
from sklearn.metrics import accuracy_score
import numpy as np
print("accuracy: ", accuracy_score(np.array(testDF.select("polarity_class").collect()), predDF.select("prediction").collect()))

accuracy:  0.8007793929992244


In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndTarget = predDF.select("polarity_class", "prediction")
metrics = MulticlassMetrics(predictionAndTarget.rdd.map(tuple))

# Summary stats
print("Accuracy = %s" % metrics.accuracy)



[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
[0;32m<command-2187533659011319>[0m in [0;36m<module>[0;34m[0m
[1;32m      5[0m [0;34m[0m[0m
[1;32m      6[0m [0;31m# Summary stats[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 7[0;31m [0mprint[0m[0;34m([0m[0;34m"Accuracy = %s"[0m [0;34m%[0m [0mmetrics[0m[0;34m.[0m[0maccuracy[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m/databricks/spark/python/pyspark/mllib/evaluation.py[0m in [0;36maccuracy[0;34m(self)[0m
[1;32m    334[0m         out of the total number of instances).
[1;32m    335[0m         """
[0;32m--> 336[0;31m         [0;32mreturn[0m [0mself[0m[0;34m.[0m[0mcall[0m[0;34m([0m[0;34m"accuracy"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m    337[0m [0;34m[0m[0m
[1;32m    338[0m     [0;34m@[0m[0mproperty[0m[0;34m[0m[0;34m[0m

In [0]:
cv_model = cv.fit(trainDF)
cv_df = cv_model.transform(trainDF)
display(cv_df)

print("cv_df row count: ", cv_df.count())

created_at,group_name,location,text,polarity_class,partition_0,partition_1,words_stem,vectorized_text
2020-07-08T15:57:58.000+0000,Hong Kong,"Osaka City Chuo Ward, Osaka","RT @Senator_Patrick: Given #China's assault on democracy in #HongKong, Australia should give notice to terminate our extradition agreement…",0,Politics,HongKong,"List(senat, patrick, given, china, assault, democraci, hongkong, australia, give, notic, termin, extradit, agreement)","Map(vectorType -> sparse, length -> 51692, indices -> List(10, 36, 166, 296, 377, 607, 821, 1045, 1333, 1551, 1838, 3602, 5354), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
2020-07-08T15:58:39.000+0000,Hong Kong,,RT @PeterKhalilMP: 1/2 Given that Hong Kongers & others are now subject to Beijing’s new national security laws that override the independe…,2,Politics,HongKong,"List(peterkhalilmp, given, hong, konger, amp, other, subject, beij, new, nation, secur, law, overrid, independ)","Map(vectorType -> sparse, length -> 51692, indices -> List(1, 5, 41, 94, 101, 157, 591, 709, 730, 1045, 2032, 5534, 6283, 6471), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
2020-07-08T15:59:05.000+0000,Hong Kong,,"""RT @AMFChina: """"The German Chancellor is under growing pressure to cut Germany's ties with Beijing as the Hong Kong crisis triggers a backla…""",0,Politics,HongKong,"List(, amfchina, german, chancellor, grow, pressur, cut, germani, tie, beij, hong, kong, crisi, trigger, backla)","Map(vectorType -> sparse, length -> 51692, indices -> List(0, 23, 101, 108, 441, 494, 558, 709, 978, 1276, 1972, 2044, 4838, 7639, 7976), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
2020-07-08T16:00:35.000+0000,Hong Kong,Moon 🌕,RT @PeterKhalilMP: 1/2 Given that Hong Kongers & others are now subject to Beijing’s new national security laws that override the independe…,2,Politics,HongKong,"List(peterkhalilmp, given, hong, konger, amp, other, subject, beij, new, nation, secur, law, overrid, independ)","Map(vectorType -> sparse, length -> 51692, indices -> List(1, 5, 41, 94, 101, 157, 591, 709, 730, 1045, 2032, 5534, 6283, 6471), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
2020-07-08T16:01:12.000+0000,Hong Kong,,RT @PeterKhalilMP: 1/2 Given that Hong Kongers & others are now subject to Beijing’s new national security laws that override the independe…,2,Politics,HongKong,"List(peterkhalilmp, given, hong, konger, amp, other, subject, beij, new, nation, secur, law, overrid, independ)","Map(vectorType -> sparse, length -> 51692, indices -> List(1, 5, 41, 94, 101, 157, 591, 709, 730, 1045, 2032, 5534, 6283, 6471), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
2020-07-08T16:01:16.000+0000,Hong Kong,,"RT @RepTedYoho: As the #CCP tightens its grip on the people of #HongKong, the world must hold them accountable. It is time to sign the Ho…",1,Politics,HongKong,"List(reptedyoho, ccp, tighten, grip, peopl, hongkong, world, must, hold, account, time, sign, ho)","Map(vectorType -> sparse, length -> 51692, indices -> List(7, 22, 27, 36, 113, 159, 541, 552, 675, 1517, 2611, 4398, 4442), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
2020-07-08T16:02:48.000+0000,Hong Kong,,RT @PeterKhalilMP: 1/2 Given that Hong Kongers & others are now subject to Beijing’s new national security laws that override the independe…,2,Politics,HongKong,"List(peterkhalilmp, given, hong, konger, amp, other, subject, beij, new, nation, secur, law, overrid, independ)","Map(vectorType -> sparse, length -> 51692, indices -> List(1, 5, 41, 94, 101, 157, 591, 709, 730, 1045, 2032, 5534, 6283, 6471), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
2020-07-08T16:03:05.000+0000,Hong Kong,,"RT @Senator_Patrick: Given #China's assault on democracy in #HongKong, Australia should give notice to terminate our extradition agreement…",0,Politics,HongKong,"List(senat, patrick, given, china, assault, democraci, hongkong, australia, give, notic, termin, extradit, agreement)","Map(vectorType -> sparse, length -> 51692, indices -> List(10, 36, 166, 296, 377, 607, 821, 1045, 1333, 1551, 1838, 3602, 5354), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
2020-07-08T16:04:07.000+0000,Hong Kong,,RT @darth_ringo: @smithmarion My grandfather took my father to #HongKong just to escape Cultural Revolution. The UK opened the Hong Kong bo…,2,Politics,HongKong,"List(darth, ringo, smithmarion, grandfath, took, father, hongkong, escap, cultur, revolut, uk, open, hong, kong, bo)","Map(vectorType -> sparse, length -> 51692, indices -> List(17, 36, 101, 108, 181, 470, 847, 1987, 2005, 2189, 2302, 15611, 30515, 37349, 50518), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
2020-07-08T16:05:02.000+0000,Hong Kong,"New York, NY","“Schools must not allow students to play, sing, or broadcast it [Glory to Hong Kong] in schools.” #HongKong author… https://t.co/CO57L9Dk26",2,Politics,HongKong,"List(, school, must, allow, student, play, sing, broadcast, glori, hong, kong, school, hongkong, author)","Map(vectorType -> sparse, length -> 51692, indices -> List(0, 36, 101, 108, 159, 261, 322, 361, 387, 650, 1667, 3280, 4813), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0))"


cv_df row count:  40132


In [0]:
nb_model = nb.fit(cv_df)
nb_df = nb_model.transform(cv_df)
display(nb_df)
