In [0]:
# todo remove non relevant locations, locations not associated with a city or country
# remove polarities above 1 and below -1, maybe round polarities to -1 or 1?

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# import natural language tool kit to help clean text
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer

test_file_path = "/FileStore/tables/07_2020.csv"

# Define Schema
schema = StructType([StructField('created_at', TimestampType(), True),
                     StructField('file_name', StringType(), True),
                     StructField('followers', IntegerType(), True),
                     StructField('friends', IntegerType(), True),
                     StructField('group_name', StringType(), True),
                     StructField('location', StringType(), True),
                     StructField('retweet_count', IntegerType(), True),
                     StructField('screenname', StringType(), True),
                     StructField('search_query', StringType(), True),
                     StructField('text', StringType(), True),
                     StructField('twitter_id', StringType(), True),
                     StructField('username', StringType(), True),
                     StructField('polarity', StringType(), True),                  
                     StructField('partition_0', StringType(), True),      
                     StructField('partition_1', StringType(), True)])

# must read polarity as a string then cast to float later after removing non numeric rows
df = spark.read.csv(test_file_path, header=True, schema=schema)
df = df.select("created_at", "search_query", "text", "polarity")

print(df.count())

# remove non float type polarities
df_filtered = df.filter(col("polarity").cast(FloatType()).isNotNull()).select("created_at", "search_query", "text", col("polarity").cast(FloatType()))

# remove polarities above 1 and below -1
df_filtered = df_filtered.filter("polarity < 1 and polarity > -1")

# convert polarities to 3 classes (0, 1, 3, negative, neutral, positive)
udf_polarity_threshold = udf(lambda x: 0 if (x < -0.1) else (1 if (x <= 0.1) else 2), IntegerType())
df_filtered = df_filtered.withColumn("polarity_class", udf_polarity_threshold("polarity"))

# remove links
df_filtered = df_filtered.withColumn('text_cleaned', regexp_replace(df_filtered.text, r'http\S+', ''))

# remove all characters except alphabetic ones
# replace ' with nothing to make sure contractions are not split
df_filtered = df_filtered.withColumn('text_cleaned', regexp_replace(df_filtered.text_cleaned, "\'", ''))
df_filtered = df_filtered.withColumn('text_cleaned', regexp_replace(df_filtered.text_cleaned, '[^a-zA-Z\s]', ' '))

# group whitespace
df_filtered = df_filtered.withColumn('text_cleaned', regexp_replace(df_filtered.text_cleaned, '\s+', ' '))

# tokenize the text into words
df_filtered = Tokenizer(inputCol='text_cleaned', outputCol='words').transform(df_filtered)

# remove stopwords and 'rt' (rt is twitter lingo for retweet, has no imapct on text sentiment)
nltk.download('stopwords')
stop_words = stopwords.words("english")
stop_words.append("rt")
udf_remove_stop = udf(lambda x: [i for i in x if not i.lower() in stop_words], ArrayType(StringType()))
df_filtered = df_filtered.withColumn("words_cleaned", udf_remove_stop("words"))

# convert words to stems
stemmer = PorterStemmer()
udf_stem = udf(lambda x: [stemmer.stem(i) for i in x], ArrayType(StringType()))
df_filtered = df_filtered.withColumn("words_stem", udf_stem("words_cleaned"))


display(df_filtered)

df_filtered = df_filtered.select("created_at", "search_query", "text", "polarity_class", "words_stem")

display(df_filtered)
df_filtered.printSchema()


1242243
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
root
 |-- created_at: timestamp (nullable = true)
 |-- search_query: string (nullable = true)
 |-- text: string (nullable = true)
 |-- polarity_class: integer (nullable = true)
 |-- words_stem: array (nullable = true)
 |    |-- element: string (containsNull = true)



created_at,search_query,text,polarity,polarity_class,text_cleaned,words,words_cleaned,words_stem
2020-07-12T03:45:47.000+0000,#Japan,RT @Streetcar_honda: Cr. Owner : @L2PJapanTAKERU #Honda #Civic #FD2 #MugenRR #l2pjapan #Japan https://t.co/1WNFsNvNPf,0.0,1,RT Streetcar honda Cr Owner L PJapanTAKERU Honda Civic FD MugenRR l pjapan Japan,"List(rt, streetcar, honda, cr, owner, l, pjapantakeru, honda, civic, fd, mugenrr, l, pjapan, japan)","List(streetcar, honda, cr, owner, l, pjapantakeru, honda, civic, fd, mugenrr, l, pjapan, japan)","List(streetcar, honda, cr, owner, l, pjapantakeru, honda, civic, fd, mugenrr, l, pjapan, japan)"
2020-07-12T03:44:41.000+0000,#Japan,RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,0.1779,2,RT KennethWHarmon Available to pre order on Amazon HistoricalFiction Japan WorldWar MagicalRealism Romance,"List(rt, kennethwharmon, available, to, pre, order, on, amazon, historicalfiction, japan, worldwar, magicalrealism, romance)","List(kennethwharmon, available, pre, order, amazon, historicalfiction, japan, worldwar, magicalrealism, romance)","List(kennethwharmon, avail, pre, order, amazon, historicalfict, japan, worldwar, magicalr, romanc)"
2020-07-12T03:44:13.000+0000,#Japan,RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,0.1779,2,RT KennethWHarmon Available to pre order on Amazon HistoricalFiction Japan WorldWar MagicalRealism Romance,"List(rt, kennethwharmon, available, to, pre, order, on, amazon, historicalfiction, japan, worldwar, magicalrealism, romance)","List(kennethwharmon, available, pre, order, amazon, historicalfiction, japan, worldwar, magicalrealism, romance)","List(kennethwharmon, avail, pre, order, amazon, historicalfict, japan, worldwar, magicalr, romanc)"
2020-07-12T03:43:32.000+0000,#Japan,#since2008 #tobebeautifu #tatioactivedx #tatio #tatio #shape #shapeslimming #softgel #sofrgelcapsules #fda… https://t.co/iZz0ERHA7l,0.0,1,since tobebeautifu tatioactivedx tatio tatio shape shapeslimming softgel sofrgelcapsules fda,"List(, since, tobebeautifu, tatioactivedx, tatio, tatio, shape, shapeslimming, softgel, sofrgelcapsules, fda)","List(, since, tobebeautifu, tatioactivedx, tatio, tatio, shape, shapeslimming, softgel, sofrgelcapsules, fda)","List(, sinc, tobebeautifu, tatioactivedx, tatio, tatio, shape, shapeslim, softgel, sofrgelcapsul, fda)"
2020-07-12T03:43:16.000+0000,#Japan,#HIROSHIMA : THE NEXT DAY https://t.co/1dsmMEILXm #atomic #bomb #nuclear #japan #history,0.0,1,HIROSHIMA THE NEXT DAY atomic bomb nuclear japan history,"List(, hiroshima, the, next, day, atomic, bomb, nuclear, japan, history)","List(, hiroshima, next, day, atomic, bomb, nuclear, japan, history)","List(, hiroshima, next, day, atom, bomb, nuclear, japan, histori)"
2020-07-12T03:42:12.000+0000,#Japan,RT @Nuke_Info: Regulator demands #TEPCO clarify responsibilities | NHK WORLD-#JAPAN News https://t.co/PQTg4SbQ8k,0.0,1,RT Nuke Info Regulator demands TEPCO clarify responsibilities NHK WORLD JAPAN News,"List(rt, nuke, info, regulator, demands, tepco, clarify, responsibilities, nhk, world, japan, news)","List(nuke, info, regulator, demands, tepco, clarify, responsibilities, nhk, world, japan, news)","List(nuke, info, regul, demand, tepco, clarifi, respons, nhk, world, japan, news)"
2020-07-12T03:41:55.000+0000,#Japan,RT @AlArabiya_Eng: Watch: The former #Nissan boss Carlos #Ghosn shares new details on his daring escape from #Japan while under close surve…,0.6597,2,RT AlArabiya Eng Watch The former Nissan boss Carlos Ghosn shares new details on his daring escape from Japan while under close surve,"List(rt, alarabiya, eng, watch, the, former, nissan, boss, carlos, ghosn, shares, new, details, on, his, daring, escape, from, japan, while, under, close, surve)","List(alarabiya, eng, watch, former, nissan, boss, carlos, ghosn, shares, new, details, daring, escape, japan, close, surve)","List(alarabiya, eng, watch, former, nissan, boss, carlo, ghosn, share, new, detail, dare, escap, japan, close, surv)"
2020-07-12T03:41:49.000+0000,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",0.0,1,RT nirvana core Almost of YouTube s site traffic comes from the U S According to Alexa s estimates YouTube viewers are most likely,"List(rt, nirvana, core, almost, of, youtube, s, site, traffic, comes, from, the, u, s, according, to, alexa, s, estimates, youtube, viewers, are, most, likely)","List(nirvana, core, almost, youtube, site, traffic, comes, u, according, alexa, estimates, youtube, viewers, likely)","List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"
2020-07-12T03:41:47.000+0000,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",0.0,1,RT nirvana core Almost of YouTube s site traffic comes from the U S According to Alexa s estimates YouTube viewers are most likely,"List(rt, nirvana, core, almost, of, youtube, s, site, traffic, comes, from, the, u, s, according, to, alexa, s, estimates, youtube, viewers, are, most, likely)","List(nirvana, core, almost, youtube, site, traffic, comes, u, according, alexa, estimates, youtube, viewers, likely)","List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"
2020-07-12T03:41:40.000+0000,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",0.0,1,RT nirvana core Almost of YouTube s site traffic comes from the U S According to Alexa s estimates YouTube viewers are most likely,"List(rt, nirvana, core, almost, of, youtube, s, site, traffic, comes, from, the, u, s, according, to, alexa, s, estimates, youtube, viewers, are, most, likely)","List(nirvana, core, almost, youtube, site, traffic, comes, u, according, alexa, estimates, youtube, viewers, likely)","List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"


created_at,search_query,text,polarity_class,words_stem
2020-07-12T03:45:47.000+0000,#Japan,RT @Streetcar_honda: Cr. Owner : @L2PJapanTAKERU #Honda #Civic #FD2 #MugenRR #l2pjapan #Japan https://t.co/1WNFsNvNPf,1,"List(streetcar, honda, cr, owner, l, pjapantakeru, honda, civic, fd, mugenrr, l, pjapan, japan)"
2020-07-12T03:44:41.000+0000,#Japan,RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,2,"List(kennethwharmon, avail, pre, order, amazon, historicalfict, japan, worldwar, magicalr, romanc)"
2020-07-12T03:44:13.000+0000,#Japan,RT @KennethWHarmon: Available to pre-order on Amazon. #HistoricalFiction #Japan #WorldWar2 #MagicalRealism #Romance https://t.co/yyU9ECG…,2,"List(kennethwharmon, avail, pre, order, amazon, historicalfict, japan, worldwar, magicalr, romanc)"
2020-07-12T03:43:32.000+0000,#Japan,#since2008 #tobebeautifu #tatioactivedx #tatio #tatio #shape #shapeslimming #softgel #sofrgelcapsules #fda… https://t.co/iZz0ERHA7l,1,"List(, sinc, tobebeautifu, tatioactivedx, tatio, tatio, shape, shapeslim, softgel, sofrgelcapsul, fda)"
2020-07-12T03:43:16.000+0000,#Japan,#HIROSHIMA : THE NEXT DAY https://t.co/1dsmMEILXm #atomic #bomb #nuclear #japan #history,1,"List(, hiroshima, next, day, atom, bomb, nuclear, japan, histori)"
2020-07-12T03:42:12.000+0000,#Japan,RT @Nuke_Info: Regulator demands #TEPCO clarify responsibilities | NHK WORLD-#JAPAN News https://t.co/PQTg4SbQ8k,1,"List(nuke, info, regul, demand, tepco, clarifi, respons, nhk, world, japan, news)"
2020-07-12T03:41:55.000+0000,#Japan,RT @AlArabiya_Eng: Watch: The former #Nissan boss Carlos #Ghosn shares new details on his daring escape from #Japan while under close surve…,2,"List(alarabiya, eng, watch, former, nissan, boss, carlo, ghosn, share, new, detail, dare, escap, japan, close, surv)"
2020-07-12T03:41:49.000+0000,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",1,"List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"
2020-07-12T03:41:47.000+0000,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",1,"List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"
2020-07-12T03:41:40.000+0000,#Japan,"RT @nirvana_core: Almost 15% of YouTube’s site traffic comes from the U.S. According to Alexa’s estimates, YouTube viewers are most likely…",1,"List(nirvana, core, almost, youtub, site, traffic, come, u, accord, alexa, estim, youtub, viewer, like)"


In [0]:
# Split into test and train
df_filtered.printSchema()

trainDF, testDF = df_filtered.randomSplit([.8, .2], seed=42)

root
 |-- created_at: timestamp (nullable = true)
 |-- search_query: string (nullable = true)
 |-- text: string (nullable = true)
 |-- polarity_class: integer (nullable = true)
 |-- words_stem: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [0]:
# Set up pipeline components

from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer()
cv.setInputCol("words_stem")
cv.setOutputCol("vectorized_text")

from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(modelType="multinomial", featuresCol="vectorized_text", labelCol="polarity_class")

In [0]:

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[cv, nb])
pipelineModel = pipeline.fit(trainDF)
predDF = pipelineModel.transform(testDF)
display(predDF.select("text", "polarity_class", "prediction"))
predDF = predDF.select("text", "polarity_class", col("prediction").cast(DoubleType()))
predDF.printSchema()

root
 |-- text: string (nullable = true)
 |-- polarity_class: integer (nullable = true)
 |-- prediction: double (nullable = false)



text,polarity_class,prediction
"RT @Senator_Patrick: Given #China's assault on democracy in #HongKong, Australia should give notice to terminate our extradition agreement…",0,0.0
"RT @simjhenderson: 1. With the passage of the #NationalSecurityLaw in #HongKong, #Australia should immediately terminate the Australia-Hong…",1,2.0
RT @PeterKhalilMP: 1/2 Given that Hong Kongers & others are now subject to Beijing’s new national security laws that override the independe…,2,2.0
"""RT @AMFChina: """"The German Chancellor is under growing pressure to cut Germany's ties with Beijing as the Hong Kong crisis triggers a backla…""",0,0.0
"""RT @AMFChina: """"The German Chancellor is under growing pressure to cut Germany's ties with Beijing as the Hong Kong crisis triggers a backla…""",0,0.0
"RT @Senator_Patrick: Given #China's assault on democracy in #HongKong, Australia should give notice to terminate our extradition agreement…",0,0.0
"""RT @AMFChina: """"The German Chancellor is under growing pressure to cut Germany's ties with Beijing as the Hong Kong crisis triggers a backla…""",0,0.0
"""RT @AMFChina: """"The German Chancellor is under growing pressure to cut Germany's ties with Beijing as the Hong Kong crisis triggers a backla…""",0,0.0
"RT @alphacentauriii: Pro-#HongKong democracy students in Perth allege death threats, intimidation from Chinese nationals. The Australian Fe…",0,0.0
TikTok to leave Hong Kong market ‘within days’. It was stated by #TikTok that it will quit the Hong Kong market aft… https://t.co/YLd022lYxw,1,0.0


In [0]:
from sklearn.metrics import accuracy_score
import numpy as np
print("accuracy: ", accuracy_score(np.array(testDF.select("polarity_class").collect()), predDF.select("prediction").collect()))

accuracy:  0.8321318735391807


In [0]:
df2 = spark.read.option("wholeFile", True)\
                .option("multiline", True)\
                .option("header", True)\
                .option("escape","\"")\
                .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")\
                .csv("/FileStore/tables/India_cricket.csv")\

df2 = df2.dropna()
display(df2)


text,created_at,search_query
Inside Edge Season 3 Review | Judo's Dhamaka #Youtube #Link https://t.co/Ni9j1xJv48 #InsideEdge3 #VivekOberoi #InsideEdge #FarhanAkhtar #insideedge3review #richachaddha #Cricket #90smusic #ott #India #InsideEdgeOnPrime #AmazonPrimeVideo #PrimeVideo #testcricket https://t.co/nxlHVyWvn4,2021-12-06T23:09:40.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets
#India is back to the No.1 spot in #ICC Men’s #Test Team Rankings #ymedia #cricket @ICC @BCCI @YudhvirJaswal https://t.co/df61PrryXH,2021-12-06T22:55:07.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets
CSA confirms revised India tour schedule. 1st test will be held Dec 26 https://t.co/U94tJTWB3n #BCCI #confirms #Cricket #ICC #India Read Full 👇,2021-12-06T21:45:03.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets
Twitter Celebrities Reactions On India Test Series Win Against New Zealand 🏏🇮🇳🇳🇿 Watch - https://t.co/AyYRyJvGe7 #Cricket #India #INDvNZ #NZvIND #INDvsNZ #NZvsIND #ViratKohli #CricketTwitter #AjazPatel #BCCI #10wickets #Ashwin #AskStar #Ashes #BANvPAK #BabarAzam #BBL11 #Dhoni https://t.co/6W4xxgadCk,2021-12-06T19:45:00.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets
Tests: 50 wins from 97 matches ODIs: 153 wins in 254 matches T20Is: 59 wins in 95 matches Virat Kohli - the first player in international cricket to secure 50 or more wins in each of the three formats 🔥🔥 #ViratKohli #India #INDvNZ #Cricket https://t.co/nGtemEqVCw,2021-12-06T18:30:00.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets
Mumbai: Indian players came up with a brilliant gesture to honour New Zealand spinner Ajaz Patel after his historic ten-wicket haul in the second test. Read more below! #india #Mumbai #players #match #cricket #NewZealand #AjazPatel #OmicronVirus #vaccine #wicket #UniCreds https://t.co/ZHM70lgd80,2021-12-06T17:58:43.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets
SA Vs IND: Schedule for India's tour of South Africa revised - @OfficialCSA @BCCI #SouthAfrica #India #Cricket #Sports #SAvsIND Check details- https://t.co/6NjtFWv5Ie,2021-12-06T17:30:00.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets
CSA confirms revised India tour schedule. 1st test will be held Dec 26 https://t.co/MyxZGcU0VU #Bcci #Confirms #Cricket #India #Oppo,2021-12-06T17:20:32.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets
One step ahead to world championship #BCCI #Cricket #india 👍👍👍👍 https://t.co/17sKZjOFrb,2021-12-06T17:13:27.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets
#INDvsSA @OfficialCSA revise tour schedule ! @BCCI @ICC #IndvsSA #India #SouthAfrica #INDvsSATest #INDvsSATestSeries #INDvsSATest2021 #INDvsSASeries #Cricket https://t.co/8CWlvPnA2i,2021-12-06T16:37:55.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets


In [0]:
# remove links
df2 = df2.withColumn('text_cleaned', regexp_replace(df2.text, r'http\S+', ''))

# remove all characters except alphabetic ones
# replace ' with nothing to make sure contractions are not split
df2 = df2.withColumn('text_cleaned', regexp_replace(df2.text_cleaned, "\'", ''))
df2 = df2.withColumn('text_cleaned', regexp_replace(df2.text_cleaned, '[^a-zA-Z\s]', ' '))

# group whitespace
df2 = df2.withColumn('text_cleaned', regexp_replace(df2.text_cleaned, '\s+', ' '))

# tokenize the text into words
df2 = Tokenizer(inputCol='text_cleaned', outputCol='words').transform(df2)

# remove stopwords and 'rt' (rt is twitter lingo for retweet, has no imapct on text sentiment)
nltk.download('stopwords')
stop_words = stopwords.words("english")
stop_words.append("rt")
udf_remove_stop = udf(lambda x: [i for i in x if not i.lower() in stop_words], ArrayType(StringType()))
df2 = df2.withColumn("words_cleaned", udf_remove_stop("words"))

# convert words to stems
stemmer = PorterStemmer()
udf_stem = udf(lambda x: [stemmer.stem(i) for i in x], ArrayType(StringType()))
df2 = df2.withColumn("words_stem", udf_stem("words_cleaned"))


display(df2)

df2 = df2.select("created_at", "search_query", "text", "words_stem")

display(df2)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


text,created_at,search_query,text_cleaned,words,words_cleaned,words_stem
Inside Edge Season 3 Review | Judo's Dhamaka #Youtube #Link https://t.co/Ni9j1xJv48 #InsideEdge3 #VivekOberoi #InsideEdge #FarhanAkhtar #insideedge3review #richachaddha #Cricket #90smusic #ott #India #InsideEdgeOnPrime #AmazonPrimeVideo #PrimeVideo #testcricket https://t.co/nxlHVyWvn4,2021-12-06T23:09:40.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,Inside Edge Season Review Judos Dhamaka Youtube Link InsideEdge VivekOberoi InsideEdge FarhanAkhtar insideedge review richachaddha Cricket smusic ott India InsideEdgeOnPrime AmazonPrimeVideo PrimeVideo testcricket,"List(inside, edge, season, review, judos, dhamaka, youtube, link, insideedge, vivekoberoi, insideedge, farhanakhtar, insideedge, review, richachaddha, cricket, smusic, ott, india, insideedgeonprime, amazonprimevideo, primevideo, testcricket)","List(inside, edge, season, review, judos, dhamaka, youtube, link, insideedge, vivekoberoi, insideedge, farhanakhtar, insideedge, review, richachaddha, cricket, smusic, ott, india, insideedgeonprime, amazonprimevideo, primevideo, testcricket)","List(insid, edg, season, review, judo, dhamaka, youtub, link, insideedg, vivekoberoi, insideedg, farhanakhtar, insideedg, review, richachaddha, cricket, smusic, ott, india, insideedgeonprim, amazonprimevideo, primevideo, testcricket)"
#India is back to the No.1 spot in #ICC Men’s #Test Team Rankings #ymedia #cricket @ICC @BCCI @YudhvirJaswal https://t.co/df61PrryXH,2021-12-06T22:55:07.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,India is back to the No spot in ICC Men s Test Team Rankings ymedia cricket ICC BCCI YudhvirJaswal,"List(, india, is, back, to, the, no, spot, in, icc, men, s, test, team, rankings, ymedia, cricket, icc, bcci, yudhvirjaswal)","List(, india, back, spot, icc, men, test, team, rankings, ymedia, cricket, icc, bcci, yudhvirjaswal)","List(, india, back, spot, icc, men, test, team, rank, ymedia, cricket, icc, bcci, yudhvirjasw)"
CSA confirms revised India tour schedule. 1st test will be held Dec 26 https://t.co/U94tJTWB3n #BCCI #confirms #Cricket #ICC #India Read Full 👇,2021-12-06T21:45:03.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,CSA confirms revised India tour schedule st test will be held Dec BCCI confirms Cricket ICC India Read Full,"List(csa, confirms, revised, india, tour, schedule, st, test, will, be, held, dec, bcci, confirms, cricket, icc, india, read, full)","List(csa, confirms, revised, india, tour, schedule, st, test, held, dec, bcci, confirms, cricket, icc, india, read, full)","List(csa, confirm, revis, india, tour, schedul, st, test, held, dec, bcci, confirm, cricket, icc, india, read, full)"
Twitter Celebrities Reactions On India Test Series Win Against New Zealand 🏏🇮🇳🇳🇿 Watch - https://t.co/AyYRyJvGe7 #Cricket #India #INDvNZ #NZvIND #INDvsNZ #NZvsIND #ViratKohli #CricketTwitter #AjazPatel #BCCI #10wickets #Ashwin #AskStar #Ashes #BANvPAK #BabarAzam #BBL11 #Dhoni https://t.co/6W4xxgadCk,2021-12-06T19:45:00.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,Twitter Celebrities Reactions On India Test Series Win Against New Zealand Watch Cricket India INDvNZ NZvIND INDvsNZ NZvsIND ViratKohli CricketTwitter AjazPatel BCCI wickets Ashwin AskStar Ashes BANvPAK BabarAzam BBL Dhoni,"List(twitter, celebrities, reactions, on, india, test, series, win, against, new, zealand, watch, cricket, india, indvnz, nzvind, indvsnz, nzvsind, viratkohli, crickettwitter, ajazpatel, bcci, wickets, ashwin, askstar, ashes, banvpak, babarazam, bbl, dhoni)","List(twitter, celebrities, reactions, india, test, series, win, new, zealand, watch, cricket, india, indvnz, nzvind, indvsnz, nzvsind, viratkohli, crickettwitter, ajazpatel, bcci, wickets, ashwin, askstar, ashes, banvpak, babarazam, bbl, dhoni)","List(twitter, celebr, reaction, india, test, seri, win, new, zealand, watch, cricket, india, indvnz, nzvind, indvsnz, nzvsind, viratkohli, crickettwitt, ajazpatel, bcci, wicket, ashwin, askstar, ash, banvpak, babarazam, bbl, dhoni)"
Tests: 50 wins from 97 matches ODIs: 153 wins in 254 matches T20Is: 59 wins in 95 matches Virat Kohli - the first player in international cricket to secure 50 or more wins in each of the three formats 🔥🔥 #ViratKohli #India #INDvNZ #Cricket https://t.co/nGtemEqVCw,2021-12-06T18:30:00.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,Tests wins from matches ODIs wins in matches T Is wins in matches Virat Kohli the first player in international cricket to secure or more wins in each of the three formats ViratKohli India INDvNZ Cricket,"List(tests, wins, from, matches, odis, wins, in, matches, t, is, wins, in, matches, virat, kohli, the, first, player, in, international, cricket, to, secure, or, more, wins, in, each, of, the, three, formats, viratkohli, india, indvnz, cricket)","List(tests, wins, matches, odis, wins, matches, wins, matches, virat, kohli, first, player, international, cricket, secure, wins, three, formats, viratkohli, india, indvnz, cricket)","List(test, win, match, odi, win, match, win, match, virat, kohli, first, player, intern, cricket, secur, win, three, format, viratkohli, india, indvnz, cricket)"
Mumbai: Indian players came up with a brilliant gesture to honour New Zealand spinner Ajaz Patel after his historic ten-wicket haul in the second test. Read more below! #india #Mumbai #players #match #cricket #NewZealand #AjazPatel #OmicronVirus #vaccine #wicket #UniCreds https://t.co/ZHM70lgd80,2021-12-06T17:58:43.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,Mumbai Indian players came up with a brilliant gesture to honour New Zealand spinner Ajaz Patel after his historic ten wicket haul in the second test Read more below india Mumbai players match cricket NewZealand AjazPatel OmicronVirus vaccine wicket UniCreds,"List(mumbai, indian, players, came, up, with, a, brilliant, gesture, to, honour, new, zealand, spinner, ajaz, patel, after, his, historic, ten, wicket, haul, in, the, second, test, read, more, below, india, mumbai, players, match, cricket, newzealand, ajazpatel, omicronvirus, vaccine, wicket, unicreds)","List(mumbai, indian, players, came, brilliant, gesture, honour, new, zealand, spinner, ajaz, patel, historic, ten, wicket, haul, second, test, read, india, mumbai, players, match, cricket, newzealand, ajazpatel, omicronvirus, vaccine, wicket, unicreds)","List(mumbai, indian, player, came, brilliant, gestur, honour, new, zealand, spinner, ajaz, patel, histor, ten, wicket, haul, second, test, read, india, mumbai, player, match, cricket, newzealand, ajazpatel, omicronviru, vaccin, wicket, unicr)"
SA Vs IND: Schedule for India's tour of South Africa revised - @OfficialCSA @BCCI #SouthAfrica #India #Cricket #Sports #SAvsIND Check details- https://t.co/6NjtFWv5Ie,2021-12-06T17:30:00.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,SA Vs IND Schedule for Indias tour of South Africa revised OfficialCSA BCCI SouthAfrica India Cricket Sports SAvsIND Check details,"List(sa, vs, ind, schedule, for, indias, tour, of, south, africa, revised, officialcsa, bcci, southafrica, india, cricket, sports, savsind, check, details)","List(sa, vs, ind, schedule, indias, tour, south, africa, revised, officialcsa, bcci, southafrica, india, cricket, sports, savsind, check, details)","List(sa, vs, ind, schedul, india, tour, south, africa, revis, officialcsa, bcci, southafrica, india, cricket, sport, savsind, check, detail)"
CSA confirms revised India tour schedule. 1st test will be held Dec 26 https://t.co/MyxZGcU0VU #Bcci #Confirms #Cricket #India #Oppo,2021-12-06T17:20:32.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,CSA confirms revised India tour schedule st test will be held Dec Bcci Confirms Cricket India Oppo,"List(csa, confirms, revised, india, tour, schedule, st, test, will, be, held, dec, bcci, confirms, cricket, india, oppo)","List(csa, confirms, revised, india, tour, schedule, st, test, held, dec, bcci, confirms, cricket, india, oppo)","List(csa, confirm, revis, india, tour, schedul, st, test, held, dec, bcci, confirm, cricket, india, oppo)"
One step ahead to world championship #BCCI #Cricket #india 👍👍👍👍 https://t.co/17sKZjOFrb,2021-12-06T17:13:27.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,One step ahead to world championship BCCI Cricket india,"List(one, step, ahead, to, world, championship, bcci, cricket, india)","List(one, step, ahead, world, championship, bcci, cricket, india)","List(one, step, ahead, world, championship, bcci, cricket, india)"
#INDvsSA @OfficialCSA revise tour schedule ! @BCCI @ICC #IndvsSA #India #SouthAfrica #INDvsSATest #INDvsSATestSeries #INDvsSATest2021 #INDvsSASeries #Cricket https://t.co/8CWlvPnA2i,2021-12-06T16:37:55.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,INDvsSA OfficialCSA revise tour schedule BCCI ICC IndvsSA India SouthAfrica INDvsSATest INDvsSATestSeries INDvsSATest INDvsSASeries Cricket,"List(, indvssa, officialcsa, revise, tour, schedule, bcci, icc, indvssa, india, southafrica, indvssatest, indvssatestseries, indvssatest, indvssaseries, cricket)","List(, indvssa, officialcsa, revise, tour, schedule, bcci, icc, indvssa, india, southafrica, indvssatest, indvssatestseries, indvssatest, indvssaseries, cricket)","List(, indvssa, officialcsa, revis, tour, schedul, bcci, icc, indvssa, india, southafrica, indvssatest, indvssatestseri, indvssatest, indvssaseri, cricket)"


created_at,search_query,text,words_stem
2021-12-06T23:09:40.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,Inside Edge Season 3 Review | Judo's Dhamaka #Youtube #Link https://t.co/Ni9j1xJv48 #InsideEdge3 #VivekOberoi #InsideEdge #FarhanAkhtar #insideedge3review #richachaddha #Cricket #90smusic #ott #India #InsideEdgeOnPrime #AmazonPrimeVideo #PrimeVideo #testcricket https://t.co/nxlHVyWvn4,"List(insid, edg, season, review, judo, dhamaka, youtub, link, insideedg, vivekoberoi, insideedg, farhanakhtar, insideedg, review, richachaddha, cricket, smusic, ott, india, insideedgeonprim, amazonprimevideo, primevideo, testcricket)"
2021-12-06T22:55:07.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,#India is back to the No.1 spot in #ICC Men’s #Test Team Rankings #ymedia #cricket @ICC @BCCI @YudhvirJaswal https://t.co/df61PrryXH,"List(, india, back, spot, icc, men, test, team, rank, ymedia, cricket, icc, bcci, yudhvirjasw)"
2021-12-06T21:45:03.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,CSA confirms revised India tour schedule. 1st test will be held Dec 26 https://t.co/U94tJTWB3n #BCCI #confirms #Cricket #ICC #India Read Full 👇,"List(csa, confirm, revis, india, tour, schedul, st, test, held, dec, bcci, confirm, cricket, icc, india, read, full)"
2021-12-06T19:45:00.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,Twitter Celebrities Reactions On India Test Series Win Against New Zealand 🏏🇮🇳🇳🇿 Watch - https://t.co/AyYRyJvGe7 #Cricket #India #INDvNZ #NZvIND #INDvsNZ #NZvsIND #ViratKohli #CricketTwitter #AjazPatel #BCCI #10wickets #Ashwin #AskStar #Ashes #BANvPAK #BabarAzam #BBL11 #Dhoni https://t.co/6W4xxgadCk,"List(twitter, celebr, reaction, india, test, seri, win, new, zealand, watch, cricket, india, indvnz, nzvind, indvsnz, nzvsind, viratkohli, crickettwitt, ajazpatel, bcci, wicket, ashwin, askstar, ash, banvpak, babarazam, bbl, dhoni)"
2021-12-06T18:30:00.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,Tests: 50 wins from 97 matches ODIs: 153 wins in 254 matches T20Is: 59 wins in 95 matches Virat Kohli - the first player in international cricket to secure 50 or more wins in each of the three formats 🔥🔥 #ViratKohli #India #INDvNZ #Cricket https://t.co/nGtemEqVCw,"List(test, win, match, odi, win, match, win, match, virat, kohli, first, player, intern, cricket, secur, win, three, format, viratkohli, india, indvnz, cricket)"
2021-12-06T17:58:43.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,Mumbai: Indian players came up with a brilliant gesture to honour New Zealand spinner Ajaz Patel after his historic ten-wicket haul in the second test. Read more below! #india #Mumbai #players #match #cricket #NewZealand #AjazPatel #OmicronVirus #vaccine #wicket #UniCreds https://t.co/ZHM70lgd80,"List(mumbai, indian, player, came, brilliant, gestur, honour, new, zealand, spinner, ajaz, patel, histor, ten, wicket, haul, second, test, read, india, mumbai, player, match, cricket, newzealand, ajazpatel, omicronviru, vaccin, wicket, unicr)"
2021-12-06T17:30:00.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,SA Vs IND: Schedule for India's tour of South Africa revised - @OfficialCSA @BCCI #SouthAfrica #India #Cricket #Sports #SAvsIND Check details- https://t.co/6NjtFWv5Ie,"List(sa, vs, ind, schedul, india, tour, south, africa, revis, officialcsa, bcci, southafrica, india, cricket, sport, savsind, check, detail)"
2021-12-06T17:20:32.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,CSA confirms revised India tour schedule. 1st test will be held Dec 26 https://t.co/MyxZGcU0VU #Bcci #Confirms #Cricket #India #Oppo,"List(csa, confirm, revis, india, tour, schedul, st, test, held, dec, bcci, confirm, cricket, india, oppo)"
2021-12-06T17:13:27.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,One step ahead to world championship #BCCI #Cricket #india 👍👍👍👍 https://t.co/17sKZjOFrb,"List(one, step, ahead, world, championship, bcci, cricket, india)"
2021-12-06T16:37:55.000+0000,(#IND OR #INDIA) AND (#Cricket OR #Match OR #INDVSAUS) -filter:retweets,#INDvsSA @OfficialCSA revise tour schedule ! @BCCI @ICC #IndvsSA #India #SouthAfrica #INDvsSATest #INDvsSATestSeries #INDvsSATest2021 #INDvsSASeries #Cricket https://t.co/8CWlvPnA2i,"List(, indvssa, officialcsa, revis, tour, schedul, bcci, icc, indvssa, india, southafrica, indvssatest, indvssatestseri, indvssatest, indvssaseri, cricket)"


In [0]:
df2_pred = pipelineModel.transform(df2)
display(df2_pred.select("text", "prediction"))
df2_pred = df2_pred.select("text", col("prediction").cast(DoubleType()))

display(df2_pred.groupBy("prediction").count())

text,prediction
Inside Edge Season 3 Review | Judo's Dhamaka #Youtube #Link https://t.co/Ni9j1xJv48 #InsideEdge3 #VivekOberoi #InsideEdge #FarhanAkhtar #insideedge3review #richachaddha #Cricket #90smusic #ott #India #InsideEdgeOnPrime #AmazonPrimeVideo #PrimeVideo #testcricket https://t.co/nxlHVyWvn4,1.0
#India is back to the No.1 spot in #ICC Men’s #Test Team Rankings #ymedia #cricket @ICC @BCCI @YudhvirJaswal https://t.co/df61PrryXH,2.0
CSA confirms revised India tour schedule. 1st test will be held Dec 26 https://t.co/U94tJTWB3n #BCCI #confirms #Cricket #ICC #India Read Full 👇,1.0
Twitter Celebrities Reactions On India Test Series Win Against New Zealand 🏏🇮🇳🇳🇿 Watch - https://t.co/AyYRyJvGe7 #Cricket #India #INDvNZ #NZvIND #INDvsNZ #NZvsIND #ViratKohli #CricketTwitter #AjazPatel #BCCI #10wickets #Ashwin #AskStar #Ashes #BANvPAK #BabarAzam #BBL11 #Dhoni https://t.co/6W4xxgadCk,2.0
Tests: 50 wins from 97 matches ODIs: 153 wins in 254 matches T20Is: 59 wins in 95 matches Virat Kohli - the first player in international cricket to secure 50 or more wins in each of the three formats 🔥🔥 #ViratKohli #India #INDvNZ #Cricket https://t.co/nGtemEqVCw,2.0
Mumbai: Indian players came up with a brilliant gesture to honour New Zealand spinner Ajaz Patel after his historic ten-wicket haul in the second test. Read more below! #india #Mumbai #players #match #cricket #NewZealand #AjazPatel #OmicronVirus #vaccine #wicket #UniCreds https://t.co/ZHM70lgd80,2.0
SA Vs IND: Schedule for India's tour of South Africa revised - @OfficialCSA @BCCI #SouthAfrica #India #Cricket #Sports #SAvsIND Check details- https://t.co/6NjtFWv5Ie,1.0
CSA confirms revised India tour schedule. 1st test will be held Dec 26 https://t.co/MyxZGcU0VU #Bcci #Confirms #Cricket #India #Oppo,1.0
One step ahead to world championship #BCCI #Cricket #india 👍👍👍👍 https://t.co/17sKZjOFrb,2.0
#INDvsSA @OfficialCSA revise tour schedule ! @BCCI @ICC #IndvsSA #India #SouthAfrica #INDvsSATest #INDvsSATestSeries #INDvsSATest2021 #INDvsSASeries #Cricket https://t.co/8CWlvPnA2i,2.0


prediction,count
0.0,13
1.0,556
2.0,431


In [0]:
df3 = spark.read.option("wholeFile", True)\
                .option("multiline", True)\
                .option("header", True)\
                .option("escape","\"")\
                .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")\
                .csv("/FileStore/tables/Australia_cricket.csv")\


df3 = df3.dropna()
display(df3)

text,created_at,search_query
🏏 2021 #TheAshes #Cricket Series - First Test in #Australia 🇦🇺 #Brisbane start tomorrow @englandcricket @CricketAus #Ashes #Ashes2021 https://t.co/i8g5rTK1Z3,2021-12-07T00:20:00.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets
No Jimmy means we can enjoy this... #TheAshes #Cricket #Australia #ENGLAND https://t.co/hcn1Zrc9sx,2021-12-06T23:37:39.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets
Australia vs England - Ashes 2021/22 Betting Bet on BetWinner: https://t.co/8J3w0LQ1zL  #SportsBetting #Betting #Cricket #Australia #AUS #England #ENG #Ashes #Ashes2021 #TheGabba #Gabba #AUSvENG #AUSvsENG #BetWinner #Follow https://t.co/KyiTMDgqUP,2021-12-06T19:54:29.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets
Fresh from a victory in the world coin tossing championship #Australia will fancy their chances at home in #TheAshes Of course much will depend on TV lenses keeping a distance and just how much sandpaper can usefully be deployed - gritty stuff #Cricket #tossers #AUSvENG,2021-12-06T19:27:40.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets
This Is a sight England fans will be familiar with! #Ashes #Ashes2021 #CricketTwitter #cricket #england #englandcricket #Australia #cricketaustralia #joeroot #gabba #engvaus https://t.co/xq6lxthkJm,2021-12-06T16:39:03.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets
It’s Ashes week! The wait Is almost over. 🏴󠁧󠁢󠁥󠁮󠁧󠁿 🇦🇺 #Ashes #Ashes2021 #ENGLAND #englandcricket #Cricket #Australia https://t.co/rr0Yg9tyWi,2021-12-06T16:25:27.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets
Ashes 2021-22: Perth no more to host final Test Which stadium will host the last test🤔See here👇 #Ashes #England #Australia #Cricket #TestCricket #TEst #Pakistan #IndvsNZ #PakvsBan #PSL #IPL https://t.co/0jKRgnGN4c,2021-12-06T16:19:57.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets
Ashes 2021-22: England’s captain opens up on Ben Stokes’ return 😳 Read more👇 #Ashes #England #Australia #Cricket #PSL #IPL #BenStokes https://t.co/qtiFHlOTn1,2021-12-06T15:45:21.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets
Golden Opportunity for Big Savings with Golden Ticket. Buy your Golden Ticket for Australia T20 League Now Buy your Ticket Now & Save Big! #Australia #AustraliaT20League #T20League #T20Cricket #Discounts #GoldenTickets #MegaDiscount #Cricket #abpooraIndiaKhelega #MyTeam11 https://t.co/h4CAxpaBBO,2021-12-06T14:01:59.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets
Who's excited for the Ashes? @ICCMediaComms @ESPNcricinfo @CricketAus @cricketcomau @ECB_cricket #Cricket #Ashes #ausvseng #Australia #ENGLAND #joeroot #stevesmith #Warner #anderson #gabba,2021-12-06T13:26:35.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets


In [0]:
# remove links
df3 = df3.withColumn('text_cleaned', regexp_replace(df3.text, r'http\S+', ''))

# remove all characters except alphabetic ones
# replace ' with nothing to make sure contractions are not split
df3 = df3.withColumn('text_cleaned', regexp_replace(df3.text_cleaned, "\'", ''))
df3 = df3.withColumn('text_cleaned', regexp_replace(df3.text_cleaned, '[^a-zA-Z\s]', ' '))

# group whitespace
df3 = df3.withColumn('text_cleaned', regexp_replace(df3.text_cleaned, '\s+', ' '))

# tokenize the text into words
df3 = Tokenizer(inputCol='text_cleaned', outputCol='words').transform(df3)

# remove stopwords and 'rt' (rt is twitter lingo for retweet, has no imapct on text sentiment)
nltk.download('stopwords')
stop_words = stopwords.words("english")
stop_words.append("rt")
udf_remove_stop = udf(lambda x: [i for i in x if not i.lower() in stop_words], ArrayType(StringType()))
df3 = df3.withColumn("words_cleaned", udf_remove_stop("words"))

# convert words to stems
stemmer = PorterStemmer()
udf_stem = udf(lambda x: [stemmer.stem(i) for i in x], ArrayType(StringType()))
df3 = df3.withColumn("words_stem", udf_stem("words_cleaned"))


display(df3)

df3 = df3.select("created_at", "search_query", "text", "words_stem")

display(df3)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


text,created_at,search_query,text_cleaned,words,words_cleaned,words_stem
🏏 2021 #TheAshes #Cricket Series - First Test in #Australia 🇦🇺 #Brisbane start tomorrow @englandcricket @CricketAus #Ashes #Ashes2021 https://t.co/i8g5rTK1Z3,2021-12-07T00:20:00.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,TheAshes Cricket Series First Test in Australia Brisbane start tomorrow englandcricket CricketAus Ashes Ashes,"List(, theashes, cricket, series, first, test, in, australia, brisbane, start, tomorrow, englandcricket, cricketaus, ashes, ashes)","List(, theashes, cricket, series, first, test, australia, brisbane, start, tomorrow, englandcricket, cricketaus, ashes, ashes)","List(, theash, cricket, seri, first, test, australia, brisban, start, tomorrow, englandcricket, cricketau, ash, ash)"
No Jimmy means we can enjoy this... #TheAshes #Cricket #Australia #ENGLAND https://t.co/hcn1Zrc9sx,2021-12-06T23:37:39.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,No Jimmy means we can enjoy this TheAshes Cricket Australia ENGLAND,"List(no, jimmy, means, we, can, enjoy, this, theashes, cricket, australia, england)","List(jimmy, means, enjoy, theashes, cricket, australia, england)","List(jimmi, mean, enjoy, theash, cricket, australia, england)"
Australia vs England - Ashes 2021/22 Betting Bet on BetWinner: https://t.co/8J3w0LQ1zL  #SportsBetting #Betting #Cricket #Australia #AUS #England #ENG #Ashes #Ashes2021 #TheGabba #Gabba #AUSvENG #AUSvsENG #BetWinner #Follow https://t.co/KyiTMDgqUP,2021-12-06T19:54:29.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,Australia vs England Ashes Betting Bet on BetWinner SportsBetting Betting Cricket Australia AUS England ENG Ashes Ashes TheGabba Gabba AUSvENG AUSvsENG BetWinner Follow,"List(australia, vs, england, ashes, betting, bet, on, betwinner, sportsbetting, betting, cricket, australia, aus, england, eng, ashes, ashes, thegabba, gabba, ausveng, ausvseng, betwinner, follow)","List(australia, vs, england, ashes, betting, bet, betwinner, sportsbetting, betting, cricket, australia, aus, england, eng, ashes, ashes, thegabba, gabba, ausveng, ausvseng, betwinner, follow)","List(australia, vs, england, ash, bet, bet, betwinn, sportsbet, bet, cricket, australia, au, england, eng, ash, ash, thegabba, gabba, ausveng, ausvseng, betwinn, follow)"
Fresh from a victory in the world coin tossing championship #Australia will fancy their chances at home in #TheAshes Of course much will depend on TV lenses keeping a distance and just how much sandpaper can usefully be deployed - gritty stuff #Cricket #tossers #AUSvENG,2021-12-06T19:27:40.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,Fresh from a victory in the world coin tossing championship Australia will fancy their chances at home in TheAshes Of course much will depend on TV lenses keeping a distance and just how much sandpaper can usefully be deployed gritty stuff Cricket tossers AUSvENG,"List(fresh, from, a, victory, in, the, world, coin, tossing, championship, australia, will, fancy, their, chances, at, home, in, theashes, of, course, much, will, depend, on, tv, lenses, keeping, a, distance, and, just, how, much, sandpaper, can, usefully, be, deployed, gritty, stuff, cricket, tossers, ausveng)","List(fresh, victory, world, coin, tossing, championship, australia, fancy, chances, home, theashes, course, much, depend, tv, lenses, keeping, distance, much, sandpaper, usefully, deployed, gritty, stuff, cricket, tossers, ausveng)","List(fresh, victori, world, coin, toss, championship, australia, fanci, chanc, home, theash, cours, much, depend, tv, lens, keep, distanc, much, sandpap, use, deploy, gritti, stuff, cricket, tosser, ausveng)"
This Is a sight England fans will be familiar with! #Ashes #Ashes2021 #CricketTwitter #cricket #england #englandcricket #Australia #cricketaustralia #joeroot #gabba #engvaus https://t.co/xq6lxthkJm,2021-12-06T16:39:03.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,This Is a sight England fans will be familiar with Ashes Ashes CricketTwitter cricket england englandcricket Australia cricketaustralia joeroot gabba engvaus,"List(this, is, a, sight, england, fans, will, be, familiar, with, ashes, ashes, crickettwitter, cricket, england, englandcricket, australia, cricketaustralia, joeroot, gabba, engvaus)","List(sight, england, fans, familiar, ashes, ashes, crickettwitter, cricket, england, englandcricket, australia, cricketaustralia, joeroot, gabba, engvaus)","List(sight, england, fan, familiar, ash, ash, crickettwitt, cricket, england, englandcricket, australia, cricketaustralia, joeroot, gabba, engvau)"
It’s Ashes week! The wait Is almost over. 🏴󠁧󠁢󠁥󠁮󠁧󠁿 🇦🇺 #Ashes #Ashes2021 #ENGLAND #englandcricket #Cricket #Australia https://t.co/rr0Yg9tyWi,2021-12-06T16:25:27.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,It s Ashes week The wait Is almost over Ashes Ashes ENGLAND englandcricket Cricket Australia,"List(it, s, ashes, week, the, wait, is, almost, over, ashes, ashes, england, englandcricket, cricket, australia)","List(ashes, week, wait, almost, ashes, ashes, england, englandcricket, cricket, australia)","List(ash, week, wait, almost, ash, ash, england, englandcricket, cricket, australia)"
Ashes 2021-22: Perth no more to host final Test Which stadium will host the last test🤔See here👇 #Ashes #England #Australia #Cricket #TestCricket #TEst #Pakistan #IndvsNZ #PakvsBan #PSL #IPL https://t.co/0jKRgnGN4c,2021-12-06T16:19:57.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,Ashes Perth no more to host final Test Which stadium will host the last test See here Ashes England Australia Cricket TestCricket TEst Pakistan IndvsNZ PakvsBan PSL IPL,"List(ashes, perth, no, more, to, host, final, test, which, stadium, will, host, the, last, test, see, here, ashes, england, australia, cricket, testcricket, test, pakistan, indvsnz, pakvsban, psl, ipl)","List(ashes, perth, host, final, test, stadium, host, last, test, see, ashes, england, australia, cricket, testcricket, test, pakistan, indvsnz, pakvsban, psl, ipl)","List(ash, perth, host, final, test, stadium, host, last, test, see, ash, england, australia, cricket, testcricket, test, pakistan, indvsnz, pakvsban, psl, ipl)"
Ashes 2021-22: England’s captain opens up on Ben Stokes’ return 😳 Read more👇 #Ashes #England #Australia #Cricket #PSL #IPL #BenStokes https://t.co/qtiFHlOTn1,2021-12-06T15:45:21.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,Ashes England s captain opens up on Ben Stokes return Read more Ashes England Australia Cricket PSL IPL BenStokes,"List(ashes, england, s, captain, opens, up, on, ben, stokes, return, read, more, ashes, england, australia, cricket, psl, ipl, benstokes)","List(ashes, england, captain, opens, ben, stokes, return, read, ashes, england, australia, cricket, psl, ipl, benstokes)","List(ash, england, captain, open, ben, stoke, return, read, ash, england, australia, cricket, psl, ipl, benstok)"
Golden Opportunity for Big Savings with Golden Ticket. Buy your Golden Ticket for Australia T20 League Now Buy your Ticket Now & Save Big! #Australia #AustraliaT20League #T20League #T20Cricket #Discounts #GoldenTickets #MegaDiscount #Cricket #abpooraIndiaKhelega #MyTeam11 https://t.co/h4CAxpaBBO,2021-12-06T14:01:59.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,Golden Opportunity for Big Savings with Golden Ticket Buy your Golden Ticket for Australia T League Now Buy your Ticket Now amp Save Big Australia AustraliaT League T League T Cricket Discounts GoldenTickets MegaDiscount Cricket abpooraIndiaKhelega MyTeam,"List(golden, opportunity, for, big, savings, with, golden, ticket, buy, your, golden, ticket, for, australia, t, league, now, buy, your, ticket, now, amp, save, big, australia, australiat, league, t, league, t, cricket, discounts, goldentickets, megadiscount, cricket, abpooraindiakhelega, myteam)","List(golden, opportunity, big, savings, golden, ticket, buy, golden, ticket, australia, league, buy, ticket, amp, save, big, australia, australiat, league, league, cricket, discounts, goldentickets, megadiscount, cricket, abpooraindiakhelega, myteam)","List(golden, opportun, big, save, golden, ticket, buy, golden, ticket, australia, leagu, buy, ticket, amp, save, big, australia, australiat, leagu, leagu, cricket, discount, goldenticket, megadiscount, cricket, abpooraindiakhelega, myteam)"
Who's excited for the Ashes? @ICCMediaComms @ESPNcricinfo @CricketAus @cricketcomau @ECB_cricket #Cricket #Ashes #ausvseng #Australia #ENGLAND #joeroot #stevesmith #Warner #anderson #gabba,2021-12-06T13:26:35.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,Whos excited for the Ashes ICCMediaComms ESPNcricinfo CricketAus cricketcomau ECB cricket Cricket Ashes ausvseng Australia ENGLAND joeroot stevesmith Warner anderson gabba,"List(whos, excited, for, the, ashes, iccmediacomms, espncricinfo, cricketaus, cricketcomau, ecb, cricket, cricket, ashes, ausvseng, australia, england, joeroot, stevesmith, warner, anderson, gabba)","List(whos, excited, ashes, iccmediacomms, espncricinfo, cricketaus, cricketcomau, ecb, cricket, cricket, ashes, ausvseng, australia, england, joeroot, stevesmith, warner, anderson, gabba)","List(who, excit, ash, iccmediacomm, espncricinfo, cricketau, cricketcomau, ecb, cricket, cricket, ash, ausvseng, australia, england, joeroot, stevesmith, warner, anderson, gabba)"


created_at,search_query,text,words_stem
2021-12-07T00:20:00.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,🏏 2021 #TheAshes #Cricket Series - First Test in #Australia 🇦🇺 #Brisbane start tomorrow @englandcricket @CricketAus #Ashes #Ashes2021 https://t.co/i8g5rTK1Z3,"List(, theash, cricket, seri, first, test, australia, brisban, start, tomorrow, englandcricket, cricketau, ash, ash)"
2021-12-06T23:37:39.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,No Jimmy means we can enjoy this... #TheAshes #Cricket #Australia #ENGLAND https://t.co/hcn1Zrc9sx,"List(jimmi, mean, enjoy, theash, cricket, australia, england)"
2021-12-06T19:54:29.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,Australia vs England - Ashes 2021/22 Betting Bet on BetWinner: https://t.co/8J3w0LQ1zL  #SportsBetting #Betting #Cricket #Australia #AUS #England #ENG #Ashes #Ashes2021 #TheGabba #Gabba #AUSvENG #AUSvsENG #BetWinner #Follow https://t.co/KyiTMDgqUP,"List(australia, vs, england, ash, bet, bet, betwinn, sportsbet, bet, cricket, australia, au, england, eng, ash, ash, thegabba, gabba, ausveng, ausvseng, betwinn, follow)"
2021-12-06T19:27:40.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,Fresh from a victory in the world coin tossing championship #Australia will fancy their chances at home in #TheAshes Of course much will depend on TV lenses keeping a distance and just how much sandpaper can usefully be deployed - gritty stuff #Cricket #tossers #AUSvENG,"List(fresh, victori, world, coin, toss, championship, australia, fanci, chanc, home, theash, cours, much, depend, tv, lens, keep, distanc, much, sandpap, use, deploy, gritti, stuff, cricket, tosser, ausveng)"
2021-12-06T16:39:03.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,This Is a sight England fans will be familiar with! #Ashes #Ashes2021 #CricketTwitter #cricket #england #englandcricket #Australia #cricketaustralia #joeroot #gabba #engvaus https://t.co/xq6lxthkJm,"List(sight, england, fan, familiar, ash, ash, crickettwitt, cricket, england, englandcricket, australia, cricketaustralia, joeroot, gabba, engvau)"
2021-12-06T16:25:27.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,It’s Ashes week! The wait Is almost over. 🏴󠁧󠁢󠁥󠁮󠁧󠁿 🇦🇺 #Ashes #Ashes2021 #ENGLAND #englandcricket #Cricket #Australia https://t.co/rr0Yg9tyWi,"List(ash, week, wait, almost, ash, ash, england, englandcricket, cricket, australia)"
2021-12-06T16:19:57.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,Ashes 2021-22: Perth no more to host final Test Which stadium will host the last test🤔See here👇 #Ashes #England #Australia #Cricket #TestCricket #TEst #Pakistan #IndvsNZ #PakvsBan #PSL #IPL https://t.co/0jKRgnGN4c,"List(ash, perth, host, final, test, stadium, host, last, test, see, ash, england, australia, cricket, testcricket, test, pakistan, indvsnz, pakvsban, psl, ipl)"
2021-12-06T15:45:21.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,Ashes 2021-22: England’s captain opens up on Ben Stokes’ return 😳 Read more👇 #Ashes #England #Australia #Cricket #PSL #IPL #BenStokes https://t.co/qtiFHlOTn1,"List(ash, england, captain, open, ben, stoke, return, read, ash, england, australia, cricket, psl, ipl, benstok)"
2021-12-06T14:01:59.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,Golden Opportunity for Big Savings with Golden Ticket. Buy your Golden Ticket for Australia T20 League Now Buy your Ticket Now & Save Big! #Australia #AustraliaT20League #T20League #T20Cricket #Discounts #GoldenTickets #MegaDiscount #Cricket #abpooraIndiaKhelega #MyTeam11 https://t.co/h4CAxpaBBO,"List(golden, opportun, big, save, golden, ticket, buy, golden, ticket, australia, leagu, buy, ticket, amp, save, big, australia, australiat, leagu, leagu, cricket, discount, goldenticket, megadiscount, cricket, abpooraindiakhelega, myteam)"
2021-12-06T13:26:35.000+0000,(#Australia) AND (#Cricket OR #Match OR #AUSVSIND) -filter:retweets,Who's excited for the Ashes? @ICCMediaComms @ESPNcricinfo @CricketAus @cricketcomau @ECB_cricket #Cricket #Ashes #ausvseng #Australia #ENGLAND #joeroot #stevesmith #Warner #anderson #gabba,"List(who, excit, ash, iccmediacomm, espncricinfo, cricketau, cricketcomau, ecb, cricket, cricket, ash, ausvseng, australia, england, joeroot, stevesmith, warner, anderson, gabba)"


In [0]:
df3_pred = pipelineModel.transform(df3)
display(df3_pred.select("text", "prediction"))
df3_pred = df3_pred.select("text", col("prediction").cast(DoubleType()))

display(df3_pred.groupBy("prediction").count())

text,prediction
🏏 2021 #TheAshes #Cricket Series - First Test in #Australia 🇦🇺 #Brisbane start tomorrow @englandcricket @CricketAus #Ashes #Ashes2021 https://t.co/i8g5rTK1Z3,1.0
No Jimmy means we can enjoy this... #TheAshes #Cricket #Australia #ENGLAND https://t.co/hcn1Zrc9sx,2.0
Australia vs England - Ashes 2021/22 Betting Bet on BetWinner: https://t.co/8J3w0LQ1zL  #SportsBetting #Betting #Cricket #Australia #AUS #England #ENG #Ashes #Ashes2021 #TheGabba #Gabba #AUSvENG #AUSvsENG #BetWinner #Follow https://t.co/KyiTMDgqUP,1.0
Fresh from a victory in the world coin tossing championship #Australia will fancy their chances at home in #TheAshes Of course much will depend on TV lenses keeping a distance and just how much sandpaper can usefully be deployed - gritty stuff #Cricket #tossers #AUSvENG,2.0
This Is a sight England fans will be familiar with! #Ashes #Ashes2021 #CricketTwitter #cricket #england #englandcricket #Australia #cricketaustralia #joeroot #gabba #engvaus https://t.co/xq6lxthkJm,1.0
It’s Ashes week! The wait Is almost over. 🏴󠁧󠁢󠁥󠁮󠁧󠁿 🇦🇺 #Ashes #Ashes2021 #ENGLAND #englandcricket #Cricket #Australia https://t.co/rr0Yg9tyWi,1.0
Ashes 2021-22: Perth no more to host final Test Which stadium will host the last test🤔See here👇 #Ashes #England #Australia #Cricket #TestCricket #TEst #Pakistan #IndvsNZ #PakvsBan #PSL #IPL https://t.co/0jKRgnGN4c,1.0
Ashes 2021-22: England’s captain opens up on Ben Stokes’ return 😳 Read more👇 #Ashes #England #Australia #Cricket #PSL #IPL #BenStokes https://t.co/qtiFHlOTn1,1.0
Golden Opportunity for Big Savings with Golden Ticket. Buy your Golden Ticket for Australia T20 League Now Buy your Ticket Now & Save Big! #Australia #AustraliaT20League #T20League #T20Cricket #Discounts #GoldenTickets #MegaDiscount #Cricket #abpooraIndiaKhelega #MyTeam11 https://t.co/h4CAxpaBBO,1.0
Who's excited for the Ashes? @ICCMediaComms @ESPNcricinfo @CricketAus @cricketcomau @ECB_cricket #Cricket #Ashes #ausvseng #Australia #ENGLAND #joeroot #stevesmith #Warner #anderson #gabba,1.0


prediction,count
0.0,9
1.0,85
2.0,77


In [0]:
## Australia's polarity prediction bar graph##

In [0]:
display(df3_pred.groupBy("prediction").count())

prediction,count
0.0,9
1.0,85
2.0,77


In [0]:
## India's polarity prediction bar graph##

In [0]:
display(df2_pred.groupBy("prediction").count())

prediction,count
0.0,13
1.0,556
2.0,431
