### Big Data Project
### Twitter Sentiment Analysis
by 
* Maryjane Tela
* Joseph Sabaybay

####Table of Contents

1. Import Libraries
2. Load Data
3. Preprocessing
4. Getting Labels
5. Feature Transformer: Tokenizer
6. Feature Transformer: Stopword Removal
7. Feature Transformer: CounterVectorizer(TF - Term Frequency)
8. Feature Transformer: TF-IDF Vectorization
9. Label Encoder
10. Machine Learning

#### 1. Import Libraries

In [0]:
!pip install textblob

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
import pandas as pd
import sklearn
from textblob import TextBlob
from datetime import datetime
import pytz

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
import pyspark.sql.functions as F
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import StringIndexer

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import NGram, VectorAssembler, StopWordsRemover, HashingTF, IDF, Tokenizer, StringIndexer, CountVectorizer, ChiSqSelector, VectorAssembler
from pyspark.ml import Pipeline


#### 2. Load Data

In [0]:
def mount_s3_bucket(access_key, secret_key, bucket_name, mount_folder):
  ACCESS_KEY_ID = access_key
  SECRET_ACCESS_KEY = secret_key
  ENCODED_SECRET_KEY = SECRET_ACCESS_KEY.replace("/", "%2F")

  print ("Mounting", bucket_name)

  try:
    # Unmount the data in case it was already mounted.
    dbutils.fs.unmount("/mnt/%s" % mount_folder)
    
  except:
    # If it fails to unmount it most likely wasn't mounted in the first place
    print ("Directory not unmounted: ", mount_folder)
    
  finally:
    # Lastly, mount our bucket.
    dbutils.fs.mount("s3a://%s:%s@%s" % (ACCESS_KEY_ID, ENCODED_SECRET_KEY, bucket_name), "/mnt/%s" % mount_folder)
    #dbutils.fs.mount("s3a://"+ ACCESS_KEY_ID + ":" + ENCODED_SECRET_KEY + "@" + bucket_name, mount_folder)
    print ("The bucket", bucket_name, "was mounted to", mount_folder, "\n")
    

In [0]:
# Set AWS programmatic access credentials
ACCESS_KEY = "AKIAUPGDD2VT3R******"
SECRET_ACCESS_KEY = "DA3lbPqKSqpEiKKDLIoxDbg6bcLOQ8**********"

In [0]:
mount_s3_bucket(ACCESS_KEY, SECRET_ACCESS_KEY, "weclouddata/twitter/BlackFriday", "BlackFriday")

Mounting weclouddata/twitter/BlackFriday
/mnt/BlackFriday has been unmounted.
The bucket weclouddata/twitter/BlackFriday was mounted to BlackFriday 



In [0]:
%fs ls /mnt/BlackFriday

path,name,size,modificationTime
dbfs:/mnt/BlackFriday/2022/,2022/,0,0


In [0]:
path = 'mnt/topics/BlackFriday/*/*/*/*/*'

In [0]:
# Start Spark session
spark = SparkSession \
        .builder \
        .appName('big_data_project') \
        .getOrCreate()
print('Session created')

sc = spark.sparkContext

Session created


In [0]:
# Create schema
schema = StructType([
    StructField('id', StringType(), True),
    StructField('name', StringType(), True),
    StructField('screen_name', StringType(), True),
    StructField('tweet', StringType(), True),
    StructField('followers_count', IntegerType(), True),
    StructField('location', StringType(), True),
    StructField('geo', StringType(), True),
    StructField('created_at', StringType(), True)
])

In [0]:
# Read the dataframe
df = (spark
     .read
     .option('header', 'false')
     .option('delimiter','\t')
     .schema(schema)
     .csv(path))

In [0]:
# Cache the dataframe for faster iteration
df.cache() 

# Run the count action to materialize the cache
df.count()

Out[79]: 2099030

In [0]:
display(df.take(20))

id,name,screen_name,tweet,followers_count,location,geo,created_at
1596186754348122112,Miabird 🧋,Miabird24,RT @JohnFugelsang: Black Friday - when the .1% tells the 99% to go save the economy they've been looting for the last 11 months.,162,"Martinsburg, WV",,Fri Nov 25 16:59:12 +0000 2022
1596186754620751872,Albert Owusu,AlbertO76240252,RT @Arsenal: 📣 BLACK FRIDAY 🛒 Get up to 50% off on Arsenal Direct 👇,11,,,Fri Nov 25 16:59:12 +0000 2022
1596186754305781761,Barry Masterson,BarryMasterson,"Some grounds around Dublin town. No black Friday sales from me this year, I'm no Jeff Bezos lads, but thank you to… https://t.co/d3RKo7NsMd",5133,Ireland/Internet,,Fri Nov 25 16:59:12 +0000 2022
1596186755702587392,USER117,USER11715,"RT @EndymionVA: YOU WANT TO GET WET FOR THE HOLIDAYS??? Black Friday sale going on NOW! A MEGA 30% OFF ALL WORK!!! Sale ends on Sunday,…",116,,,Fri Nov 25 16:59:12 +0000 2022
1596186755765796865,P1ckm3#6️⃣5️⃣🎹,W1ns4m3,RT @angel_funsized: $50 | 24 hours RT + Follow @CryptoCoinCoach & @NeblioTeam (BE ACTIVE ON PROFILE) Tweet on Timeline: #NEBL NEXT GEM O…,99,,,Fri Nov 25 16:59:12 +0000 2022
1596186755962925058,Harley Jean Matta,HarleyMatta,@jeremyduda Saw the video of the ‘protest.’ We had more people at our house yesterday than is present at the AZ Cap… https://t.co/IqlAmebiK7,242,Arizona,,Fri Nov 25 16:59:12 +0000 2022
1596186756290076672,Thе М̶ȯ̶̶ȯ̶ṉ̲ᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠ,Nomoreboyzz,RT @RhyheimX: 🖤Black Friday🖤 @phatrabbitkill2 @RhyheimX 🎥@AlternativaX_ https://t.co/mEx5IDo35G,88,Myanmar,,Fri Nov 25 16:59:13 +0000 2022
1596186756382089216,Missouri Workers Center,moworkerscenter,"RT @AfricaNowOnline: On Black Friday, #AmazonWorkers in 40+ Countries Strike and Protest 'Despicable' Treatment https://t.co/16avT8lnrt via…",450,"St Louis, MO",,Fri Nov 25 16:59:13 +0000 2022
1596186755379924992,"MagickalMoonDesigns, LLC",shopmagickal,"@ellinainthesky Hi I’m Toni💖 I graphic design tshirts, sweatshirts and also make custom designs😈 My whole shop is… https://t.co/obGvogYGNQ",3949,"Valley Center, CA",,Fri Nov 25 16:59:12 +0000 2022
1596186756520546304,azariel🪶,azarielwrites,RT @freydis_moon: BLACK FRIDAY SALE— ✨ Paperbacks & Stickers: 15% off ✨ Poetry & Tarot: 10% off 📚: https://t.co/U4IMz1wjH0 https://t.co/p…,294,,,Fri Nov 25 16:59:13 +0000 2022


In [0]:
# Mount to my bucket
mount_s3_bucket(ACCESS_KEY, SECRET_ACCESS_KEY, 'b16-mtela/big_data/BlackFriday','my_bucket')

Mounting b16-mtela/big_data/BlackFriday
/mnt/my_bucket has been unmounted.
The bucket b16-mtela/big_data/BlackFriday was mounted to my_bucket 



In [0]:
# Save csv file
(df
.write
.option('header','false')
.option('delimiter','\t')
.mode("overwrite")               # Replace existing files
.csv('/mnt/my_bucket/BlackFriday.csv'))

#### 3. Preprocessing

* Cleaning the 'tweet' column by 
1. Removing URLs
2. Removing special characters
3. Substituting multiple spaces with single space
4. Changing all text to lowercase
5. Trimming the leading/trailing whitespaces

In [0]:
df_clean = df.withColumn('tweet', F.regexp_replace('tweet', r"http\S+", "")) \
                    .withColumn('tweet', F.regexp_replace('tweet', r"[^a-zA-Z]", " ")) \
                    .withColumn('tweet', F.regexp_replace('tweet', r"\s+", " ")) \
                    .withColumn('tweet', F.lower('tweet')) \
                    .withColumn('tweet', F.trim('tweet')) 
display(df_clean.take(20))

id,name,screen_name,tweet,followers_count,location,geo,created_at
1596186754348122112,Miabird 🧋,Miabird24,rt johnfugelsang black friday when the tells the to go save the economy they ve been looting for the last months,162,"Martinsburg, WV",,Fri Nov 25 16:59:12 +0000 2022
1596186754620751872,Albert Owusu,AlbertO76240252,rt arsenal black friday get up to off on arsenal direct,11,,,Fri Nov 25 16:59:12 +0000 2022
1596186754305781761,Barry Masterson,BarryMasterson,some grounds around dublin town no black friday sales from me this year i m no jeff bezos lads but thank you to,5133,Ireland/Internet,,Fri Nov 25 16:59:12 +0000 2022
1596186755702587392,USER117,USER11715,rt endymionva you want to get wet for the holidays black friday sale going on now a mega off all work sale ends on sunday,116,,,Fri Nov 25 16:59:12 +0000 2022
1596186755765796865,P1ckm3#6️⃣5️⃣🎹,W1ns4m3,rt angel funsized hours rt follow cryptocoincoach amp neblioteam be active on profile tweet on timeline nebl next gem o,99,,,Fri Nov 25 16:59:12 +0000 2022
1596186755962925058,Harley Jean Matta,HarleyMatta,jeremyduda saw the video of the protest we had more people at our house yesterday than is present at the az cap,242,Arizona,,Fri Nov 25 16:59:12 +0000 2022
1596186756290076672,Thе М̶ȯ̶̶ȯ̶ṉ̲ᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠ,Nomoreboyzz,rt rhyheimx black friday phatrabbitkill rhyheimx alternativax,88,Myanmar,,Fri Nov 25 16:59:13 +0000 2022
1596186756382089216,Missouri Workers Center,moworkerscenter,rt africanowonline on black friday amazonworkers in countries strike and protest despicable treatment via,450,"St Louis, MO",,Fri Nov 25 16:59:13 +0000 2022
1596186755379924992,"MagickalMoonDesigns, LLC",shopmagickal,ellinainthesky hi i m toni i graphic design tshirts sweatshirts and also make custom designs my whole shop is,3949,"Valley Center, CA",,Fri Nov 25 16:59:12 +0000 2022
1596186756520546304,azariel🪶,azarielwrites,rt freydis moon black friday sale paperbacks amp stickers off poetry amp tarot off,294,,,Fri Nov 25 16:59:13 +0000 2022


In [0]:
# Check for missing values
df_clean.select([F.count(F.when(F.isnan(c),c)).alias(c) for c in df_clean.columns]).toPandas().head()

Unnamed: 0,id,name,screen_name,tweet,followers_count,location,geo,created_at
0,0,24,0,0,0,1,0,0


In [0]:
# Drop rows with null values
df_drop = df_clean.na.drop(how='any')

In [0]:
# Get the count after dropping empty rows
df_drop.count()

Out[86]: 2091006

In [0]:
# Check, drop duplicates and get the count
df_duplicate = df_drop.dropDuplicates()
df_duplicate.count()

Out[87]: 2090963

Further data cleaning in 'created_at' column for data visualization.

In [0]:
# Remove special character in 'created_at' column
# df_duplicate.withColumn('created_at', F.regexp_replace('created_at', r"[^a-zA-Z]", " "))

Out[88]: DataFrame[id: string, name: string, screen_name: string, tweet: string, followers_count: int, location: string, geo: string, created_at: string]

In [0]:
# # Convert to date string format
# def getDate(x):
#     if x is not None:
#         return str(datetime.strptime(x,'%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo=pytz.UTC).strftime("%Y-%m-%d %H:%M:%S"))
#     else:
#         return None

# # UDF declaration
# date_fn = udf(getDate, StringType())

# # Converting datatype in spark dataframe
# df_date = df_duplicate.withColumn("created_at", F.to_utc_timestamp(date_fn("created_at"),"UTC")) 

In [0]:
# Drop rows with null values
# df_drop = df_date.na.drop(how='any')

In [0]:
# display(df_date.take(20))

id,name,screen_name,tweet,followers_count,location,geo,created_at
1596186755379924992,"MagickalMoonDesigns, LLC",shopmagickal,ellinainthesky hi i m toni i graphic design tshirts sweatshirts and also make custom designs my whole shop is,3949,"Valley Center, CA",,2022-11-25T16:59:12.000+0000
1596186757598707712,Mohanzm,_Mohanzm,rt vocabularycom funfriday word caboodle on this black friday are you shopping for a caboodle of presents why not learn a caboodle of,62,"Puchong, Selangor",,2022-11-25T16:59:13.000+0000
1596186756290076672,Thе М̶ȯ̶̶ȯ̶ṉ̲ᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠᅠ,Nomoreboyzz,rt rhyheimx black friday phatrabbitkill rhyheimx alternativax,88,Myanmar,,2022-11-25T16:59:13.000+0000
1596186756507873280,🙉,dahyunstanacc,just bought my new pc ty black friday,309,he/him 25,,2022-11-25T16:59:13.000+0000
1596186756785000448,Kingy,EFCKingy,currysblackfriyay yes please,75,Merseyside,,2022-11-25T16:59:13.000+0000
1596186756520546304,azariel🪶,azarielwrites,rt freydis moon black friday sale paperbacks amp stickers off poetry amp tarot off,294,,,2022-11-25T16:59:13.000+0000
1596186757040508929,𝙽𝚊'𝙸𝚖𝚊𝚑🫚,fedolive,rt tarotbybronx happy black friday my shop is restocked with abundance love oils and more for a human design,511,cali,,2022-11-25T16:59:13.000+0000
1596186754620751872,Albert Owusu,AlbertO76240252,rt arsenal black friday get up to off on arsenal direct,11,,,2022-11-25T16:59:12.000+0000
1596186757191581696,Grubb,JeffGrubb,rt dailydelivery black friday deal up to off sandisk microsdxc cards licensed for nintendo switch rated,101501,Cleveland,,2022-11-25T16:59:13.000+0000
1596186756382089216,Missouri Workers Center,moworkerscenter,rt africanowonline on black friday amazonworkers in countries strike and protest despicable treatment via,450,"St Louis, MO",,2022-11-25T16:59:13.000+0000


In [0]:
# Add column for hour
# df_date = df_date.withColumn('hour',F.hour(df_date.created_at))

In [0]:
# Add column for day of week
# df_date = df_date.withColumn('day_of_week',F.dayofweek(df_date.created_at))

In [0]:
# Add column for month
# df_date = df_date.withColumn('month',F.month(df_date.created_at))

In [0]:
# Add column for day of month
# df_date = df_date.withColumn('day_of_month',F.dayofmonth(df_date.created_at))

In [0]:
# display(df_date.take(10))

id,name,screen_name,tweet,followers_count,location,geo,created_at,hour,day_of_week,month,day_of_month
1596186874502053888,Luke McClellan,mcclellan_luke,black friday shopping links gt gt gt colbycrudenn joeburrrrr joshglaser noluckneal,429,"New Albany, OH",,2022-11-25T16:59:41.000+0000,16,6,11,25
1596186922812346368,Dr. Cosmin Buta,cosminache23,rt not okaybears we arrrrre stuffed bears let s start off black friday with something light amp simple have notifications on we re goin,1400,Iasi/Romania,,2022-11-25T16:59:52.000+0000,16,6,11,25
1596186961689067521,Geek Tech - High Tech News,GeekTech_App_EN,to mac happy hour iphone rumors homekit secure video apple black friday via,156,,,2022-11-25T17:00:02.000+0000,17,6,11,25
1596187031285235712,RWDFWD,rwdfwd,rt bangerjonny black friday you don t see supermarkets discounting their food by,2823,Bristol,,2022-11-25T17:00:18.000+0000,17,6,11,25
1596187070086668288,Vanny,CarlosArmendr14,rt jennvandamsel black friday on my onlyfans off my onlyfans subs making it only for the weekend i just posted some jv,138,Mizuna Shione,,2022-11-25T17:00:27.000+0000,17,6,11,25
1596187077304938496,Joselito,TruestSelf14,rt pulte black friday sale my money off,65,,,2022-11-25T17:00:29.000+0000,17,6,11,25
1596187080878878721,Eric cavanaugh,CavanaughEric,black friday shopping links gt gt gt hauptman jeff yasminlane hoppyq devineden,53,Omaha Nebraska,,2022-11-25T17:00:30.000+0000,17,6,11,25
1596187149292171265,callummsweeney,callummsweeney,rt futsheriff black friday promo is live now k fifa points giveaway for you to enter rt follow me iconsquadgaming,42,,,2022-11-25T17:00:46.000+0000,17,6,11,25
1596187154111430656,Peter,The_Only_One_01,rt skylersquirt we both got our black friday deal in early can t wait for the next deal tonyropebbc cataliavalentin blackfrida,775,Nimmerland,,2022-11-25T17:00:47.000+0000,17,6,11,25
1596187180627533828,FredoInDaCut 📍,FredoInDaCut__,rt millions we re kicking off black friday with a giveaway rt and follow millions for your chance to win where will you be shopp,371,"New York, USA",,2022-11-25T17:00:54.000+0000,17,6,11,25


In [0]:
# Check the schema of the converted and added columns
# df_date.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- screen_name: string (nullable = true)
 |-- tweet: string (nullable = true)
 |-- followers_count: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- geo: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day_of_month: integer (nullable = true)



In [0]:
# Drop unwanted features. The sentiment column will be dropped and will be re-added as issues were encountered when using the dataframe with further data cleaning in 'created_at' column.  
df_tweets = df_duplicate.select('tweet')
display(df_tweets)

tweet
black friday shopping links gt gt gt colbycrudenn joeburrrrr joshglaser noluckneal
rt not okaybears we arrrrre stuffed bears let s start off black friday with something light amp simple have notifications on we re goin
to mac happy hour iphone rumors homekit secure video apple black friday via
rt sexilexitrap happy black friday got a deal going on if you check me out today only
rt metavarce it s so thoughtful that tech stocks have been preparing their prices all year for black friday
rt your fut card giveaway day of our black friday week giveaways fifa points paypal cash discount code to
rt say blk these black friday sales kinda weak
rt rhyheimx black friday phatrabbitkill rhyheimx alternativax
would be awesome tradingview and crypto ed nl
rt chiefsaholic fuck your black friday it s red friday bitch chiefskingdom


In [0]:
# Cache the dataframe for faster iteration
df_tweets.cache() 

# Run the count action to materialize the cache
df_tweets.count()

Out[113]: 2090963

#### 4. Getting Labels

In [0]:
# Using TextBlob to assign the labels for the visualization
def get_sentiment(tweet):
    blob = TextBlob(tweet)
    sentiment = blob.sentiment.polarity
    if sentiment >= 0:
        return 'positive'
    else:
        return 'negative'

In [0]:
# Add the sentiment column
get_sentiment_udf = udf(get_sentiment, StringType())
df_label = df_tweets.withColumn("sentiment", get_sentiment_udf("tweet"))

display(df_label.take(20))

tweet,sentiment
black friday shopping links gt gt gt colbycrudenn joeburrrrr joshglaser noluckneal,negative
rt not okaybears we arrrrre stuffed bears let s start off black friday with something light amp simple have notifications on we re goin,positive
to mac happy hour iphone rumors homekit secure video apple black friday via,positive
rt bangerjonny black friday you don t see supermarkets discounting their food by,negative
rt jennvandamsel black friday on my onlyfans off my onlyfans subs making it only for the weekend i just posted some jv,negative
rt pulte black friday sale my money off,negative
black friday shopping links gt gt gt hauptman jeff yasminlane hoppyq devineden,negative
rt futsheriff black friday promo is live now k fifa points giveaway for you to enter rt follow me iconsquadgaming,negative
rt skylersquirt we both got our black friday deal in early can t wait for the next deal tonyropebbc cataliavalentin blackfrida,negative
rt millions we re kicking off black friday with a giveaway rt and follow millions for your chance to win where will you be shopp,positive


#### 5. Feature Transformer: Tokenizer

In [0]:
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
tweets_tokenized = tokenizer.transform(df_label)

display(tweets_tokenized.take(10))

tweet,sentiment,tokens
black friday shopping links gt gt gt colbycrudenn joeburrrrr joshglaser noluckneal,negative,"List(black, friday, shopping, links, gt, gt, gt, colbycrudenn, joeburrrrr, joshglaser, noluckneal)"
rt not okaybears we arrrrre stuffed bears let s start off black friday with something light amp simple have notifications on we re goin,positive,"List(rt, not, okaybears, we, arrrrre, stuffed, bears, let, s, start, off, black, friday, with, something, light, amp, simple, have, notifications, on, we, re, goin)"
to mac happy hour iphone rumors homekit secure video apple black friday via,positive,"List(to, mac, happy, hour, iphone, rumors, homekit, secure, video, apple, black, friday, via)"
rt bangerjonny black friday you don t see supermarkets discounting their food by,negative,"List(rt, bangerjonny, black, friday, you, don, t, see, supermarkets, discounting, their, food, by)"
rt jennvandamsel black friday on my onlyfans off my onlyfans subs making it only for the weekend i just posted some jv,negative,"List(rt, jennvandamsel, black, friday, on, my, onlyfans, off, my, onlyfans, subs, making, it, only, for, the, weekend, i, just, posted, some, jv)"
rt pulte black friday sale my money off,negative,"List(rt, pulte, black, friday, sale, my, money, off)"
black friday shopping links gt gt gt hauptman jeff yasminlane hoppyq devineden,negative,"List(black, friday, shopping, links, gt, gt, gt, hauptman, jeff, yasminlane, hoppyq, devineden)"
rt futsheriff black friday promo is live now k fifa points giveaway for you to enter rt follow me iconsquadgaming,negative,"List(rt, futsheriff, black, friday, promo, is, live, now, k, fifa, points, giveaway, for, you, to, enter, rt, follow, me, iconsquadgaming)"
rt skylersquirt we both got our black friday deal in early can t wait for the next deal tonyropebbc cataliavalentin blackfrida,negative,"List(rt, skylersquirt, we, both, got, our, black, friday, deal, in, early, can, t, wait, for, the, next, deal, tonyropebbc, cataliavalentin, blackfrida)"
rt millions we re kicking off black friday with a giveaway rt and follow millions for your chance to win where will you be shopp,positive,"List(rt, millions, we, re, kicking, off, black, friday, with, a, giveaway, rt, and, follow, millions, for, your, chance, to, win, where, will, you, be, shopp)"


#### 6. Feature Transformer: Stopword Removal

In [0]:
#Remove stopwords from the review(list of words) 
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
tweets_stopword = stopword_remover.transform(tweets_tokenized)

display(tweets_stopword.take(10))

tweet,sentiment,tokens,filtered
black friday shopping links gt gt gt colbycrudenn joeburrrrr joshglaser noluckneal,negative,"List(black, friday, shopping, links, gt, gt, gt, colbycrudenn, joeburrrrr, joshglaser, noluckneal)","List(black, friday, shopping, links, gt, gt, gt, colbycrudenn, joeburrrrr, joshglaser, noluckneal)"
rt not okaybears we arrrrre stuffed bears let s start off black friday with something light amp simple have notifications on we re goin,positive,"List(rt, not, okaybears, we, arrrrre, stuffed, bears, let, s, start, off, black, friday, with, something, light, amp, simple, have, notifications, on, we, re, goin)","List(rt, okaybears, arrrrre, stuffed, bears, let, start, black, friday, something, light, amp, simple, notifications, re, goin)"
to mac happy hour iphone rumors homekit secure video apple black friday via,positive,"List(to, mac, happy, hour, iphone, rumors, homekit, secure, video, apple, black, friday, via)","List(mac, happy, hour, iphone, rumors, homekit, secure, video, apple, black, friday, via)"
rt bangerjonny black friday you don t see supermarkets discounting their food by,negative,"List(rt, bangerjonny, black, friday, you, don, t, see, supermarkets, discounting, their, food, by)","List(rt, bangerjonny, black, friday, see, supermarkets, discounting, food)"
rt jennvandamsel black friday on my onlyfans off my onlyfans subs making it only for the weekend i just posted some jv,negative,"List(rt, jennvandamsel, black, friday, on, my, onlyfans, off, my, onlyfans, subs, making, it, only, for, the, weekend, i, just, posted, some, jv)","List(rt, jennvandamsel, black, friday, onlyfans, onlyfans, subs, making, weekend, posted, jv)"
rt pulte black friday sale my money off,negative,"List(rt, pulte, black, friday, sale, my, money, off)","List(rt, pulte, black, friday, sale, money)"
black friday shopping links gt gt gt hauptman jeff yasminlane hoppyq devineden,negative,"List(black, friday, shopping, links, gt, gt, gt, hauptman, jeff, yasminlane, hoppyq, devineden)","List(black, friday, shopping, links, gt, gt, gt, hauptman, jeff, yasminlane, hoppyq, devineden)"
rt futsheriff black friday promo is live now k fifa points giveaway for you to enter rt follow me iconsquadgaming,negative,"List(rt, futsheriff, black, friday, promo, is, live, now, k, fifa, points, giveaway, for, you, to, enter, rt, follow, me, iconsquadgaming)","List(rt, futsheriff, black, friday, promo, live, k, fifa, points, giveaway, enter, rt, follow, iconsquadgaming)"
rt skylersquirt we both got our black friday deal in early can t wait for the next deal tonyropebbc cataliavalentin blackfrida,negative,"List(rt, skylersquirt, we, both, got, our, black, friday, deal, in, early, can, t, wait, for, the, next, deal, tonyropebbc, cataliavalentin, blackfrida)","List(rt, skylersquirt, got, black, friday, deal, early, wait, next, deal, tonyropebbc, cataliavalentin, blackfrida)"
rt millions we re kicking off black friday with a giveaway rt and follow millions for your chance to win where will you be shopp,positive,"List(rt, millions, we, re, kicking, off, black, friday, with, a, giveaway, rt, and, follow, millions, for, your, chance, to, win, where, will, you, be, shopp)","List(rt, millions, re, kicking, black, friday, giveaway, rt, follow, millions, chance, win, shopp)"


#### 7. Feature Transformer: CountVectorizer (TF-Term Frequency)

In [0]:
cv = CountVectorizer(vocabSize=2**16, inputCol="filtered", outputCol='cv')
cv_model = cv.fit(tweets_stopword)
tweets_cv = cv_model.transform(tweets_stopword)

display(tweets_cv.take(10))

tweet,sentiment,tokens,filtered,cv
black friday shopping links gt gt gt colbycrudenn joeburrrrr joshglaser noluckneal,negative,"List(black, friday, shopping, links, gt, gt, gt, colbycrudenn, joeburrrrr, joshglaser, noluckneal)","List(black, friday, shopping, links, gt, gt, gt, colbycrudenn, joeburrrrr, joshglaser, noluckneal)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220), values -> List(1.0, 1.0, 1.0, 3.0, 1.0))"
rt not okaybears we arrrrre stuffed bears let s start off black friday with something light amp simple have notifications on we re goin,positive,"List(rt, not, okaybears, we, arrrrre, stuffed, bears, let, s, start, off, black, friday, with, something, light, amp, simple, have, notifications, on, we, re, goin)","List(rt, okaybears, arrrrre, stuffed, bears, let, start, black, friday, something, light, amp, simple, notifications, re, goin)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 4, 14, 109, 139, 276, 538, 1113, 1174, 4194, 4580, 4893, 7227, 36742), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
to mac happy hour iphone rumors homekit secure video apple black friday via,positive,"List(to, mac, happy, hour, iphone, rumors, homekit, secure, video, apple, black, friday, via)","List(mac, happy, hour, iphone, rumors, homekit, secure, video, apple, black, friday, via)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 31, 82, 284, 305, 615, 702, 1612, 1741, 9375, 12250), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
rt bangerjonny black friday you don t see supermarkets discounting their food by,negative,"List(rt, bangerjonny, black, friday, you, don, t, see, supermarkets, discounting, their, food, by)","List(rt, bangerjonny, black, friday, see, supermarkets, discounting, food)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 68, 783, 6871, 17529, 29746), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
rt jennvandamsel black friday on my onlyfans off my onlyfans subs making it only for the weekend i just posted some jv,negative,"List(rt, jennvandamsel, black, friday, on, my, onlyfans, off, my, onlyfans, subs, making, it, only, for, the, weekend, i, just, posted, some, jv)","List(rt, jennvandamsel, black, friday, onlyfans, onlyfans, subs, making, weekend, posted, jv)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 29, 38, 225, 671, 680, 12971, 14948), values -> List(1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
rt pulte black friday sale my money off,negative,"List(rt, pulte, black, friday, sale, my, money, off)","List(rt, pulte, black, friday, sale, money)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 3, 150, 1439), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
black friday shopping links gt gt gt hauptman jeff yasminlane hoppyq devineden,negative,"List(black, friday, shopping, links, gt, gt, gt, hauptman, jeff, yasminlane, hoppyq, devineden)","List(black, friday, shopping, links, gt, gt, gt, hauptman, jeff, yasminlane, hoppyq, devineden)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220, 3325), values -> List(1.0, 1.0, 1.0, 3.0, 1.0, 1.0))"
rt futsheriff black friday promo is live now k fifa points giveaway for you to enter rt follow me iconsquadgaming,negative,"List(rt, futsheriff, black, friday, promo, is, live, now, k, fifa, points, giveaway, for, you, to, enter, rt, follow, me, iconsquadgaming)","List(rt, futsheriff, black, friday, promo, live, k, fifa, points, giveaway, enter, rt, follow, iconsquadgaming)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 21, 52, 80, 90, 115, 151, 154, 204, 586), values -> List(1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
rt skylersquirt we both got our black friday deal in early can t wait for the next deal tonyropebbc cataliavalentin blackfrida,negative,"List(rt, skylersquirt, we, both, got, our, black, friday, deal, in, early, can, t, wait, for, the, next, deal, tonyropebbc, cataliavalentin, blackfrida)","List(rt, skylersquirt, got, black, friday, deal, early, wait, next, deal, tonyropebbc, cataliavalentin, blackfrida)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 10, 22, 77, 187, 250, 3744, 32308, 32662, 34433), values -> List(1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
rt millions we re kicking off black friday with a giveaway rt and follow millions for your chance to win where will you be shopp,positive,"List(rt, millions, we, re, kicking, off, black, friday, with, a, giveaway, rt, and, follow, millions, for, your, chance, to, win, where, will, you, be, shopp)","List(rt, millions, re, kicking, black, friday, giveaway, rt, follow, millions, chance, win, shopp)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 14, 21, 46, 76, 562, 909, 1105), values -> List(1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0))"


#### 8. Feature Transformer: TF-IDF Vectorization

In [0]:
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
idf_model = idf.fit(tweets_cv)
tweets_idf = idf_model.transform(tweets_cv)

display(tweets_idf.take(10))

tweet,sentiment,tokens,filtered,cv,features
black friday shopping links gt gt gt colbycrudenn joeburrrrr joshglaser noluckneal,negative,"List(black, friday, shopping, links, gt, gt, gt, colbycrudenn, joeburrrrr, joshglaser, noluckneal)","List(black, friday, shopping, links, gt, gt, gt, colbycrudenn, joeburrrrr, joshglaser, noluckneal)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220), values -> List(1.0, 1.0, 1.0, 3.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220), values -> List(0.22124205159810534, 0.2311642913994614, 3.465898520446778, 14.88056872704463, 5.057541262266511))"
rt not okaybears we arrrrre stuffed bears let s start off black friday with something light amp simple have notifications on we re goin,positive,"List(rt, not, okaybears, we, arrrrre, stuffed, bears, let, s, start, off, black, friday, with, something, light, amp, simple, have, notifications, on, we, re, goin)","List(rt, okaybears, arrrrre, stuffed, bears, let, start, black, friday, something, light, amp, simple, notifications, re, goin)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 4, 14, 109, 139, 276, 538, 1113, 1174, 4194, 4580, 4893, 7227, 36742), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 4, 14, 109, 139, 276, 538, 1113, 1174, 4194, 4580, 4893, 7227, 36742), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 2.4724222782541503, 3.0758996523915516, 4.37874359437121, 4.559259209355999, 5.355374171850485, 5.92386466679822, 6.8683518180970236, 6.904395972663566, 8.51526484169767, 8.647773913565239, 8.751017386242745, 9.765644018837762, 11.845085560517598))"
to mac happy hour iphone rumors homekit secure video apple black friday via,positive,"List(to, mac, happy, hour, iphone, rumors, homekit, secure, video, apple, black, friday, via)","List(mac, happy, hour, iphone, rumors, homekit, secure, video, apple, black, friday, via)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 31, 82, 284, 305, 615, 702, 1612, 1741, 9375, 12250), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 31, 82, 284, 305, 615, 702, 1612, 1741, 9375, 12250), values -> List(0.22124205159810534, 0.2311642913994614, 3.4507381991002064, 4.210522689482902, 5.246122502383364, 5.402333102359822, 6.09463156655225, 6.242720767431516, 7.301790778247594, 7.411890639269317, 9.693323357258135, 10.087227642965225))"
rt bangerjonny black friday you don t see supermarkets discounting their food by,negative,"List(rt, bangerjonny, black, friday, you, don, t, see, supermarkets, discounting, their, food, by)","List(rt, bangerjonny, black, friday, see, supermarkets, discounting, food)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 68, 783, 6871, 17529, 29746), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 68, 783, 6871, 17529, 29746), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 4.072473204776156, 6.3926175141423025, 9.220416968354439, 10.681934750711918, 11.557403488065816))"
rt jennvandamsel black friday on my onlyfans off my onlyfans subs making it only for the weekend i just posted some jv,negative,"List(rt, jennvandamsel, black, friday, on, my, onlyfans, off, my, onlyfans, subs, making, it, only, for, the, weekend, i, just, posted, some, jv)","List(rt, jennvandamsel, black, friday, onlyfans, onlyfans, subs, making, weekend, posted, jv)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 29, 38, 225, 671, 680, 12971, 14948), values -> List(1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 29, 38, 225, 671, 680, 12971, 14948), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 6.902019212696891, 3.593021786415995, 5.185193529140135, 6.181199582860709, 6.2048347066858645, 10.235647648083498, 10.426001376574716))"
rt pulte black friday sale my money off,negative,"List(rt, pulte, black, friday, sale, my, money, off)","List(rt, pulte, black, friday, sale, money)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 3, 150, 1439), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 3, 150, 1439), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 1.61228227050308, 4.694279533229274, 7.165426522538767))"
black friday shopping links gt gt gt hauptman jeff yasminlane hoppyq devineden,negative,"List(black, friday, shopping, links, gt, gt, gt, hauptman, jeff, yasminlane, hoppyq, devineden)","List(black, friday, shopping, links, gt, gt, gt, hauptman, jeff, yasminlane, hoppyq, devineden)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220, 3325), values -> List(1.0, 1.0, 1.0, 3.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220, 3325), values -> List(0.22124205159810534, 0.2311642913994614, 3.465898520446778, 14.88056872704463, 5.057541262266511, 8.205746551963797))"
rt futsheriff black friday promo is live now k fifa points giveaway for you to enter rt follow me iconsquadgaming,negative,"List(rt, futsheriff, black, friday, promo, is, live, now, k, fifa, points, giveaway, for, you, to, enter, rt, follow, me, iconsquadgaming)","List(rt, futsheriff, black, friday, promo, live, k, fifa, points, giveaway, enter, rt, follow, iconsquadgaming)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 21, 52, 80, 90, 115, 151, 154, 204, 586), values -> List(1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 21, 52, 80, 90, 115, 151, 154, 204, 586), values -> List(0.22124205159810534, 0.2311642913994614, 0.672298750974662, 2.568788922878726, 3.169351340972809, 3.8368308856858007, 4.197235903868348, 4.36079182640413, 4.427865583408812, 4.70078362119845, 4.675992184423282, 5.292577673483008, 6.02817283293921))"
rt skylersquirt we both got our black friday deal in early can t wait for the next deal tonyropebbc cataliavalentin blackfrida,negative,"List(rt, skylersquirt, we, both, got, our, black, friday, deal, in, early, can, t, wait, for, the, next, deal, tonyropebbc, cataliavalentin, blackfrida)","List(rt, skylersquirt, got, black, friday, deal, early, wait, next, deal, tonyropebbc, cataliavalentin, blackfrida)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 10, 22, 77, 187, 250, 3744, 32308, 32662, 34433), values -> List(1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 10, 22, 77, 187, 250, 3744, 32308, 32662, 34433), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 3.0172069148846163, 6.335814627639553, 4.208783782623654, 4.88524396833713, 5.187331287019553, 8.36282035576666, 11.662764003723643, 11.662764003723643, 11.780547039380027))"
rt millions we re kicking off black friday with a giveaway rt and follow millions for your chance to win where will you be shopp,positive,"List(rt, millions, we, re, kicking, off, black, friday, with, a, giveaway, rt, and, follow, millions, for, your, chance, to, win, where, will, you, be, shopp)","List(rt, millions, re, kicking, black, friday, giveaway, rt, follow, millions, chance, win, shopp)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 14, 21, 46, 76, 562, 909, 1105), values -> List(1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 14, 21, 46, 76, 562, 909, 1105), values -> List(0.22124205159810534, 0.2311642913994614, 0.672298750974662, 2.568788922878726, 3.0758996523915516, 3.169351340972809, 3.776975042365248, 4.152424675735975, 12.908985835720781, 6.572086001953851, 6.821643732374124))"


#### 9. Label Encoder

In [0]:
label_encoder = StringIndexer(inputCol = "sentiment", outputCol = "label")
le_model = label_encoder.fit(tweets_idf)
tweets_label = le_model.transform(tweets_idf)

display(tweets_label.take(10))

tweet,sentiment,tokens,filtered,cv,features,label
black friday shopping links gt gt gt colbycrudenn joeburrrrr joshglaser noluckneal,negative,"List(black, friday, shopping, links, gt, gt, gt, colbycrudenn, joeburrrrr, joshglaser, noluckneal)","List(black, friday, shopping, links, gt, gt, gt, colbycrudenn, joeburrrrr, joshglaser, noluckneal)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220), values -> List(1.0, 1.0, 1.0, 3.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220), values -> List(0.22124205159810534, 0.2311642913994614, 3.465898520446778, 14.88056872704463, 5.057541262266511))",0.0
rt not okaybears we arrrrre stuffed bears let s start off black friday with something light amp simple have notifications on we re goin,positive,"List(rt, not, okaybears, we, arrrrre, stuffed, bears, let, s, start, off, black, friday, with, something, light, amp, simple, have, notifications, on, we, re, goin)","List(rt, okaybears, arrrrre, stuffed, bears, let, start, black, friday, something, light, amp, simple, notifications, re, goin)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 4, 14, 109, 139, 276, 538, 1113, 1174, 4194, 4580, 4893, 7227, 36742), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 4, 14, 109, 139, 276, 538, 1113, 1174, 4194, 4580, 4893, 7227, 36742), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 2.4724222782541503, 3.0758996523915516, 4.37874359437121, 4.559259209355999, 5.355374171850485, 5.92386466679822, 6.8683518180970236, 6.904395972663566, 8.51526484169767, 8.647773913565239, 8.751017386242745, 9.765644018837762, 11.845085560517598))",1.0
to mac happy hour iphone rumors homekit secure video apple black friday via,positive,"List(to, mac, happy, hour, iphone, rumors, homekit, secure, video, apple, black, friday, via)","List(mac, happy, hour, iphone, rumors, homekit, secure, video, apple, black, friday, via)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 31, 82, 284, 305, 615, 702, 1612, 1741, 9375, 12250), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 31, 82, 284, 305, 615, 702, 1612, 1741, 9375, 12250), values -> List(0.22124205159810534, 0.2311642913994614, 3.4507381991002064, 4.210522689482902, 5.246122502383364, 5.402333102359822, 6.09463156655225, 6.242720767431516, 7.301790778247594, 7.411890639269317, 9.693323357258135, 10.087227642965225))",1.0
rt bangerjonny black friday you don t see supermarkets discounting their food by,negative,"List(rt, bangerjonny, black, friday, you, don, t, see, supermarkets, discounting, their, food, by)","List(rt, bangerjonny, black, friday, see, supermarkets, discounting, food)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 68, 783, 6871, 17529, 29746), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 68, 783, 6871, 17529, 29746), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 4.072473204776156, 6.3926175141423025, 9.220416968354439, 10.681934750711918, 11.557403488065816))",0.0
rt jennvandamsel black friday on my onlyfans off my onlyfans subs making it only for the weekend i just posted some jv,negative,"List(rt, jennvandamsel, black, friday, on, my, onlyfans, off, my, onlyfans, subs, making, it, only, for, the, weekend, i, just, posted, some, jv)","List(rt, jennvandamsel, black, friday, onlyfans, onlyfans, subs, making, weekend, posted, jv)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 29, 38, 225, 671, 680, 12971, 14948), values -> List(1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 29, 38, 225, 671, 680, 12971, 14948), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 6.902019212696891, 3.593021786415995, 5.185193529140135, 6.181199582860709, 6.2048347066858645, 10.235647648083498, 10.426001376574716))",0.0
rt pulte black friday sale my money off,negative,"List(rt, pulte, black, friday, sale, my, money, off)","List(rt, pulte, black, friday, sale, money)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 3, 150, 1439), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 3, 150, 1439), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 1.61228227050308, 4.694279533229274, 7.165426522538767))",0.0
black friday shopping links gt gt gt hauptman jeff yasminlane hoppyq devineden,negative,"List(black, friday, shopping, links, gt, gt, gt, hauptman, jeff, yasminlane, hoppyq, devineden)","List(black, friday, shopping, links, gt, gt, gt, hauptman, jeff, yasminlane, hoppyq, devineden)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220, 3325), values -> List(1.0, 1.0, 1.0, 3.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220, 3325), values -> List(0.22124205159810534, 0.2311642913994614, 3.465898520446778, 14.88056872704463, 5.057541262266511, 8.205746551963797))",0.0
rt futsheriff black friday promo is live now k fifa points giveaway for you to enter rt follow me iconsquadgaming,negative,"List(rt, futsheriff, black, friday, promo, is, live, now, k, fifa, points, giveaway, for, you, to, enter, rt, follow, me, iconsquadgaming)","List(rt, futsheriff, black, friday, promo, live, k, fifa, points, giveaway, enter, rt, follow, iconsquadgaming)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 21, 52, 80, 90, 115, 151, 154, 204, 586), values -> List(1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 21, 52, 80, 90, 115, 151, 154, 204, 586), values -> List(0.22124205159810534, 0.2311642913994614, 0.672298750974662, 2.568788922878726, 3.169351340972809, 3.8368308856858007, 4.197235903868348, 4.36079182640413, 4.427865583408812, 4.70078362119845, 4.675992184423282, 5.292577673483008, 6.02817283293921))",0.0
rt skylersquirt we both got our black friday deal in early can t wait for the next deal tonyropebbc cataliavalentin blackfrida,negative,"List(rt, skylersquirt, we, both, got, our, black, friday, deal, in, early, can, t, wait, for, the, next, deal, tonyropebbc, cataliavalentin, blackfrida)","List(rt, skylersquirt, got, black, friday, deal, early, wait, next, deal, tonyropebbc, cataliavalentin, blackfrida)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 10, 22, 77, 187, 250, 3744, 32308, 32662, 34433), values -> List(1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 10, 22, 77, 187, 250, 3744, 32308, 32662, 34433), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 3.0172069148846163, 6.335814627639553, 4.208783782623654, 4.88524396833713, 5.187331287019553, 8.36282035576666, 11.662764003723643, 11.662764003723643, 11.780547039380027))",0.0
rt millions we re kicking off black friday with a giveaway rt and follow millions for your chance to win where will you be shopp,positive,"List(rt, millions, we, re, kicking, off, black, friday, with, a, giveaway, rt, and, follow, millions, for, your, chance, to, win, where, will, you, be, shopp)","List(rt, millions, re, kicking, black, friday, giveaway, rt, follow, millions, chance, win, shopp)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 14, 21, 46, 76, 562, 909, 1105), values -> List(1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 14, 21, 46, 76, 562, 909, 1105), values -> List(0.22124205159810534, 0.2311642913994614, 0.672298750974662, 2.568788922878726, 3.0758996523915516, 3.169351340972809, 3.776975042365248, 4.152424675735975, 12.908985835720781, 6.572086001953851, 6.821643732374124))",1.0


In [0]:
# Cache the dataframe for faster iteration
tweets_label.cache() 

# Run the count action to materialize the cache
tweets_label.count()

Out[121]: 2090963

In [0]:
tweets_label.select('sentiment', 'label').show(20)

+---------+-----+
|sentiment|label|
+---------+-----+
| negative|  0.0|
| positive|  1.0|
| positive|  1.0|
| negative|  0.0|
| negative|  0.0|
| negative|  0.0|
| negative|  0.0|
| negative|  0.0|
| negative|  0.0|
| positive|  1.0|
| positive|  1.0|
| negative|  0.0|
| negative|  0.0|
| negative|  0.0|
| negative|  0.0|
| positive|  1.0|
| negative|  0.0|
| negative|  0.0|
| positive|  1.0|
| negative|  0.0|
+---------+-----+
only showing top 20 rows



In [0]:
# Checking the distribution
tweets_label.groupBy("label").count().show()

+-----+-------+
|label|  count|
+-----+-------+
|  0.0|1133337|
|  1.0| 957626|
+-----+-------+



#### 10. Machine Learning

In [0]:
# Use 90% cases for training, 10% cases for testing. Logistic Regression is used for our baseline model. 

train, test = tweets_label.randomSplit([0.9, 0.1], seed=20200819)

lr = LogisticRegression(maxIter=100)

lr_model = lr.fit(tweets_label)

predictions = lr_model.transform(tweets_label)

display(predictions)

tweet,sentiment,tokens,filtered,cv,features,label,rawPrediction,probability,prediction
black friday shopping links gt gt gt colbycrudenn joeburrrrr joshglaser noluckneal,negative,"List(black, friday, shopping, links, gt, gt, gt, colbycrudenn, joeburrrrr, joshglaser, noluckneal)","List(black, friday, shopping, links, gt, gt, gt, colbycrudenn, joeburrrrr, joshglaser, noluckneal)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220), values -> List(1.0, 1.0, 1.0, 3.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220), values -> List(0.22124205159810534, 0.2311642913994614, 3.465898520446778, 14.88056872704463, 5.057541262266511))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(5.763299008669113, -5.763299008669113))","Map(vectorType -> dense, length -> 2, values -> List(0.9968691001384681, 0.0031308998615319217))",0.0
rt not okaybears we arrrrre stuffed bears let s start off black friday with something light amp simple have notifications on we re goin,positive,"List(rt, not, okaybears, we, arrrrre, stuffed, bears, let, s, start, off, black, friday, with, something, light, amp, simple, have, notifications, on, we, re, goin)","List(rt, okaybears, arrrrre, stuffed, bears, let, start, black, friday, something, light, amp, simple, notifications, re, goin)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 4, 14, 109, 139, 276, 538, 1113, 1174, 4194, 4580, 4893, 7227, 36742), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 4, 14, 109, 139, 276, 538, 1113, 1174, 4194, 4580, 4893, 7227, 36742), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 2.4724222782541503, 3.0758996523915516, 4.37874359437121, 4.559259209355999, 5.355374171850485, 5.92386466679822, 6.8683518180970236, 6.904395972663566, 8.51526484169767, 8.647773913565239, 8.751017386242745, 9.765644018837762, 11.845085560517598))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(-15.565765884610585, 15.565765884610585))","Map(vectorType -> dense, length -> 2, values -> List(1.7372954981411308E-7, 0.9999998262704501))",1.0
to mac happy hour iphone rumors homekit secure video apple black friday via,positive,"List(to, mac, happy, hour, iphone, rumors, homekit, secure, video, apple, black, friday, via)","List(mac, happy, hour, iphone, rumors, homekit, secure, video, apple, black, friday, via)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 31, 82, 284, 305, 615, 702, 1612, 1741, 9375, 12250), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 31, 82, 284, 305, 615, 702, 1612, 1741, 9375, 12250), values -> List(0.22124205159810534, 0.2311642913994614, 3.4507381991002064, 4.210522689482902, 5.246122502383364, 5.402333102359822, 6.09463156655225, 6.242720767431516, 7.301790778247594, 7.411890639269317, 9.693323357258135, 10.087227642965225))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(-21.625448520103877, 21.625448520103877))","Map(vectorType -> dense, length -> 2, values -> List(4.056832145935609E-10, 0.9999999995943167))",1.0
rt bangerjonny black friday you don t see supermarkets discounting their food by,negative,"List(rt, bangerjonny, black, friday, you, don, t, see, supermarkets, discounting, their, food, by)","List(rt, bangerjonny, black, friday, see, supermarkets, discounting, food)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 68, 783, 6871, 17529, 29746), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 68, 783, 6871, 17529, 29746), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 4.072473204776156, 6.3926175141423025, 9.220416968354439, 10.681934750711918, 11.557403488065816))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(7.942060788270803, -7.942060788270803))","Map(vectorType -> dense, length -> 2, values -> List(0.9996446531482385, 3.553468517615288E-4))",0.0
rt jennvandamsel black friday on my onlyfans off my onlyfans subs making it only for the weekend i just posted some jv,negative,"List(rt, jennvandamsel, black, friday, on, my, onlyfans, off, my, onlyfans, subs, making, it, only, for, the, weekend, i, just, posted, some, jv)","List(rt, jennvandamsel, black, friday, onlyfans, onlyfans, subs, making, weekend, posted, jv)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 29, 38, 225, 671, 680, 12971, 14948), values -> List(1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 29, 38, 225, 671, 680, 12971, 14948), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 6.902019212696891, 3.593021786415995, 5.185193529140135, 6.181199582860709, 6.2048347066858645, 10.235647648083498, 10.426001376574716))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(4.172438934294684, -4.172438934294684))","Map(vectorType -> dense, length -> 2, values -> List(0.9848193844079608, 0.015180615592039204))",0.0
rt pulte black friday sale my money off,negative,"List(rt, pulte, black, friday, sale, my, money, off)","List(rt, pulte, black, friday, sale, money)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 3, 150, 1439), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 3, 150, 1439), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 1.61228227050308, 4.694279533229274, 7.165426522538767))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(5.458584575076463, -5.458584575076463))","Map(vectorType -> dense, length -> 2, values -> List(0.995758486472236, 0.004241513527764029))",0.0
black friday shopping links gt gt gt hauptman jeff yasminlane hoppyq devineden,negative,"List(black, friday, shopping, links, gt, gt, gt, hauptman, jeff, yasminlane, hoppyq, devineden)","List(black, friday, shopping, links, gt, gt, gt, hauptman, jeff, yasminlane, hoppyq, devineden)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220, 3325), values -> List(1.0, 1.0, 1.0, 3.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220, 3325), values -> List(0.22124205159810534, 0.2311642913994614, 3.465898520446778, 14.88056872704463, 5.057541262266511, 8.205746551963797))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(4.933755998016775, -4.933755998016775))","Map(vectorType -> dense, length -> 2, values -> List(0.9928520495765439, 0.007147950423456062))",0.0
rt futsheriff black friday promo is live now k fifa points giveaway for you to enter rt follow me iconsquadgaming,negative,"List(rt, futsheriff, black, friday, promo, is, live, now, k, fifa, points, giveaway, for, you, to, enter, rt, follow, me, iconsquadgaming)","List(rt, futsheriff, black, friday, promo, live, k, fifa, points, giveaway, enter, rt, follow, iconsquadgaming)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 21, 52, 80, 90, 115, 151, 154, 204, 586), values -> List(1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 21, 52, 80, 90, 115, 151, 154, 204, 586), values -> List(0.22124205159810534, 0.2311642913994614, 0.672298750974662, 2.568788922878726, 3.169351340972809, 3.8368308856858007, 4.197235903868348, 4.36079182640413, 4.427865583408812, 4.70078362119845, 4.675992184423282, 5.292577673483008, 6.02817283293921))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(9.60319483588294, -9.60319483588294))","Map(vectorType -> dense, length -> 2, values -> List(0.9999324918580788, 6.750814192124643E-5))",0.0
rt skylersquirt we both got our black friday deal in early can t wait for the next deal tonyropebbc cataliavalentin blackfrida,negative,"List(rt, skylersquirt, we, both, got, our, black, friday, deal, in, early, can, t, wait, for, the, next, deal, tonyropebbc, cataliavalentin, blackfrida)","List(rt, skylersquirt, got, black, friday, deal, early, wait, next, deal, tonyropebbc, cataliavalentin, blackfrida)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 10, 22, 77, 187, 250, 3744, 32308, 32662, 34433), values -> List(1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 10, 22, 77, 187, 250, 3744, 32308, 32662, 34433), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 3.0172069148846163, 6.335814627639553, 4.208783782623654, 4.88524396833713, 5.187331287019553, 8.36282035576666, 11.662764003723643, 11.662764003723643, 11.780547039380027))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(19.468820954406333, -19.468820954406333))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999964941084, 3.5058915770491694E-9))",0.0
rt millions we re kicking off black friday with a giveaway rt and follow millions for your chance to win where will you be shopp,positive,"List(rt, millions, we, re, kicking, off, black, friday, with, a, giveaway, rt, and, follow, millions, for, your, chance, to, win, where, will, you, be, shopp)","List(rt, millions, re, kicking, black, friday, giveaway, rt, follow, millions, chance, win, shopp)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 14, 21, 46, 76, 562, 909, 1105), values -> List(1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 14, 21, 46, 76, 562, 909, 1105), values -> List(0.22124205159810534, 0.2311642913994614, 0.672298750974662, 2.568788922878726, 3.0758996523915516, 3.169351340972809, 3.776975042365248, 4.152424675735975, 12.908985835720781, 6.572086001953851, 6.821643732374124))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(-9.465585205386596, 9.465585205386596))","Map(vectorType -> dense, length -> 2, values -> List(7.746667802245283E-5, 0.9999225333219776))",1.0


#### 11. Model Evaluation

In [0]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
roc_auc = evaluator.evaluate(predictions)
#roc_auc = evaluator.evaluate(predictions, BinaryClassificationEvaluatorMetricType='areaUnderROC')
#accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test.count())
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(predictions.count())

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.9902
ROC-AUC: 0.9984


In [0]:
(predictions
.write
.mode("overwrite")
.parquet('/mnt/my_bucket/BlackFriday_predictions.parquet'))

In [0]:
display(predictions)

tweet,sentiment,tokens,filtered,cv,features,label,rawPrediction,probability,prediction
black friday shopping links gt gt gt colbycrudenn joeburrrrr joshglaser noluckneal,negative,"List(black, friday, shopping, links, gt, gt, gt, colbycrudenn, joeburrrrr, joshglaser, noluckneal)","List(black, friday, shopping, links, gt, gt, gt, colbycrudenn, joeburrrrr, joshglaser, noluckneal)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220), values -> List(1.0, 1.0, 1.0, 3.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220), values -> List(0.22124205159810534, 0.2311642913994614, 3.465898520446778, 14.88056872704463, 5.057541262266511))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(5.763299008669113, -5.763299008669113))","Map(vectorType -> dense, length -> 2, values -> List(0.9968691001384681, 0.0031308998615319217))",0.0
rt not okaybears we arrrrre stuffed bears let s start off black friday with something light amp simple have notifications on we re goin,positive,"List(rt, not, okaybears, we, arrrrre, stuffed, bears, let, s, start, off, black, friday, with, something, light, amp, simple, have, notifications, on, we, re, goin)","List(rt, okaybears, arrrrre, stuffed, bears, let, start, black, friday, something, light, amp, simple, notifications, re, goin)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 4, 14, 109, 139, 276, 538, 1113, 1174, 4194, 4580, 4893, 7227, 36742), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 4, 14, 109, 139, 276, 538, 1113, 1174, 4194, 4580, 4893, 7227, 36742), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 2.4724222782541503, 3.0758996523915516, 4.37874359437121, 4.559259209355999, 5.355374171850485, 5.92386466679822, 6.8683518180970236, 6.904395972663566, 8.51526484169767, 8.647773913565239, 8.751017386242745, 9.765644018837762, 11.845085560517598))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(-15.565765884610585, 15.565765884610585))","Map(vectorType -> dense, length -> 2, values -> List(1.7372954981411308E-7, 0.9999998262704501))",1.0
to mac happy hour iphone rumors homekit secure video apple black friday via,positive,"List(to, mac, happy, hour, iphone, rumors, homekit, secure, video, apple, black, friday, via)","List(mac, happy, hour, iphone, rumors, homekit, secure, video, apple, black, friday, via)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 31, 82, 284, 305, 615, 702, 1612, 1741, 9375, 12250), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 31, 82, 284, 305, 615, 702, 1612, 1741, 9375, 12250), values -> List(0.22124205159810534, 0.2311642913994614, 3.4507381991002064, 4.210522689482902, 5.246122502383364, 5.402333102359822, 6.09463156655225, 6.242720767431516, 7.301790778247594, 7.411890639269317, 9.693323357258135, 10.087227642965225))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(-21.625448520103877, 21.625448520103877))","Map(vectorType -> dense, length -> 2, values -> List(4.056832145935609E-10, 0.9999999995943167))",1.0
rt bangerjonny black friday you don t see supermarkets discounting their food by,negative,"List(rt, bangerjonny, black, friday, you, don, t, see, supermarkets, discounting, their, food, by)","List(rt, bangerjonny, black, friday, see, supermarkets, discounting, food)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 68, 783, 6871, 17529, 29746), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 68, 783, 6871, 17529, 29746), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 4.072473204776156, 6.3926175141423025, 9.220416968354439, 10.681934750711918, 11.557403488065816))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(7.942060788270803, -7.942060788270803))","Map(vectorType -> dense, length -> 2, values -> List(0.9996446531482385, 3.553468517615288E-4))",0.0
rt jennvandamsel black friday on my onlyfans off my onlyfans subs making it only for the weekend i just posted some jv,negative,"List(rt, jennvandamsel, black, friday, on, my, onlyfans, off, my, onlyfans, subs, making, it, only, for, the, weekend, i, just, posted, some, jv)","List(rt, jennvandamsel, black, friday, onlyfans, onlyfans, subs, making, weekend, posted, jv)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 29, 38, 225, 671, 680, 12971, 14948), values -> List(1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 29, 38, 225, 671, 680, 12971, 14948), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 6.902019212696891, 3.593021786415995, 5.185193529140135, 6.181199582860709, 6.2048347066858645, 10.235647648083498, 10.426001376574716))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(4.172438934294684, -4.172438934294684))","Map(vectorType -> dense, length -> 2, values -> List(0.9848193844079608, 0.015180615592039204))",0.0
rt pulte black friday sale my money off,negative,"List(rt, pulte, black, friday, sale, my, money, off)","List(rt, pulte, black, friday, sale, money)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 3, 150, 1439), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 3, 150, 1439), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 1.61228227050308, 4.694279533229274, 7.165426522538767))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(5.458584575076463, -5.458584575076463))","Map(vectorType -> dense, length -> 2, values -> List(0.995758486472236, 0.004241513527764029))",0.0
black friday shopping links gt gt gt hauptman jeff yasminlane hoppyq devineden,negative,"List(black, friday, shopping, links, gt, gt, gt, hauptman, jeff, yasminlane, hoppyq, devineden)","List(black, friday, shopping, links, gt, gt, gt, hauptman, jeff, yasminlane, hoppyq, devineden)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220, 3325), values -> List(1.0, 1.0, 1.0, 3.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 32, 103, 220, 3325), values -> List(0.22124205159810534, 0.2311642913994614, 3.465898520446778, 14.88056872704463, 5.057541262266511, 8.205746551963797))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(4.933755998016775, -4.933755998016775))","Map(vectorType -> dense, length -> 2, values -> List(0.9928520495765439, 0.007147950423456062))",0.0
rt futsheriff black friday promo is live now k fifa points giveaway for you to enter rt follow me iconsquadgaming,negative,"List(rt, futsheriff, black, friday, promo, is, live, now, k, fifa, points, giveaway, for, you, to, enter, rt, follow, me, iconsquadgaming)","List(rt, futsheriff, black, friday, promo, live, k, fifa, points, giveaway, enter, rt, follow, iconsquadgaming)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 21, 52, 80, 90, 115, 151, 154, 204, 586), values -> List(1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 21, 52, 80, 90, 115, 151, 154, 204, 586), values -> List(0.22124205159810534, 0.2311642913994614, 0.672298750974662, 2.568788922878726, 3.169351340972809, 3.8368308856858007, 4.197235903868348, 4.36079182640413, 4.427865583408812, 4.70078362119845, 4.675992184423282, 5.292577673483008, 6.02817283293921))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(9.60319483588294, -9.60319483588294))","Map(vectorType -> dense, length -> 2, values -> List(0.9999324918580788, 6.750814192124643E-5))",0.0
rt skylersquirt we both got our black friday deal in early can t wait for the next deal tonyropebbc cataliavalentin blackfrida,negative,"List(rt, skylersquirt, we, both, got, our, black, friday, deal, in, early, can, t, wait, for, the, next, deal, tonyropebbc, cataliavalentin, blackfrida)","List(rt, skylersquirt, got, black, friday, deal, early, wait, next, deal, tonyropebbc, cataliavalentin, blackfrida)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 10, 22, 77, 187, 250, 3744, 32308, 32662, 34433), values -> List(1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 10, 22, 77, 187, 250, 3744, 32308, 32662, 34433), values -> List(0.22124205159810534, 0.2311642913994614, 0.336149375487331, 3.0172069148846163, 6.335814627639553, 4.208783782623654, 4.88524396833713, 5.187331287019553, 8.36282035576666, 11.662764003723643, 11.662764003723643, 11.780547039380027))",0.0,"Map(vectorType -> dense, length -> 2, values -> List(19.468820954406333, -19.468820954406333))","Map(vectorType -> dense, length -> 2, values -> List(0.9999999964941084, 3.5058915770491694E-9))",0.0
rt millions we re kicking off black friday with a giveaway rt and follow millions for your chance to win where will you be shopp,positive,"List(rt, millions, we, re, kicking, off, black, friday, with, a, giveaway, rt, and, follow, millions, for, your, chance, to, win, where, will, you, be, shopp)","List(rt, millions, re, kicking, black, friday, giveaway, rt, follow, millions, chance, win, shopp)","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 14, 21, 46, 76, 562, 909, 1105), values -> List(1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 65536, indices -> List(0, 1, 2, 6, 14, 21, 46, 76, 562, 909, 1105), values -> List(0.22124205159810534, 0.2311642913994614, 0.672298750974662, 2.568788922878726, 3.0758996523915516, 3.169351340972809, 3.776975042365248, 4.152424675735975, 12.908985835720781, 6.572086001953851, 6.821643732374124))",1.0,"Map(vectorType -> dense, length -> 2, values -> List(-9.465585205386596, 9.465585205386596))","Map(vectorType -> dense, length -> 2, values -> List(7.746667802245283E-5, 0.9999225333219776))",1.0


In [0]:
# Get the schema of predictions for table creation in Athena
predictions.printSchema()

root
 |-- tweet: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- cv: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



#### 11. Putting a Pipeline

In [0]:
# Use 90% cases for training, 10% cases for testing

train, test = df_label.randomSplit([0.9, 0.1], seed=20200819)

# Creating the transformers for the pipeline
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
cv = CountVectorizer(vocabSize=2**16, inputCol="filtered", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_encoder = StringIndexer(inputCol = "sentiment", outputCol = "label")
lr = LogisticRegression(maxIter=100)
pipeline = Pipeline(stages=[tokenizer, stopword_remover, cv, idf, label_encoder, lr])

pipeline_model = pipeline.fit(train)
predictions = pipeline_model.transform(test)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.9823
ROC-AUC: 0.9955


In [0]:
(predictions
.write
.mode("overwrite")
.parquet('/mnt/my_bucket/BlackFriday_pipe.parquet'))

#### 11. Ngram Features

In [0]:
# Use 90% cases for training, 10% cases for testing

train, test = df_label.randomSplit([0.9, 0.1], seed=20200819)

# label
label_encoder= StringIndexer(inputCol = "sentiment", outputCol = "label")

# Create transformers for the ML pipeline
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
cv = CountVectorizer(vocabSize=2**16, inputCol="filtered", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="1gram_idf", minDocFreq=5) 
ngram = NGram(n=2, inputCol="filtered", outputCol="2gram")
ngram_hashingtf = HashingTF(inputCol="2gram", outputCol="2gram_tf", numFeatures=20000)
ngram_idf = IDF(inputCol='2gram_tf', outputCol="2gram_idf", minDocFreq=5) 

# Assemble all text features
assembler = VectorAssembler(inputCols=["1gram_idf", "2gram_tf"], outputCol="rawFeatures")

# Chi-square variable selection
selector = ChiSqSelector(numTopFeatures=2**14,featuresCol='rawFeatures', outputCol="features")

# Regression model estimator
lr = LogisticRegression(maxIter=100)

# Build the pipeline
pipeline = Pipeline(stages=[label_encoder, tokenizer, stopword_remover, cv, idf, ngram, ngram_hashingtf, ngram_idf, assembler, selector, lr])

# Pipeline model fitting
pipeline_model = pipeline.fit(train)
predictions = pipeline_model.transform(test)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(test.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

In [0]:
(predictions
.write
.mode("overwrite")
.parquet('/mnt/my_bucket/BlackFriday_ngram.parquet'))