__Imports__

In [1]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from google_drive_downloader import GoogleDriveDownloader as gdd
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
import nltk
from nltk.corpus import stopwords
import string
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.sql.functions import col
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

__Download File from link given in Canvas__

This will be stored into your local...Do not add into git, file is too large to be pushed onto git master branch

So, we download locally

In [2]:
gdd.download_file_from_google_drive(file_id='0B04GJPshIjmPRnZManQwWEdTZjg',
                                    dest_path='/Users/mwoo/Downloads/trainingandtestdata.zip',
                                    unzip=True)

In [3]:
# gdd.download_file_from_google_drive(file_id='0B04GJPshIjmPRnZManQwWEdTZjg',
#                                     dest_path='/Users/swapnilbasu/Downloads/trainingandtestdata.zip',
#                                     unzip=True)

__Create spark session object (Data Processing)__

In [4]:
spark=SparkSession.builder.appName('classification_tweet').getOrCreate()

__Load in data__

In [5]:
training_data = spark.read.csv("/Users/mwoo/Downloads/training.1600000.processed.noemoticon.csv",header=False)

__Renaming columns__

In [6]:
training_data.columns

['_c0', '_c1', '_c2', '_c3', '_c4', '_c5']

In [7]:
training_data = training_data.toDF("target",'id','date','query','user_name','text')

In [8]:
training_data.columns

['target', 'id', 'date', 'query', 'user_name', 'text']

__Exploratory__

In [9]:
training_data.describe()

DataFrame[summary: string, target: string, id: string, date: string, query: string, user_name: string, text: string]

__Selecting the target value and text__

In [10]:
df = training_data.select('text','target')

In [11]:
df.show(5)

+--------------------+------+
|                text|target|
+--------------------+------+
|@switchfoot http:...|     0|
|is upset that he ...|     0|
|@Kenichan I dived...|     0|
|my whole body fee...|     0|
|@nationwideclass ...|     0|
+--------------------+------+
only showing top 5 rows



In [12]:
df.printSchema()

root
 |-- text: string (nullable = true)
 |-- target: string (nullable = true)



We can see below that its an even split between positive and negative tweets

0: negative
4: positive

In [13]:
df.groupBy("target").count().orderBy(col("count").desc()).show()

+------+------+
|target| count|
+------+------+
|     0|800000|
|     4|800000|
+------+------+



__Model Pipeline__

__Regular Expression Tokenizer__

Seperates the texts into words

In [14]:
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

__Stop Words Download from NLTK__

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/mwoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

__Stop Words Remover__

Remove unnecessary words

In [16]:
sp = set(string.punctuation)
stop_words = set(stopwords.words('english'))
extra_words = {"http","https","amp","rt","t","c","the"}
for i in extra_words:
    stop_words.add(i) 
stop_words = list(stop_words)

In [17]:
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_words)

__Bag of words count__

This is a type of feature engineering

In [18]:
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

__StringIndexer__

Indexing target values

In [19]:
label_stringIdx = StringIndexer(inputCol = "target", outputCol = "label")

In [20]:
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
dataset.show(5)

+--------------------+------+--------------------+--------------------+--------------------+-----+
|                text|target|               words|            filtered|            features|label|
+--------------------+------+--------------------+--------------------+--------------------+-----+
|@switchfoot http:...|     0|[switchfoot, http...|[switchfoot, twit...|(10000,[1,10,16,6...|  0.0|
|is upset that he ...|     0|[is, upset, that,...|[upset, update, f...|(10000,[6,70,172,...|  0.0|
|@Kenichan I dived...|     0|[kenichan, i, div...|[kenichan, dived,...|(10000,[4,213,251...|  0.0|
|my whole body fee...|     0|[my, whole, body,...|[whole, body, fee...|(10000,[3,325,374...|  0.0|
|@nationwideclass ...|     0|[nationwideclass,...|[nationwideclass,...|(10000,[20,486],[...|  0.0|
+--------------------+------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



__Selecting data from the previous dataframe__

In [21]:
dataset = dataset.select('text','features','label')

__Set seed for reproducibility__

This set is use for testing purposes 70/30 split

In [22]:
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 1120280
Test Dataset Count: 479720


This will be used to fully train our classifcation model

In [23]:
model_df = dataset.select('features','label')
print("Full Training Dataset Count: " + str(model_df.count()))

Full Training Dataset Count: 1600000


__Testing our model through the split data above__

In [24]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0).fit(trainingData)
predictions = lr.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("text","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
predictions = lr.transform(testData)
predictions.show(10)

+------------------------------+------------------------------+-----+----------+
|                          text|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|@KoolioHoolio see i didnt e...|[0.9983509482201712,0.00164...|  1.0|       0.0|
|you suck you suck you suck ...|[0.9956474979112804,0.00435...|  0.0|       0.0|
|super pissed that another t...|[0.9952937171301914,0.00470...|  0.0|       0.0|
|Things I'm feeling now: ang...|[0.9942597194931533,0.00574...|  0.0|       0.0|
|so sad, me equal sad, no so...|[0.9926117815441087,0.00738...|  0.0|       0.0|
|is feeling sad and stressed...|[0.9921148433314205,0.00788...|  0.0|       0.0|
|today i kinda feel sick of ...|[0.9918217293827665,0.00817...|  0.0|       0.0|
|Been sick with sore throat ...|[0.9900972897854806,0.00990...|  0.0|       0.0|
|Throat is killing me, runny...|[0.9884381180833721,0.01156...|  0.0|       0.0|
|Ugh my nose is stuffy, my t

In [25]:
predictions.printSchema()

root
 |-- text: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
bce = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(bce.evaluate(predictions, {bce.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.8472088126136625


In [27]:
df_selection = predictions.select("label",'prediction').toPandas()

In [28]:
from sklearn.metrics import classification_report
true = np.array(df_selection['label'])
pred = np.array(df_selection['prediction'])
print(classification_report(true,pred))

              precision    recall  f1-score   support

         0.0       0.79      0.75      0.77    239942
         1.0       0.76      0.80      0.78    239778

    accuracy                           0.77    479720
   macro avg       0.77      0.77      0.77    479720
weighted avg       0.77      0.77      0.77    479720



__We can say that this model is able to distinguish whether a tweet is positive or negative is fair__

__Retrain our model using the fully dataset__

In [29]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0).fit(model_df)

__Twitter Authentication__

In [30]:
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream

ACCESS_TOKEN = "1458842253779161088-QFeO6udaAdHR4VARxaDza1w4LUlooE"
ACCESS_TOKEN_SECRET = "tC7IJDbl5T97Zvu3kE8sdGnmZWC2qxOrkdOv90YkdzIVO"
API_KEY = "KLP5ct26qaVo0KjAgP8O4j4y5"
API_KEY_SECRET = "AbxH3913WIPG0FHIwvVRomul92RWvuOdxRo2ecXR6H0Qgibo29"

auth = tweepy.OAuthHandler(API_KEY, API_KEY_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

api = tweepy.API(auth)

__Twitter Streaming Tweets__

We can set the limit of tweet samples to 10 and where the streaming only streams tweets that are in english

In [31]:
tweet_list = list()
# Subclass Stream to print IDs of Tweets received
class IDPrinter(tweepy.Stream):
    
    def on_status(self, status):
        tweet_list.append(status.text)
        #print(tweet_list)
        #print(status.text)
        if len(tweet_list) == 100:
            Stream.disconnect(self)
# Initialize instance of the subclass
printer = IDPrinter(
  API_KEY, API_KEY_SECRET,
  ACCESS_TOKEN, ACCESS_TOKEN_SECRET
)

printer.sample(languages=['en'])

Stream connection closed by Twitter


__Create new dataframe from tweet stream__

In [32]:
df_2 = pd.DataFrame(np.array(tweet_list))
df_2.columns = ['text']
df_2 = spark.createDataFrame(df_2)
df_2.show()

+--------------------+
|                text|
+--------------------+
|RT @NotOwenMeany:...|
|RT @CryptoTownEU:...|
|RT @MCU_Source: B...|
|@composerchris Th...|
|i actually want t...|
|RT @emailmanROCKS...|
|RT @secretyarrow:...|
|RT @UKFeetToes: B...|
|RT @djrothkopf: M...|
|RT @itsfkntxna: y...|
|RT @Ahsanhoonmein...|
|@kaslina Yah...no...|
|RT @ohits_laurenn...|
|RT @zhanxfxiah: *...|
|@CSimmaWX @RColbu...|
|RT @lucifrian: ta...|
|get senjuro in th...|
|@Yeonderella99 Po...|
|RT @Matt_Stepp817...|
|Certain actions b...|
+--------------------+
only showing top 20 rows



__Transforms the data received from stream__

In [33]:
dataset_1 = pipelineFit.transform(df_2)
dataset_1.show()

+--------------------+--------------------+--------------------+--------------------+
|                text|               words|            filtered|            features|
+--------------------+--------------------+--------------------+--------------------+
|RT @NotOwenMeany:...|[rt, notowenmeany...|[notowenmeany, dc...|(10000,[910,1208,...|
|RT @CryptoTownEU:...|[rt, cryptotowneu...|[cryptotowneu, ai...|(10000,[40,263,34...|
|RT @MCU_Source: B...|[rt, mcu_source, ...|[mcu_source, brea...|(10000,[90,1122,1...|
|@composerchris Th...|[composerchris, t...|[composerchris, l...|(10000,[883,1612,...|
|i actually want t...|[i, actually, wan...|[actually, want, ...|(10000,[26,187,73...|
|RT @emailmanROCKS...|[rt, emailmanrock...|[emailmanrocks, r...|(10000,[38,145,59...|
|RT @secretyarrow:...|[rt, secretyarrow...|[secretyarrow, lo...|(10000,[996,1122,...|
|RT @UKFeetToes: B...|[rt, ukfeettoes, ...|[ukfeettoes, biki...|(10000,[8,699,717...|
|RT @djrothkopf: M...|[rt, djrothkopf, ...|[djrothkopf

__Selecting text and Features__

In [34]:
dataset_test = dataset_1.select('text','features')

__Having the pre-trained model predict on tweets__

In [35]:
model_predictions = lr.transform(dataset_test)

In [36]:
model_predictions.show()

+--------------------+--------------------+--------------------+--------------------+----------+
|                text|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+----------+
|RT @NotOwenMeany:...|(10000,[910,1208,...|[-0.4205481364755...|[0.39638559333384...|       1.0|
|RT @CryptoTownEU:...|(10000,[40,263,34...|[-0.2426675444595...|[0.43962908141970...|       1.0|
|RT @MCU_Source: B...|(10000,[90,1122,1...|[-0.1111318162142...|[0.47224560461601...|       1.0|
|@composerchris Th...|(10000,[883,1612,...|[0.33328114913442...|[0.58255751614137...|       0.0|
|i actually want t...|(10000,[26,187,73...|[0.41763114414529...|[0.60291626474052...|       0.0|
|RT @emailmanROCKS...|(10000,[38,145,59...|[-1.3859129547787...|[0.20006103199723...|       1.0|
|RT @secretyarrow:...|(10000,[996,1122,...|[-0.1662206607879...|[0.45854024955908...|       1.0|
|RT @UKFeetToes: B...|(10000,[

__Storing Results and Text in Database__

Create connection to mongoDB, database, and table

In [37]:
import pymongo
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["my_database"]
mycol = mydb["predictions"]

Converting spark dataframe to pandas dataframe

In [41]:
mp_df = model_predictions.toPandas()
mp_df.head()

Unnamed: 0,text,features,rawPrediction,probability,prediction
0,RT @NotOwenMeany: DC Christmas Light Hunters: ...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.42054813647552053, 0.42054813647552053]","[0.39638559333384943, 0.6036144066661506]",1.0
1,RT @CryptoTownEU: 🚀 Airdrop: BAoE Global\n💰 Va...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.24266754445953495, 0.24266754445953495]","[0.43962908141970913, 0.5603709185802909]",1.0
2,RT @MCU_Source: BREAKING: #Deadpool‘s first ap...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.11113181621423963, 0.11113181621423963]","[0.47224560461601417, 0.5277543953839858]",1.0
3,@composerchris They should lose their tax exem...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.3332811491344227, -0.3332811491344227]","[0.5825575161413753, 0.4174424838586247]",0.0
4,i actually want to die.,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.417631144145294, -0.417631144145294]","[0.6029162647405205, 0.3970837352594795]",0.0


__Inserting into MongoDB__

In [42]:
for index, row in mp_df.iterrows():
    #print(row['text'], row['prediction'])
    mydict = { "text": row['text'], "prediction": row['prediction'] }
    mycol.insert_one(mydict)

RT @NotOwenMeany: DC Christmas Light Hunters: The area north of American University (roughly Van Ness to Davenport and 43rd to 49th streets… 1.0
RT @CryptoTownEU: 🚀 Airdrop: BAoE Global
💰 Value: 10 $BAoE
👥 Referral: 3 $BAoE
📒 Partnership: Groo INTERNATIONAL, Meta Ultra Holdings, SEM… 1.0
RT @MCU_Source: BREAKING: #Deadpool‘s first appearance at the #MCU will be in #DoctorStrangeInTheMultiverseOfMadness! https://t.co/KZYAVcAC… 1.0
@composerchris They should lose their tax exempt status 0.0
i actually want to die. 0.0
RT @emailmanROCKS: The Rules of Creative Writing by children's author Kurt Chambers https://t.co/lrQszZIoxL … This is a great guide to help… 1.0
RT @secretyarrow: how’s it lookin daddy? https://t.co/EdUjUVIuSk 1.0
RT @UKFeetToes: Bikini Body &amp; White Toes. Love those legs! https://t.co/xFYnY7AGSe 1.0
RT @djrothkopf: Manchin did not have to say anything today. Did not have to go on Fox. Did not have to issue a statement. He could've kept… 0.0
RT @itsfkntxna: you are allow

@DRiveraTX @creatureNFT @dannycoleee Hahahah my apologies but I couldn’t leave him there!! 0.0
RT @FaazNoushad: @ClippedHussar Me (who've also avoided catching COVID every wave.) https://t.co/RK3BLaBwbZ 1.0
RT @caro_painter: A little #behindthescenes of our lives  @caenhillcc 😁
Live everyday on Facebook, Youtube and TikTok updating you about th… 1.0
If you used my Social and got cash or a car and have kids I don’t feel sorry for you. Watch what happens. 0.0
RT @dobsalvatore: the way they look at eo 🔥 https://t.co/CWv4OME8h2 1.0
@sswidamh @erecalie r u 1000000% over them? or is there a small part of you that still would 0.0
@CarlonCarpenter Playing out from the back. Largely situational, can be something you do more often than other teams, High risk/high reward 1.0
Aaron breaking the record tonight 0.0
RT @chartdata: .@BTS_twt's "Permission to Dance" has now surpassed 400 million views on YouTube. 1.0
CD Media ¦ 99% In Democratic Party Referendum Vote To Oust Albanian Opposition Leader 

__Definitions__

0.0 means that the tweet was negative

1.0 means that the tweet was positive

__Retrieving Positive Tweets__

In [47]:
myquery = { "prediction": 1.0 }
mydoc = mycol.find(myquery)
for x in mydoc:
    print(x)

{'_id': ObjectId('61bfc4d9b8bc24cd819b369c'), 'text': 'RT @NotOwenMeany: DC Christmas Light Hunters: The area north of American University (roughly Van Ness to Davenport and 43rd to 49th streets…', 'prediction': 1.0}
{'_id': ObjectId('61bfc4d9b8bc24cd819b369d'), 'text': 'RT @CryptoTownEU: 🚀 Airdrop: BAoE Global\n💰 Value: 10 $BAoE\n👥 Referral: 3 $BAoE\n📒 Partnership: Groo INTERNATIONAL, Meta Ultra Holdings, SEM…', 'prediction': 1.0}
{'_id': ObjectId('61bfc4d9b8bc24cd819b369e'), 'text': 'RT @MCU_Source: BREAKING: #Deadpool‘s first appearance at the #MCU will be in #DoctorStrangeInTheMultiverseOfMadness! https://t.co/KZYAVcAC…', 'prediction': 1.0}
{'_id': ObjectId('61bfc4d9b8bc24cd819b36a1'), 'text': "RT @emailmanROCKS: The Rules of Creative Writing by children's author Kurt Chambers https://t.co/lrQszZIoxL … This is a great guide to help…", 'prediction': 1.0}
{'_id': ObjectId('61bfc4d9b8bc24cd819b36a2'), 'text': 'RT @secretyarrow: how’s it lookin daddy? https://t.co/EdUjUVIuSk', 'predict

__Retrieving Negative Tweets__

In [48]:
myquery = { "prediction": 0.0 }
mydoc = mycol.find(myquery)
for x in mydoc:
    print(x)

{'_id': ObjectId('61bfc4d9b8bc24cd819b369f'), 'text': '@composerchris They should lose their tax exempt status', 'prediction': 0.0}
{'_id': ObjectId('61bfc4d9b8bc24cd819b36a0'), 'text': 'i actually want to die.', 'prediction': 0.0}
{'_id': ObjectId('61bfc4d9b8bc24cd819b36a4'), 'text': "RT @djrothkopf: Manchin did not have to say anything today. Did not have to go on Fox. Did not have to issue a statement. He could've kept…", 'prediction': 0.0}
{'_id': ObjectId('61bfc4d9b8bc24cd819b36a5'), 'text': 'RT @itsfkntxna: you are allowed to criticize your own community and i hate to tell you this but other people are allowed to criticize your…', 'prediction': 0.0}
{'_id': ObjectId('61bfc4d9b8bc24cd819b36a8'), 'text': 'RT @ohits_laurenn: i be asking for a sign then i be ignoring the sign 😂🤦🏾\u200d♀️', 'prediction': 0.0}
{'_id': ObjectId('61bfc4d9b8bc24cd819b36af'), 'text': 'Certain actions by NFL officials, like the spot on that last Titans’ play today, make you think things might not ex… https: