__Imports__

In [1]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from google_drive_downloader import GoogleDriveDownloader as gdd
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
import nltk
from nltk.corpus import stopwords
import string
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.sql.functions import col
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

__Download File from link given in Canvas__

This will be stored into your local...Do not add into git, file is too large to be pushed onto git master branch

So, we download locally

In [2]:
gdd.download_file_from_google_drive(file_id='0B04GJPshIjmPRnZManQwWEdTZjg',
                                    dest_path='/Users/mwoo/Downloads/trainingandtestdata.zip',
                                    unzip=True)

In [3]:
# gdd.download_file_from_google_drive(file_id='0B04GJPshIjmPRnZManQwWEdTZjg',
#                                     dest_path='/Users/swapnilbasu/Downloads/trainingandtestdata.zip',
#                                     unzip=True)

__Create spark session object (Data Processing)__

In [4]:
spark=SparkSession.builder.appName('classification_tweet').getOrCreate()

__Load in data__

In [5]:
training_data = spark.read.csv("/Users/mwoo/Downloads/training.1600000.processed.noemoticon.csv",header=False)

__Renaming columns__

In [6]:
training_data.columns

['_c0', '_c1', '_c2', '_c3', '_c4', '_c5']

In [7]:
training_data = training_data.toDF("target",'id','date','query','user_name','text')

In [8]:
training_data.columns

['target', 'id', 'date', 'query', 'user_name', 'text']

__Exploratory__

In [9]:
training_data.describe()

DataFrame[summary: string, target: string, id: string, date: string, query: string, user_name: string, text: string]

__Selecting the target value and text__

In [10]:
df = training_data.select('text','target')

In [11]:
df.show(5)

+--------------------+------+
|                text|target|
+--------------------+------+
|@switchfoot http:...|     0|
|is upset that he ...|     0|
|@Kenichan I dived...|     0|
|my whole body fee...|     0|
|@nationwideclass ...|     0|
+--------------------+------+
only showing top 5 rows



In [12]:
df.printSchema()

root
 |-- text: string (nullable = true)
 |-- target: string (nullable = true)



We can see below that its an even split between positive and negative tweets

0: negative
4: positive

In [13]:
df.groupBy("target").count().orderBy(col("count").desc()).show()

+------+------+
|target| count|
+------+------+
|     0|800000|
|     4|800000|
+------+------+



__Model Pipeline__

__Regular Expression Tokenizer__

Seperates the texts into words

In [14]:
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

__Stop Words Download from NLTK__

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/mwoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

__Stop Words Remover__

Remove unnecessary words

In [16]:
sp = set(string.punctuation)
stop_words = set(stopwords.words('english'))
extra_words = {"http","https","amp","rt","t","c","the"}
for i in extra_words:
    stop_words.add(i) 
stop_words = list(stop_words)

In [17]:
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_words)

__Bag of words count__

This is a type of feature engineering

In [18]:
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

__StringIndexer__

Indexing target values

In [19]:
label_stringIdx = StringIndexer(inputCol = "target", outputCol = "label")

In [20]:
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
dataset.show(5)

+--------------------+------+--------------------+--------------------+--------------------+-----+
|                text|target|               words|            filtered|            features|label|
+--------------------+------+--------------------+--------------------+--------------------+-----+
|@switchfoot http:...|     0|[switchfoot, http...|[switchfoot, twit...|(10000,[1,10,16,6...|  0.0|
|is upset that he ...|     0|[is, upset, that,...|[upset, update, f...|(10000,[6,70,172,...|  0.0|
|@Kenichan I dived...|     0|[kenichan, i, div...|[kenichan, dived,...|(10000,[4,213,251...|  0.0|
|my whole body fee...|     0|[my, whole, body,...|[whole, body, fee...|(10000,[3,325,374...|  0.0|
|@nationwideclass ...|     0|[nationwideclass,...|[nationwideclass,...|(10000,[20,486],[...|  0.0|
+--------------------+------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



__Selecting data from the previous dataframe__

In [21]:
dataset = dataset.select('text','features','label')

__Set seed for reproducibility__

This set is use for testing purposes 70/30 split

In [22]:
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 1120280
Test Dataset Count: 479720


This will be used to fully train our classifcation model

In [23]:
model_df = dataset.select('features','label')
print("Full Training Dataset Count: " + str(model_df.count()))

Full Training Dataset Count: 1600000


__Testing our model through the split data above__

In [24]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0).fit(trainingData)
predictions = lr.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("text","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
predictions = lr.transform(testData)
predictions.show(10)

+------------------------------+------------------------------+-----+----------+
|                          text|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|@KoolioHoolio see i didnt e...|[0.9983510746371353,0.00164...|  1.0|       0.0|
|you suck you suck you suck ...|[0.9956434658325106,0.00435...|  0.0|       0.0|
|super pissed that another t...|[0.9952914880302268,0.00470...|  0.0|       0.0|
|Things I'm feeling now: ang...|[0.9942653689730412,0.00573...|  0.0|       0.0|
|so sad, me equal sad, no so...|[0.9926124136068198,0.00738...|  0.0|       0.0|
|is feeling sad and stressed...|[0.9921178408523211,0.00788...|  0.0|       0.0|
|today i kinda feel sick of ...|[0.9918232711532882,0.00817...|  0.0|       0.0|
|Been sick with sore throat ...|[0.9901000573760685,0.00989...|  0.0|       0.0|
|Throat is killing me, runny...|[0.9884492105393018,0.01155...|  0.0|       0.0|
|Ugh my nose is stuffy, my t

In [25]:
predictions.printSchema()

root
 |-- text: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
bce = BinaryClassificationEvaluator()
print("Test Area Under ROC: " + str(bce.evaluate(predictions, {bce.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.8471903651669457


In [27]:
df_selection = predictions.select("label",'prediction').toPandas()

In [28]:
from sklearn.metrics import classification_report
true = np.array(df_selection['label'])
pred = np.array(df_selection['prediction'])
print(classification_report(true,pred))

              precision    recall  f1-score   support

         0.0       0.79      0.75      0.77    239942
         1.0       0.76      0.80      0.78    239778

    accuracy                           0.77    479720
   macro avg       0.77      0.77      0.77    479720
weighted avg       0.77      0.77      0.77    479720



__We can say that this model is able to distinguish whether a tweet is positive or negative is fair__

__Retrain our model using the fully dataset__

In [29]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0).fit(model_df)

__Twitter Authentication__

In [30]:
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream

ACCESS_TOKEN = "1458842253779161088-QFeO6udaAdHR4VARxaDza1w4LUlooE"
ACCESS_TOKEN_SECRET = "tC7IJDbl5T97Zvu3kE8sdGnmZWC2qxOrkdOv90YkdzIVO"
API_KEY = "KLP5ct26qaVo0KjAgP8O4j4y5"
API_KEY_SECRET = "AbxH3913WIPG0FHIwvVRomul92RWvuOdxRo2ecXR6H0Qgibo29"

auth = tweepy.OAuthHandler(API_KEY, API_KEY_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

api = tweepy.API(auth)

__Twitter Streaming Tweets__

We can set the limit of tweet samples to 10 and where the streaming only streams tweets that are in english

In [31]:
tweet_list = list()
# Subclass Stream to print IDs of Tweets received
class IDPrinter(tweepy.Stream):
    
    def on_status(self, status):
        tweet_list.append(status.text)
        #print(tweet_list)
        #print(status.text)
        if len(tweet_list) == 100:
            Stream.disconnect(self)
# Initialize instance of the subclass
printer = IDPrinter(
  API_KEY, API_KEY_SECRET,
  ACCESS_TOKEN, ACCESS_TOKEN_SECRET
)

printer.sample(languages=['en'])

Stream connection closed by Twitter


__Create new dataframe from tweet stream__

In [32]:
df_2 = pd.DataFrame(np.array(tweet_list))
df_2.columns = ['text']
df_2 = spark.createDataFrame(df_2)
df_2.show()

+--------------------+
|                text|
+--------------------+
|RT @JesuitTigers_...|
|Disabled Vehicle:...|
|RT @NCTsmtown: SM...|
|RT @keyon: BEST S...|
|    @oha_yanii evett|
|RT @emilyeveryep:...|
|RT @grimnorth_0: ...|
|@EssexPR That sai...|
|RT @jenoverse423:...|
|RT @aishlut: snea...|
|RT @chitaglorya__...|
|         every look.|
|cross solar cooke...|
|RT @RRRMovie: Fin...|
|Mis outfits han e...|
|RT @cryptogems555...|
|RT @AutomizedCock...|
|RT @shootsgard: P...|
|@TTLadyLuscious L...|
|RT @Pratikfc7: Ne...|
+--------------------+
only showing top 20 rows



__Transforms the data received from stream__

In [33]:
dataset_1 = pipelineFit.transform(df_2)
dataset_1.show()

+--------------------+--------------------+--------------------+--------------------+
|                text|               words|            filtered|            features|
+--------------------+--------------------+--------------------+--------------------+
|RT @JesuitTigers_...|[rt, jesuittigers...|[jesuittigers_fb,...|(10000,[511,651,1...|
|Disabled Vehicle:...|[disabled, vehicl...|[disabled, vehicl...|(10000,[23,452,51...|
|RT @NCTsmtown: SM...|[rt, nctsmtown, s...|[nctsmtown, smtow...|(10000,[868,3189]...|
|RT @keyon: BEST S...|[rt, keyon, best,...|[keyon, best, spi...|(10000,[94,118,13...|
|    @oha_yanii evett|  [oha_yanii, evett]|  [oha_yanii, evett]|       (10000,[],[])|
|RT @emilyeveryep:...|[rt, emilyeveryep...|[emilyeveryep, 5x...|(10000,[3,10,42,3...|
|RT @grimnorth_0: ...|[rt, grimnorth_0,...|[grimnorth_0, bma...|(10000,[23,32,103...|
|@EssexPR That sai...|[essexpr, that, s...|[essexpr, said, w...|(10000,[6,24,127,...|
|RT @jenoverse423:...|[rt, jenoverse423...|[jenoverse4

__Selecting text and Features__

In [34]:
dataset_test = dataset_1.select('text','features')

__Having the pre-trained model predict on tweets__

In [37]:
model_predictions = lr.transform(dataset_test)

In [38]:
model_predictions.show()

+--------------------+--------------------+--------------------+--------------------+----------+
|                text|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+----------+
|RT @JesuitTigers_...|(10000,[511,651,1...|[-0.5318821511759...|[0.37007801323871...|       1.0|
|Disabled Vehicle:...|(10000,[23,452,51...|[0.94578290396596...|[0.72026629565034...|       0.0|
|RT @NCTsmtown: SM...|(10000,[868,3189]...|[-0.1436763568720...|[0.46414257299695...|       1.0|
|RT @keyon: BEST S...|(10000,[94,118,13...|[0.05274802373889...|[0.51318394920864...|       0.0|
|    @oha_yanii evett|       (10000,[],[])|[-0.0928899453666...|[0.47679419727941...|       1.0|
|RT @emilyeveryep:...|(10000,[3,10,42,3...|[0.40655781964782...|[0.60026222209011...|       0.0|
|RT @grimnorth_0: ...|(10000,[23,32,103...|[1.02569250257777...|[0.73607994580437...|       0.0|
|@EssexPR That sai...|(10000,[