__Imports__

In [3]:
pip install googledrivedownloader

Collecting googledrivedownloader
  Downloading googledrivedownloader-0.4-py2.py3-none-any.whl (3.9 kB)
Installing collected packages: googledrivedownloader
Successfully installed googledrivedownloader-0.4
Note: you may need to restart the kernel to use updated packages.


In [5]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from google_drive_downloader import GoogleDriveDownloader as gdd

__Download File from link given in Canvas__

This will be stored into your local...Do not add into git, file is too large to be pushed onto git master branch

So, we download locally

In [6]:
gdd.download_file_from_google_drive(file_id='0B04GJPshIjmPRnZManQwWEdTZjg',
                                    dest_path='/Users/swapnilbasu/Downloads/trainingandtestdata.zip',
                                    unzip=True)

__Create spark session object (Data Processing)__

In [7]:
spark=SparkSession.builder.appName('data_processing').getOrCreate()

__Load in data__

In [10]:
training_data = spark.read.csv("/Users/swapnilbasu/Downloads/trainingandtestdata/training.1600000.processed.noemoticon.csv",header=False)

__Renaming columns__

In [11]:
training_data.columns

['_c0', '_c1', '_c2', '_c3', '_c4', '_c5']

In [12]:
training_data = training_data.toDF("target",'id','date','query','user_name','text')

In [13]:
training_data.columns

['target', 'id', 'date', 'query', 'user_name', 'text']

__Selecting the target value and text__

In [14]:
df = training_data.select('text','target')

In [15]:
df.show(5)

+--------------------+------+
|                text|target|
+--------------------+------+
|@switchfoot http:...|     0|
|is upset that he ...|     0|
|@Kenichan I dived...|     0|
|my whole body fee...|     0|
|@nationwideclass ...|     0|
+--------------------+------+
only showing top 5 rows



In [16]:
df.printSchema()

root
 |-- text: string (nullable = true)
 |-- target: string (nullable = true)



We can see below that its an even split between positive and negative tweets

0: negative
4: positive

In [17]:
from pyspark.sql.functions import col
df.groupBy("target").count().orderBy(col("count").desc()).show()

+------+------+
|target| count|
+------+------+
|     0|800000|
|     4|800000|
+------+------+



__Model Pipeline__

In [18]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

__Regular Expression Tokenizer__

In [19]:
regexTokenizer = RegexTokenizer(inputCol="text", 
                                outputCol="words", 
                                pattern="\\W")

__Stop Words Download from NLTK__

In [20]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/swapnilbasu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

__Stop Words Remover__

In [21]:
from nltk.corpus import stopwords
import string
sp = set(string.punctuation)
stop_words = set(stopwords.words('english'))
extra_words = {"http","https","amp","rt","t","c","the"}
for i in extra_words:
    stop_words.add(i) 
stop_words = list(stop_words)
stopwordsRemover = StopWordsRemover(inputCol="words", 
                                    outputCol="filtered").setStopWords(stop_words)

__Bag of words count__

This is a type of feature engineering

In [22]:
countVectors = CountVectorizer(inputCol="filtered", 
                               outputCol="features", 
                               vocabSize=10000, minDF=5)

__StringIndexer__

This is where we create our new dataframe in spark

In [23]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "target", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
dataset.show(5)

+--------------------+------+--------------------+--------------------+--------------------+-----+
|                text|target|               words|            filtered|            features|label|
+--------------------+------+--------------------+--------------------+--------------------+-----+
|@switchfoot http:...|     0|[switchfoot, http...|[switchfoot, twit...|(10000,[1,10,16,6...|  0.0|
|is upset that he ...|     0|[is, upset, that,...|[upset, update, f...|(10000,[6,70,172,...|  0.0|
|@Kenichan I dived...|     0|[kenichan, i, div...|[kenichan, dived,...|(10000,[4,213,251...|  0.0|
|my whole body fee...|     0|[my, whole, body,...|[whole, body, fee...|(10000,[3,325,374...|  0.0|
|@nationwideclass ...|     0|[nationwideclass,...|[nationwideclass,...|(10000,[20,486],[...|  0.0|
+--------------------+------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



__Set seed for reproducibility__

In [24]:
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 1119737
Test Dataset Count: 480263


In [25]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("text","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+-----+----------+
|                          text|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|sad sad sad sad sad sad sad...|[0.9987618635665929,0.00123...|  0.0|       0.0|
|Slept bad, feel tired. Hate...|[0.9957913419884288,0.00420...|  0.0|       0.0|
|super pissed that another t...|[0.9953102696313872,0.00468...|  0.0|       0.0|
|I'm sooooo sick...sore thro...|[0.9933885939735362,0.00661...|  0.0|       0.0|
|high temperature, runny nos...|[0.9929344107087643,0.00706...|  0.0|       0.0|
|really bored and the pain k...|[0.990433624058968,0.009566...|  0.0|       0.0|
|#trackle #trackle #trackle ...|[0.9893048053840303,0.01069...|  0.0|       0.0|
|ugh I feel like crap. Heada...|[0.9890760738152531,0.01092...|  0.0|       0.0|
|So yeah. I feel like shit. ...|[0.9880498997719068,0.01195...|  0.0|       0.0|
|Im sick today yuck, i hate 

In [26]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.7706803149123818

In [5]:
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream

ACCESS_TOKEN = "1458842253779161088-QFeO6udaAdHR4VARxaDza1w4LUlooE"
ACCESS_TOKEN_SECRET = "tC7IJDbl5T97Zvu3kE8sdGnmZWC2qxOrkdOv90YkdzIVO"
API_KEY = "KLP5ct26qaVo0KjAgP8O4j4y5"
API_KEY_SECRET = "AbxH3913WIPG0FHIwvVRomul92RWvuOdxRo2ecXR6H0Qgibo29"

auth = tweepy.OAuthHandler(API_KEY, API_KEY_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

api = tweepy.API(auth)

public_tweets = api.home_timeline()
for tweet in public_tweets:
    print(tweet.text)
    
user = api.get_user(screen_name='twitter')
print(user.screen_name)
print(user.followers_count)
for friend in user.friends():
   print(friend.screen_name)


Twitter is a book you'll never finish
to everyone procrastinating, you've come to the right place
cheers to all the Tweets that made it to IG this year
your typo makes it original
you're doing great, even if your Tweets aren't
RT @TwitterBlue: It’s time to flex those Twitter fingers and take it to the next level 💪

Twitter Blue is now available for subscription in…
BIG NEWS lol jk still Twitter
RT @TwitterSpaces: the time has arrived -- we’re now rolling out the ability for everyone on iOS and Android to host a Space

if this is yo…
“I’m not on Twitter” 🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩🚩
the timeline is in retrograde
RT @tmhsaysoo: We don’t beg for followers on twitter. You get em when you get em 😭😭😭😭
hello literally everyone
apparently it's october
oh you also love money? here’s how to send and receive Tips https://t.co/tCnzgrJEGE
tested the Tips feature, turns out people love money

rolling out on iOS with Android coming soon https://t.co/pkmLHzg6fu
well, well, well
ok 

In [10]:
# Subclass Stream to print IDs of Tweets received
class IDPrinter(tweepy.Stream):

    def on_status(self, status):
        print(status.text)

# Initialize instance of the subclass
printer = IDPrinter(
  API_KEY, API_KEY_SECRET,
  ACCESS_TOKEN, ACCESS_TOKEN_SECRET
)

# Filter realtime Tweets by keyword
printer.filter(track=["Spiderman"])

RT @antroshouse: #AntrosShower my SEX home
putos safados e leiteiros estão convocados! 
DIA 17/12/21...
ás 15:30H até 22h
LOCAL SAUNA SHOWE…
quiero un spiderman 🥲
#SpiderMan #CheEve #NCT127_NEOCITY_THE_LINK
#ENHYPEN_GayoDaechukje2021 #JISOO #GreysAnatomy #unprofessore #IndVsPak… https://t.co/42r68SbOYv
RT @Cena_The_Kepar: I some leak of spiderman no way home. #SpiderManNoWayHome 

RT+LIKE if you want it, i will send you a link in PM. 

Che…
RT @Blacksheepash1: #SpiderManNoWayHome #spiderman
When I finally see spiderman no way home and I can enter Twitter without worrying about…
sampe nonton spiderman masih aja nontonnya sendiri https://t.co/ZgoWJiTnr3
Ça me tue ma sœur elle a rattrapé les Spiderman qu’elle avait pas vue + Venom pour être à jour parce que je lui ai… https://t.co/KQS3NbbKtT
he visto Spiderman y he comido sushi no puedo ser más feliz
RT @0ievelynn: Eu entrando no Twitter depois de ter assistido o filme sem medo de tomar spoiler 

#SpiderManNoWayHome #SpiderMan 
 https://…

KeyboardInterrupt: 