__Imports__

In [1]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

In [1]:
# Download a file from Google Drive into an in-memory buffer, one chunk at a time.
# NOTE(review): `drive_service` and `MediaIoBaseDownload` are not defined in any
# visible cell -- presumably they come from a Google Drive API client setup cell;
# confirm and add that setup before running this cell.
# NOTE(review): the bytes collected in `fh` are not written to disk or read by any
# visible cell -- confirm how the downloaded content is meant to be used.
import io

file_id = '0B04GJPshIjmPRnZManQwWEdTZjg'
request = drive_service.files().get_media(fileId=file_id)
fh = io.BytesIO()
downloader = MediaIoBaseDownload(fh, request)
done = False
while not done:
    status, done = downloader.next_chunk()
    # Function-call form of print: the Python 2 print statement is a SyntaxError
    # on Python 3, which the rest of this notebook targets.
    print("Download %d%%." % int(status.progress() * 100))

__Create spark session object__

In [2]:
# Build (or reuse) the Spark session for this notebook, named 'data_processing'.
session_builder = SparkSession.builder.appName('data_processing')
spark = session_builder.getOrCreate()

__Load in data__

In [3]:
# The CSV is read without a header row, so the columns get default names.
TRAINING_CSV_PATH = "trainingandtestdata/training.1600000.processed.noemoticon.csv"
training_data = spark.read.csv(TRAINING_CSV_PATH, header=False)

__Renaming columns__

In [4]:
# Inspect the column names assigned on load (the CSV was read with header=False).
training_data.columns

['_c0', '_c1', '_c2', '_c3', '_c4', '_c5']

In [5]:
# Give the six positional columns descriptive names, in file order.
column_names = ["target", "id", "date", "query", "user_name", "text"]
training_data = training_data.toDF(*column_names)

In [6]:
# Confirm the new column names are in place.
training_data.columns

['target', 'id', 'date', 'query', 'user_name', 'text']

__Selecting the target value and text__

In [7]:
# Keep only the two columns the model needs: the tweet text and its target value.
selected_columns = ['text', 'target']
df = training_data.select(*selected_columns)

In [8]:
# Preview the first five rows of the selected columns.
df.show(5)

+--------------------+------+
|                text|target|
+--------------------+------+
|@switchfoot http:...|     0|
|is upset that he ...|     0|
|@Kenichan I dived...|     0|
|my whole body fee...|     0|
|@nationwideclass ...|     0|
+--------------------+------+
only showing top 5 rows



In [9]:
# Print the column types of df.
df.printSchema()

root
 |-- text: string (nullable = true)
 |-- target: string (nullable = true)



We can see below that it's an even split between positive and negative tweets.

0: negative
4: positive

In [10]:
from pyspark.sql.functions import col

# Count rows per target value and list the largest group first.
target_counts = df.groupBy("target").count()
target_counts.orderBy(col("count").desc()).show()

+------+------+
|target| count|
+------+------+
|     0|800000|
|     4|800000|
+------+------+



__Model Pipeline__

In [11]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

__Regular Expression Tokenizer__

In [12]:
# Split the "text" column into a "words" token column, using the "\\W" regex
# (any non-word character) as the pattern.
tokenizer_params = dict(
    inputCol="text",
    outputCol="words",
    pattern="\\W",
)
regexTokenizer = RegexTokenizer(**tokenizer_params)

__Stop Words Download from NLTK__

In [13]:
# Fetch the NLTK 'stopwords' corpus so the stop-word list can be loaded later.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/mwoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

__Stop Words Remover__

In [14]:
from nltk.corpus import stopwords
import string

# Punctuation characters as a set.
# NOTE(review): `sp` is not referenced by any other visible cell -- confirm it is
# still needed before removing it.
sp = set(string.punctuation)

# Tokens to drop in addition to NLTK's English stop words.
extra_words = {"http","https","amp","rt","t","c","the"}

# Union the two sets in one step (no manual loop, no leaked loop variable) and
# sort the result so the list is in the same order on every run.
stop_words = sorted(set(stopwords.words('english')) | extra_words)

# Remove the stop words from the "words" column, writing the result to "filtered".
stopwordsRemover = StopWordsRemover(inputCol="words",
                                    outputCol="filtered",
                                    stopWords=stop_words)

__Bag of words count__

This is a form of feature engineering: each tweet's filtered tokens are converted into a vector of term counts over a fixed vocabulary.

In [15]:
# Convert each filtered token list into a vector of term counts ("features").
VOCAB_SIZE = 10000  # upper bound on the number of vocabulary terms
MIN_DOC_FREQ = 5    # minimum number of documents a term must appear in
countVectors = CountVectorizer(inputCol="filtered", outputCol="features",
                               vocabSize=VOCAB_SIZE, minDF=MIN_DOC_FREQ)

__StringIndexer__

This is where we index the target labels, run the full pipeline, and create our new DataFrame in Spark.

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Map the string "target" column to a numeric "label" column.
# The two classes have exactly the same number of rows, so the default
# frequency-based ordering is a tie. Ordering alphabetically pins the mapping to
# "0" -> 0.0 and "4" -> 1.0 regardless of how ties are resolved.
label_stringIdx = StringIndexer(inputCol="target", outputCol="label",
                                stringOrderType="alphabetAsc")

# Stages run in order: tokenize -> drop stop words -> count terms -> index label.
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

# Fit the pipeline to the documents, then apply it to produce the model-ready frame.
# NOTE(review): the pipeline (including the CountVectorizer vocabulary) is fit on
# the full dataset, before the train/test split -- confirm this is intended.
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
dataset.show(5)

+--------------------+------+--------------------+--------------------+--------------------+-----+
|                text|target|               words|            filtered|            features|label|
+--------------------+------+--------------------+--------------------+--------------------+-----+
|@switchfoot http:...|     0|[switchfoot, http...|[switchfoot, twit...|(10000,[1,10,16,6...|  0.0|
|is upset that he ...|     0|[is, upset, that,...|[upset, update, f...|(10000,[6,70,172,...|  0.0|
|@Kenichan I dived...|     0|[kenichan, i, div...|[kenichan, dived,...|(10000,[4,213,251...|  0.0|
|my whole body fee...|     0|[my, whole, body,...|[whole, body, fee...|(10000,[3,325,374...|  0.0|
|@nationwideclass ...|     0|[nationwideclass,...|[nationwideclass,...|(10000,[20,486],[...|  0.0|
+--------------------+------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



__Set seed for reproducibility__

In [17]:
# 70/30 random split with a fixed seed, followed by the row count of each part.
SPLIT_WEIGHTS = [0.7, 0.3]
trainingData, testData = dataset.randomSplit(SPLIT_WEIGHTS, seed=100)

training_count = trainingData.count()
print("Training Dataset Count: " + str(training_count))
test_count = testData.count()
print("Test Dataset Count: " + str(test_count))

Training Dataset Count: 1120280
Test Dataset Count: 479720


In [18]:
# Train logistic regression on the training split and score the test split.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

# Show ten rows predicted as class 0, ordered by the probability column, descending.
predicted_class_zero = predictions.filter(predictions['prediction'] == 0)
(
    predicted_class_zero
    .select("text", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

+------------------------------+------------------------------+-----+----------+
|                          text|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|@KoolioHoolio see i didnt e...|[0.9983518594003358,0.00164...|  1.0|       0.0|
|you suck you suck you suck ...|[0.995651770242575,0.004348...|  0.0|       0.0|
|super pissed that another t...|[0.9952922252477356,0.00470...|  0.0|       0.0|
|Things I'm feeling now: ang...|[0.9942601142013339,0.00573...|  0.0|       0.0|
|so sad, me equal sad, no so...|[0.9926125388390703,0.00738...|  0.0|       0.0|
|is feeling sad and stressed...|[0.9921214853611635,0.00787...|  0.0|       0.0|
|today i kinda feel sick of ...|[0.9918284712058718,0.00817...|  0.0|       0.0|
|Been sick with sore throat ...|[0.9900999703460126,0.00990...|  0.0|       0.0|
|Throat is killing me, runny...|[0.9884506446843292,0.01154...|  0.0|       0.0|
|Ugh my nose is stuffy, my t

In [19]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the test-set predictions. labelCol and metricName are written out at
# their default values so the reported number is identifiable as an F1 score.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
evaluator.evaluate(predictions)

0.7717407096715371