In [45]:
# import pyspark
# from pyspark.sql import SQLContext
# 
# # create spark contexts
# sc = pyspark.SparkContext()
# sqlContext = SQLContext(sc)

## Neccessary modules and python files
* **langid**
* **nltk**

### Install **langid**

```
pip install langid
```

### Download nltk

* install **nltk** module
```
pip install nltk
```

* download corpora
```
# enter python interactive environment
python
# type python script
from ntlk import download
download()
```


### Get `preproc.py`
* Reference: https://github.com/dreyco676/nlp_spark
* Get `preproc.py`: `wget https://raw.githubusercontent.com/dreyco676/nlp_spark/master/preproc.py`
* `preproc.py` has to be in the same directory with your *ipynb* file

### Get practice data

```
git clone https://github.com/dreyco676/nlp_spark.git
cd nlp_spark/
unzip data.zip
```

# User defined functions

In [46]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import preproc as pp
# Register all the functions in Preproc with Spark Context
check_lang_udf = udf(pp.check_lang, StringType())
remove_stops_udf = udf(pp.remove_stops, StringType())
remove_features_udf = udf(pp.remove_features, StringType())
tag_and_remove_udf = udf(pp.tag_and_remove, StringType())
lemmatize_udf = udf(pp.lemmatize, StringType())
check_blanks_udf = udf(pp.check_blanks, StringType())

# Data Preprocessing

### Load data

In [47]:
raw_classified = spark.read.csv('nlp_spark/data/raw_classified.txt', inferSchema=True, sep='\t').toDF('text', 'id', 'label')

In [48]:
raw_classified.show(5)

+--------------------+------------------+-----+
|                text|                id|label|
+--------------------+------------------+-----+
|Fresh install of ...|        1018769417|  1.0|
|Well. Now I know ...|       10284216536|  1.0|
|"Literally six we...|       10298589026|  1.0|
|Mitsubishi i MiEV...|109017669432377344|  1.0|
|'Cheap Eats in SL...|109642968603963392|  1.0|
+--------------------+------------------+-----+
only showing top 5 rows



### Remove single/double quotes and space at the begining and end of string

In [80]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
trim_quotes_and_space_udf = udf(lambda s: s.strip().strip('"').strip("'"), StringType())
raw_classified_v1 = raw_classified.withColumn('text', trim_quotes_and_space_udf(raw_classified.text))

In [81]:
raw_classified_v1.show(5)
print('Total rows: {}'.format(raw_classified_v1.count()))

+--------------------+------------------+-----+
|                text|                id|label|
+--------------------+------------------+-----+
|Fresh install of ...|        1018769417|  1.0|
|Well. Now I know ...|       10284216536|  1.0|
|Literally six wee...|       10298589026|  1.0|
|Mitsubishi i MiEV...|109017669432377344|  1.0|
|Cheap Eats in SLP...|109642968603963392|  1.0|
+--------------------+------------------+-----+
only showing top 5 rows

Total rows: 115886


In [82]:
raw_classified_v1.select('text').toPandas()['text'][4]

u"Cheap Eats in SLP' - http://t.co/4w8gRp7"

### Check string length in column 'text'

In [89]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
string_length_udf = udf(lambda s: len(s), IntegerType())
raw_classified_v2 = raw_classified_v1.withColumn('text_length', string_length_udf(raw_classified_v1.text))

In [93]:
raw_classified_v2.orderBy('text_length').show(5)
raw_classified_v2.orderBy('text_length', ascending=False).show(5)

+----------+------------------+-----+-----------+
|      text|                id|label|text_length|
+----------+------------------+-----+-----------+
|   awesome|         882098800|  1.0|          7|
|  City Wok|665255511273570305|  1.0|          8|
| #iknowhow|537734337655889921|  1.0|          9|
|It's cold.|159983261911760896|  1.0|         10|
|Boarded :)|413084478760685568|  1.0|         10|
+----------+------------------+-----+-----------+
only showing top 5 rows

+--------------------+------------------+-----+-----------+
|                text|                id|label|text_length|
+--------------------+------------------+-----+-----------+
|A girlfriend that...|424763466897846272|  1.0|        256|
|Real boyfriends &...|429215572241285121|  1.0|        166|
|RT @julieklausner...|199565254542376960|  1.0|        164|
|Real boyfriends &...|418449856298889216|  1.0|        164|
|Damn. I want to s...|445063125696401409|  1.0|        159|
+--------------------+------------------+----

* The minimum length is 7 and maximum length is 256. Therefore, no empty strings or None values in column 'text'

In [94]:
raw_classified_v2.show(5)

+--------------------+------------------+-----+-----------+
|                text|                id|label|text_length|
+--------------------+------------------+-----+-----------+
|Fresh install of ...|        1018769417|  1.0|         61|
|Well. Now I know ...|       10284216536|  1.0|         85|
|Literally six wee...|       10298589026|  1.0|        134|
|Mitsubishi i MiEV...|109017669432377344|  1.0|         90|
|Cheap Eats in SLP...|109642968603963392|  1.0|         40|
+--------------------+------------------+-----+-----------+
only showing top 5 rows



* How many labels are there?

In [95]:
raw_classified.select('label').distinct().show()

+-----+
|label|
+-----+
|  0.0|
|  1.0|
+-----+



* Check values in column 'id'
    + From the id sorted results, no NA, None values exist in column 'id'

In [96]:
raw_classified_v2.orderBy('id').show(5)
raw_classified_v2.orderBy('id', ascending=False).show(5)

+--------------------+---------+-----+-----------+
|                text|       id|label|text_length|
+--------------------+---------+-----+-----------+
|Sorry! Account de...|797858706|  1.0|         59|
|Yo am I imagining...|798243247|  1.0|        127|
|Midnight coffee i...|798474877|  1.0|         48|
|I'm sad that Mike...|799151574|  1.0|         45|
|      Peter fixed it|799331338|  1.0|         14|
+--------------------+---------+-----+-----------+
only showing top 5 rows

+--------------------+------------------+-----+-----------+
|                text|                id|label|text_length|
+--------------------+------------------+-----+-----------+
|White Dwarf 100 "...|679856481798369282|  1.0|        123|
|White Dwarf 100 "...|679856481798369282|  1.0|        123|
|White Dwarf 100 �...|679851755815985153|  1.0|        135|
|RT @iwan0www: Pro...|679847620995579904|  1.0|        122|
|RT @mikeolson: 9 ...|679847263238250497|  0.0|        140|
+--------------------+--------------

### Identify language

In [105]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from langid import classify
classify('this is a text, 中文')

('la', -101.41858577728271)

In [103]:
help(classify)

Help on function classify in module langid.langid:

classify(instance)
    Convenience method using a global identifier instance with the default
    model included in langid.py. Identifies the language that a string is 
    written in.
    
    @param instance a text string. Unicode strings will automatically be utf8-encoded
    @returns a tuple of the most likely language and the confidence score



In [98]:
raw_classified_v2.show(5)

+--------------------+------------------+-----+-----------+
|                text|                id|label|text_length|
+--------------------+------------------+-----+-----------+
|Fresh install of ...|        1018769417|  1.0|         61|
|Well. Now I know ...|       10284216536|  1.0|         85|
|Literally six wee...|       10298589026|  1.0|        134|
|Mitsubishi i MiEV...|109017669432377344|  1.0|         90|
|Cheap Eats in SLP...|109642968603963392|  1.0|         40|
+--------------------+------------------+-----+-----------+
only showing top 5 rows



In [5]:
# predict language and filter out those with less than 90% chance of being English
lang_df = data_df.withColumn("lang", check_lang_udf(data_df["text"]))
en_df = lang_df.filter(lang_df["lang"] == "en")

In [6]:
en_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- id: string (nullable = true)
 |-- label: double (nullable = true)
 |-- lang: string (nullable = true)



In [7]:
en_df.show(4)

+--------------------+------------------+-----+----+
|                text|                id|label|lang|
+--------------------+------------------+-----+----+
|RT @goeentertain:...|665305154954989568|  1.0|  en|
|Teforia Uses Mach...|660668007975268352|  1.0|  en|
|   Apple TV or Roku?|       25842461136|  1.0|  en|
|Finished http://t...|        9412369614|  1.0|  en|
+--------------------+------------------+-----+----+
only showing top 4 rows



In [8]:
# remove stop words to reduce dimensionality
rm_stops_df = en_df.withColumn("stop_text", remove_stops_udf(en_df["text"]))

In [9]:
rm_stops_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- id: string (nullable = true)
 |-- label: double (nullable = true)
 |-- lang: string (nullable = true)
 |-- stop_text: string (nullable = true)



In [10]:
rm_stops_df.show(4)

+--------------------+------------------+-----+----+--------------------+
|                text|                id|label|lang|           stop_text|
+--------------------+------------------+-----+----+--------------------+
|RT @goeentertain:...|665305154954989568|  1.0|  en|RT @goeentertain:...|
|Teforia Uses Mach...|660668007975268352|  1.0|  en|Teforia Uses Mach...|
|   Apple TV or Roku?|       25842461136|  1.0|  en|      Apple TV Roku?|
|Finished http://t...|        9412369614|  1.0|  en|Finished http://t...|
+--------------------+------------------+-----+----+--------------------+
only showing top 4 rows



In [11]:
# remove other non essential words, think of it as my personal stop word list
rm_features_df = rm_stops_df.withColumn("feat_text", \
                                        remove_features_udf(rm_stops_df["stop_text"]))

In [12]:
rm_features_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- id: string (nullable = true)
 |-- label: double (nullable = true)
 |-- lang: string (nullable = true)
 |-- stop_text: string (nullable = true)
 |-- feat_text: string (nullable = true)



In [13]:
rm_features_df.show(4)

+--------------------+------------------+-----+----+--------------------+--------------------+
|                text|                id|label|lang|           stop_text|           feat_text|
+--------------------+------------------+-----+----+--------------------+--------------------+
|RT @goeentertain:...|665305154954989568|  1.0|  en|RT @goeentertain:...|  future blase   ...|
|Teforia Uses Mach...|660668007975268352|  1.0|  en|Teforia Uses Mach...|teforia uses mach...|
|   Apple TV or Roku?|       25842461136|  1.0|  en|      Apple TV Roku?|         apple  roku|
|Finished http://t...|        9412369614|  1.0|  en|Finished http://t...|            finished|
+--------------------+------------------+-----+----+--------------------+--------------------+
only showing top 4 rows



In [14]:
# tag the words remaining and keep only Nouns, Verbs and Adjectives
tagged_df = rm_features_df.withColumn("tagged_text", \
                                      tag_and_remove_udf(rm_features_df.feat_text))

In [15]:
tagged_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- id: string (nullable = true)
 |-- label: double (nullable = true)
 |-- lang: string (nullable = true)
 |-- stop_text: string (nullable = true)
 |-- feat_text: string (nullable = true)
 |-- tagged_text: string (nullable = true)



In [16]:
tagged_df.show(4)

+--------------------+------------------+-----+----+--------------------+--------------------+--------------------+
|                text|                id|label|lang|           stop_text|           feat_text|         tagged_text|
+--------------------+------------------+-----+----+--------------------+--------------------+--------------------+
|RT @goeentertain:...|665305154954989568|  1.0|  en|RT @goeentertain:...|  future blase   ...| future blase vic...|
|Teforia Uses Mach...|660668007975268352|  1.0|  en|Teforia Uses Mach...|teforia uses mach...| teforia uses mac...|
|   Apple TV or Roku?|       25842461136|  1.0|  en|      Apple TV Roku?|         apple  roku|         apple roku |
|Finished http://t...|        9412369614|  1.0|  en|Finished http://t...|            finished|           finished |
+--------------------+------------------+-----+----+--------------------+--------------------+--------------------+
only showing top 4 rows



In [17]:
# lemmatization of remaining words to reduce dimensionality & boost measures
lemm_df = tagged_df.withColumn("lemm_text", lemmatize_udf(tagged_df["tagged_text"]))

In [18]:
# lemmatization of remaining words to reduce dimensionality & boost measures
lemm_df = tagged_df.withColumn("lemm_text", lemmatize_udf(tagged_df["tagged_text"]))

In [19]:
# remove all rows containing only blank spaces
check_blanks_df = lemm_df.withColumn("is_blank", check_blanks_udf(lemm_df["lemm_text"]))
no_blanks_df = check_blanks_df.filter(check_blanks_df["is_blank"] == "False")
no_blanks_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- id: string (nullable = true)
 |-- label: double (nullable = true)
 |-- lang: string (nullable = true)
 |-- stop_text: string (nullable = true)
 |-- feat_text: string (nullable = true)
 |-- tagged_text: string (nullable = true)
 |-- lemm_text: string (nullable = true)
 |-- is_blank: string (nullable = true)



In [20]:
# rename columns
no_blanks_df = no_blanks_df.withColumn("text",no_blanks_df.lemm_text)

In [21]:
# dedupe important since alot of the tweets only differed by url's and RT mentions
dedup_df = no_blanks_df.dropDuplicates(['text', 'label'])

In [22]:
# select only the columns we care about
data_set = dedup_df.select('id', 'text','label')

In [23]:
data_set.show(4)

+------------------+--------------------+-----+
|                id|                text|label|
+------------------+--------------------+-----+
|        1546813742|              dragon|  1.0|
|        1558492525|           hurt much|  1.0|
|383221484023709697|seth blog word se...|  1.0|
|660668007975268352|teforia use machi...|  1.0|
+------------------+--------------------+-----+
only showing top 4 rows



In [24]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data_set.randomSplit([0.6, 0.4])

In [25]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier 
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer

In [26]:
# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and nb.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
idf = IDF(minDocFreq=3, inputCol="features", outputCol="idf")

In [27]:
# 
nb = NaiveBayes()

In [28]:
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb])

In [29]:
# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

In [30]:
# Make predictions.
predictions = model.transform(testData)

In [31]:
# Select example rows to display.
predictions.select("text", "label", "prediction").show(5)

+--------------------+-----+----------+
|                text|label|prediction|
+--------------------+-----+----------+
|           hurt much|  1.0|       1.0|
|teforia use machi...|  1.0|       1.0|
|              finish|  1.0|       1.0|
|future blase vice...|  1.0|       1.0|
|              divine|  1.0|       1.0|
+--------------------+-----+----------+
only showing top 5 rows



In [32]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.912655971479501

### Cross Validation

In [33]:
#paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 1.0]).build()
# paramGrid = ParamGridBuilder().addGrid(rf.maxDepth,[4,8,10]).\
#                     addGrid(rf.impurity, ['entropy','gini']).build()


# cv = CrossValidator(estimator=pipeline, 
#                     estimatorParamMaps=paramGrid, 
#                     evaluator=MulticlassClassificationEvaluator(), 
#                     numFolds=4)
                    

# #training_df.show(5)  
# cvModel = cv.fit(training_df)

In [34]:
#prediction = cvModel.transform(test_df)