In [2]:
import pandas as pd
import numpy as np
import nltk
import string



# Spark Environment
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import pyspark

number_cores = 4
memory_gb = 16
conf = (
    pyspark.SparkConf()
        .setMaster('local[{}]'.format(number_cores))
        .set('spark.driver.memory', '{}g'.format(memory_gb))
)
sc = pyspark.SparkContext.getOrCreate(conf=conf)
print(sc)

<SparkContext master=local[4] appName=pyspark-shell>


In [3]:
# get the context
spark = pyspark.sql.SparkSession.builder.getOrCreate()
print(spark) 

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

<pyspark.sql.session.SparkSession object at 0x7f7cb4357b70>


In [4]:
# Download files
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
!pip install langid

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import preproc as pp

# Register all the functions in Preproc with Spark Context
check_lang_udf = udf(pp.check_lang, StringType())
remove_stops_udf = udf(pp.remove_stops, StringType())
remove_features_udf = udf(pp.remove_features, StringType())
tag_and_remove_udf = udf(pp.tag_and_remove, StringType())
lemmatize_udf = udf(pp.lemmatize, StringType())
check_blanks_udf = udf(pp.check_blanks, StringType())

[nltk_data] Downloading package punkt to /home/faculty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/faculty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/faculty/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/faculty/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




In [39]:
from pyspark.sql.types import IntegerType

# Read the data (Spark)
#review_df = sqlContext.read.csv('/project/weaklabel_sampledata.csv', header=True).limit(5000)
# TEST
review_df = sqlContext.read.csv('/project/development_split.csv', header=True)

review_df = review_df.withColumnRenamed('Review','text')
review_df = review_df.withColumnRenamed('Freshness','label')
review_df = review_df.withColumn("label", review_df["label"].cast(IntegerType()))
review_df = review_df.filter(review_df.label. isNotNull())
review_df.show()

review_df.printSchema()
review_df.count()

+------+--------------------+-----+
|   _c0|                text|label|
+------+--------------------+-----+
| 10797|Vantage Point is ...|    0|
|  1774|The movie makes i...|    0|
|  4134|Claire Denis show...|    0|
|198598| The plotting is ...|    1|
| 52380|The [movie's] two...|    0|
| 36362|The first feature...|    1|
|  7981|Misses the sense ...|    0|
|  6049|The movie is a fa...|    1|
| 16033|"... in the end "...|    0|
| 18483|Follows some of t...|    1|
|  4003|"Even in its most...|    1|
|  2486|When the story sh...|    1|
| 20840|A thrilling but f...|    1|
|  9037|It remains watcha...|    1|
|296687| Cusack, who self...|    1|
|176770| Knightley's Suga...|    0|
|  2104| Best of all is P...|    1|
|  2736|None of the filmm...|    0|
| 31659|Filled with clich...|    0|
| 52054|a cloying mess cr...|    0|
+------+--------------------+-----+
only showing top 20 rows

root
 |-- _c0: string (nullable = true)
 |-- text: string (nullable = true)
 |-- label: integer (nullable = tr

1942

In [40]:
# remove stop words to reduce dimensionality
review_df = review_df.withColumn("text", remove_stops_udf(review_df["text"]))

# remove other non essential words
review_df = review_df.withColumn("text", remove_features_udf(review_df["text"]))

# tag the words remaining and keep only Nouns, Verbs and Adjectives
review_df = review_df.withColumn("text", tag_and_remove_udf(review_df["text"]))

# lemmatization of remaining words to reduce dimensionality & boost measures
review_df = review_df.withColumn("text", lemmatize_udf(review_df["text"]))

review_df.show()

+------+--------------------+-----+
|   _c0|                text|label|
+------+--------------------+-----+
| 10797|vantage point thr...|    0|
|  1774|movie make point ...|    0|
|  4134|claire denis show...|    0|
|198598|plot predictable ...|    1|
| 52380|movie half lard e...|    0|
| 36362|first feature cre...|    1|
|  7981|miss sense menace...|    0|
|  6049|movie convince fa...|    1|
| 16033|end pink fraudule...|    0|
| 18483|follow play drug ...|    1|
|  4003|inspired moment s...|    1|
|  2486|story shift nashv...|    1|
| 20840|thrill frivolous ...|    1|
|  9037|remains watchable...|    1|
|296687|cusack mock iconi...|    1|
|176770|knightley sugar p...|    0|
|  2104|best parker perfe...|    1|
|  2736|none filmmaking a...|    0|
| 31659|fill cliche run m...|    0|
| 52054|cloy mess cry rew...|    0|
+------+--------------------+-----+
only showing top 20 rows



In [7]:
# Split training and test data
splits = review_df.randomSplit([0.8, 0.2])
training_df = splits[0]
test_df = splits[1]

training_df.count()

1575

In [8]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and nb.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol='words', outputCol="features")
idf = IDF(minDocFreq=3, inputCol="features", outputCol="idf")
nb = NaiveBayes()
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb])


paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 1.0]).build()


cv = CrossValidator(estimator=pipeline, 
                    estimatorParamMaps=paramGrid, 
                    evaluator=MulticlassClassificationEvaluator(), 
                    numFolds=4)

cvModel = cv.fit(training_df)

result = cvModel.transform(test_df)
prediction_df = result.select("text", "label", "prediction")
prediction_df.show()

+--------------------+-----+----------+
|                text|label|prediction|
+--------------------+-----+----------+
|tough decent bit ...|    0|       1.0|
|love simon sad mo...|    1|       1.0|
|best representati...|    1|       1.0|
|blend social real...|    1|       1.0|
|glass castle slap...|    0|       1.0|
|better worse movi...|    0|       1.0|
|start try replica...|    0|       1.0|
|godard set intere...|    0|       1.0|
|abundance informa...|    1|       1.0|
|sexy thriller sex...|    0|       1.0|
|fun watch superhe...|    0|       1.0|
|incoherent mishma...|    0|       0.0|
|mija journey comp...|    1|       1.0|
|bernadette linkla...|    0|       1.0|
|fantastic manipul...|    1|       1.0|
|sweet natured fla...|    1|       1.0|
|video don afraid ...|    1|       1.0|
|oooh certain sais...|    1|       1.0|
|fable film remind...|    1|       1.0|
|tale existential ...|    1|       1.0|
+--------------------+-----+----------+
only showing top 20 rows



In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate the Accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(result, {evaluator.metricName: "accuracy"})

0.6388140161725068

Classifier B

In [24]:
# Read the data (Spark)
review_df = sqlContext.read.csv('/project/weaklabel_sampledata.csv', header=True).limit(3000)

review_df = review_df.withColumnRenamed('Review','text')
review_df = review_df.withColumnRenamed('Freshness','label')
review_df = review_df.withColumn("label", review_df["label"].cast(IntegerType()))

# remove stop words to reduce dimensionality
review_df = review_df.withColumn("text", remove_stops_udf(review_df["text"]))

# remove other non essential words
review_df = review_df.withColumn("text", remove_features_udf(review_df["text"]))

# tag the words remaining and keep only Nouns, Verbs and Adjectives
review_df = review_df.withColumn("text", tag_and_remove_udf(review_df["text"]))

# lemmatization of remaining words to reduce dimensionality & boost measures
review_df = review_df.withColumn("text", lemmatize_udf(review_df["text"]))

# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and nb.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol='words', outputCol="features")
idf = IDF(minDocFreq=3, inputCol="features", outputCol="idf")
nb = NaiveBayes()
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb])


paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 1.0]).build()


cv = CrossValidator(estimator=pipeline, 
                    estimatorParamMaps=paramGrid, 
                    evaluator=MulticlassClassificationEvaluator(), 
                    numFolds=4)

cvModel = cv.fit(training_df)

result = cvModel.transform(test_df)
prediction_df = result.select("text", "label", "prediction")

#Evaluate the accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(result, {evaluator.metricName: "accuracy"})

NameError: name 'Tokenizer' is not defined

In [25]:
# Read the data (Spark)
review_df = sqlContext.read.csv('/project/weaklabel_sampledata.csv', header=True).limit(3000)

review_df = review_df.withColumnRenamed('Review','text')
review_df = review_df.withColumnRenamed('Freshness','label')
review_df = review_df.withColumn("label", review_df["label"].cast(IntegerType()))

review_df.show()

+-----+--------------------+
|label|                text|
+-----+--------------------+
|    0| Parental Content...|
|    1| Director Wayne W...|
|    0| the lack of dram...|
|    1| A riveting docum...|
|    0| The problem with...|
|    0| Too tepid and to...|
|    0| Bernie lacks the...|
|    0| Simply one of th...|
|    0|" More drab ""fi"...|
|    1| With its emphasi...|
|    1| It's not like ha...|
|    1|The intelligently...|
|    0| Yields at least ...|
|    0| The 'Mr. Holland...|
|    0| The Lawnmower Ma...|
|    1| The Cell's strik...|
|    0| Director Takashi...|
|    1| [House of Wax] g...|
|    1| treats the audie...|
|    1|This movie's calm...|
+-----+--------------------+
only showing top 20 rows



In [37]:
check = pd.read_csv('/project/12000_predicted_labels.csv')
check.to_csv('12000_predicted_labels_v2.csv', sep='~')

Unnamed: 0.1,Unnamed: 0,Review,predicted_train
0,25985,The film reveals Eli Roth hesitantly as a fami...,1
1,117079,There simply has yet to be a decent film adap...,1
2,85518,"There's not much resonance, which is a shame ...",1
3,8395,"This ugly and rather mean-spirited film, with ...",1
4,5162,The film's indefinability is admirable while a...,0
5,287,My Dog Skip probes the concaves of memory to t...,1
6,2263,It makes for good family viewing.,1
7,39415,You may find yourself waiting for a commercia...,1
8,49991,"After his first two laughably bad films, direc...",1
9,39451,This is a compelling fairy tale that will lea...,1


In [31]:
review_df_2 = sqlContext.read.csv('/project/12000_predicted_labels_v2.csv', header=True,sep='~')



review_df_2.show()

+---+----------+--------------------+---------------+
|_c0|Unnamed: 0|              Review|predicted_train|
+---+----------+--------------------+---------------+
|  0|     25985|The film reveals ...|              1|
|  1|    117079| There simply has...|              1|
|  2|     85518| There's not much...|              0|
|  3|      8395|This ugly and rat...|              0|
|  4|      5162|The film's indefi...|              0|
|  5|       287|My Dog Skip probe...|              1|
|  6|      2263|It makes for good...|              0|
|  7|     39415| You may find you...|              0|
|  8|     49991|After his first t...|              0|
|  9|     39451| This is a compel...|              1|
| 10|     41232|"The World's End ...|              0|
| 11|     40089|Old and new, Skyf...|              1|
| 12|        75|37 Seconds is ove...|              0|
| 13|      5676|Here's something ...|              0|
| 14|    365234| The Shack does a...|              1|
| 15|     28804|I know, I kn

In [29]:

review_df.show()
review_df.count()

+-----+--------------------+
|label|                text|
+-----+--------------------+
|    0| Parental Content...|
|    1| Director Wayne W...|
|    0| the lack of dram...|
|    1| A riveting docum...|
|    0| The problem with...|
|    0| Too tepid and to...|
|    0| Bernie lacks the...|
|    0| Simply one of th...|
|    0|" More drab ""fi"...|
|    1| With its emphasi...|
|    1| It's not like ha...|
|    1|The intelligently...|
|    0| Yields at least ...|
|    0| The 'Mr. Holland...|
|    0| The Lawnmower Ma...|
|    1| The Cell's strik...|
|    0| Director Takashi...|
|    1| [House of Wax] g...|
|    1| treats the audie...|
|    1|This movie's calm...|
+-----+--------------------+
only showing top 20 rows



38994

In [38]:
# Read the data (Spark)
review_df_2000 = review_df
review_df_12000 = sqlContext.read.csv('/project/12000_predicted_labels_v2.csv', header=True,sep='~')

# Remove Null
review_df_12000 = review_df_12000.filter(review_df_12000.predicted_train.isNotNull())

review_df_12000 = review_df_12000.withColumnRenamed('Review','text')
review_df_12000 = review_df_12000.withColumnRenamed('predicted_train','label')
review_df_12000 = review_df_12000.withColumn("label", review_df_12000["label"].cast(IntegerType()))

# Combine with review_df with 2000 labels
review_df_combined = review_df.union(review_df_12000.select(['label','text']))

review_df_combined.show()
review_df_combined.count()

+-----+--------------------+
|label|                text|
+-----+--------------------+
|    0| Parental Content...|
|    1| Director Wayne W...|
|    0| the lack of dram...|
|    1| A riveting docum...|
|    0| The problem with...|
|    0| Too tepid and to...|
|    0| Bernie lacks the...|
|    0| Simply one of th...|
|    0|" More drab ""fi"...|
|    1| With its emphasi...|
|    1| It's not like ha...|
|    1|The intelligently...|
|    0| Yields at least ...|
|    0| The 'Mr. Holland...|
|    0| The Lawnmower Ma...|
|    1| The Cell's strik...|
|    0| Director Takashi...|
|    1| [House of Wax] g...|
|    1| treats the audie...|
|    1|This movie's calm...|
+-----+--------------------+
only showing top 20 rows



50992

In [None]:
# remove stop words to reduce dimensionality
review_df = review_df.withColumn("text", remove_stops_udf(review_df["text"]))

# remove other non essential words
review_df = review_df.withColumn("text", remove_features_udf(review_df["text"]))

# tag the words remaining and keep only Nouns, Verbs and Adjectives
review_df = review_df.withColumn("text", tag_and_remove_udf(review_df["text"]))

# lemmatization of remaining words to reduce dimensionality & boost measures
review_df = review_df.withColumn("text", lemmatize_udf(review_df["text"]))

training_df = review_df

training_df.show()
training_df.count()