In [19]:
# Import Files
import pandas as pd
import numpy as np
import nltk
import string

# Download files
nltk.download('punkt')
nltk.download('stopwords')

# Spark Environment
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import pyspark

number_cores = 4
memory_gb = 16
conf = (
    pyspark.SparkConf()
        .setMaster('local[{}]'.format(number_cores))
        .set('spark.driver.memory', '{}g'.format(memory_gb))
)
sc = pyspark.SparkContext.getOrCreate(conf=conf)
print(sc)

<SparkContext master=local[4] appName=pyspark-shell>


[nltk_data] Downloading package punkt to /home/faculty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/faculty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# get the context
spark = pyspark.sql.SparkSession.builder.getOrCreate()
print(spark) 

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

<pyspark.sql.session.SparkSession object at 0x7fd198312240>


In [56]:
# Read the data (Pandas)

reviews = pd.read_csv('/project/weaklabel_sampledata.csv', header=0, encoding='unicode_escape')
reviews.head()

# Read the data (Spark)

spark_reviews = sqlContext.read.csv('/project/weaklabel_sampledata.csv', header=True)
spark_reviews_rdd = spark_reviews.rdd
spark_reviews_rdd.take(5)

[Row(Freshness='0', Review=' Parental Content Review'),
 Row(Freshness='1', Review=' Director Wayne Wang proves with Maid in Manhattan that even predictable movies can leave you feeling warm and fuzzy.'),
 Row(Freshness='0', Review=" the lack of dramatic development doesn't leave much for the actors to work with"),
 Row(Freshness='1', Review=' A riveting documentary that explains education and the violent changes that are rocking Pakistan.'),
 Row(Freshness='0', Review=" The problem with The Informant!, aside from that overcoaxing exclamation mark, is that it's not especially funny.")]

In [61]:
# (Pandas)

# Turn 'Review' column into list
reviews_list = reviews['Review'].astype(str).tolist()

# Clean the data and tokenize it
reviews_string = ''
reviews_string = reviews_string.join(reviews_list).lower()
reviews_string = reviews_string.replace('"', '').replace("'", '').replace('\n','').replace(',','').replace('[','').replace(']','')
tokens = reviews_string.split()

# Check how many words we have
len(tokens)

# (Spark)

reviews_list = spark_reviews.select("Review").rdd.flatMap(lambda x: x).collect()
reviews_list = reviews_list

AttributeError: 'list' object has no attribute 'lower'

In [29]:
from nltk.corpus import stopwords

# Remove punctuation from each token
table = str.maketrans('', '', string.punctuation)
tokens = [w.translate(table) for w in tokens]

# Remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]

# Filter out stop words
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if not w in stop_words]

# Filter out short tokens
tokens = [word for word in tokens if len(word) > 1]

len(tokens)

60915

In [30]:
from operator import itemgetter
from collections import Counter

# Count how many times each word appears
count = Counter(tokens).items()
sorted_count = sorted(count, key = itemgetter(1))
sorted_count.reverse()

In [31]:
from nltk import LancasterStemmer

# initialize Lancaster Stemmer
LS = LancasterStemmer()
lemmatized = []
for l in tokens: lemmatized.append(LS.stem(l))

# Count how many times each word appears
count = Counter(lemmatized).items()
sorted_count = sorted(count, key = itemgetter(1))
sorted_count.reverse()

# Select 5000 most frequent words
top5000 = [i[0] for i in sorted_count[:5000]]
top7000 = [i[0] for i in sorted_count[:7000]]

In [34]:

from nltk.tokenize import word_tokenize

review = []

for sentence in reviews['Review'] :
    sentence = sentence.lower()
    sentence = sentence.replace('.', '').replace("'", '').replace('\n','').replace(',','')
    token_sentence = word_tokenize(sentence)
    
    token_words = []
    for token_word in token_sentence:
        token_word = LS.stem(token_word)
        token_words.append(token_word)
    review.append(token_words)
len(review)

5000

In [38]:
# (Pandas)

word_matrix = []

for i in review: word_matrix.append([1 if j in i else 0 for j in top5000])
features = pd.DataFrame(word_matrix, columns = top5000, index = reviews.index)
features['freshness']=reviews['Freshness']
features

Unnamed: 0,film,movy,lik,on,mak,act,ev,story,charact,feel,...,showbo,scrappy,withinsweaty,minorkey,glasss,medy,yel,denzel,behemo,freshness
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# (Spark)

#spark_word_matrix = []
rdd = spark.sparkContext.parallelize(word_matrix)
spark_features = sqlContext.createDataFrame(rdd)
spark_features.take(5)

[Row(_1=0, _2=0, _3=0, _4=0, _5=0, _6=0, _7=0, _8=0, _9=0, _10=0, _11=0, _12=0, _13=0, _14=0, _15=0, _16=0, _17=0, _18=0, _19=0, _20=0, _21=0, _22=0, _23=0, _24=0, _25=0, _26=0, _27=0, _28=0, _29=0, _30=0, _31=0, _32=0, _33=0, _34=0, _35=0, _36=0, _37=0, _38=0, _39=0, _40=0, _41=0, _42=0, _43=0, _44=0, _45=0, _46=0, _47=0, _48=0, _49=0, _50=0, _51=0, _52=0, _53=0, _54=0, _55=0, _56=0, _57=0, _58=0, _59=0, _60=0, _61=0, _62=0, _63=0, _64=0, _65=0, _66=0, _67=1, _68=0, _69=0, _70=0, _71=0, _72=0, _73=0, _74=0, _75=0, _76=0, _77=0, _78=0, _79=0, _80=0, _81=0, _82=0, _83=0, _84=0, _85=0, _86=0, _87=0, _88=0, _89=0, _90=0, _91=0, _92=0, _93=0, _94=0, _95=0, _96=0, _97=0, _98=0, _99=0, _100=0, _101=0, _102=0, _103=0, _104=0, _105=0, _106=0, _107=0, _108=0, _109=0, _110=0, _111=0, _112=0, _113=0, _114=0, _115=0, _116=0, _117=0, _118=0, _119=0, _120=0, _121=0, _122=0, _123=0, _124=0, _125=0, _126=0, _127=0, _128=0, _129=0, _130=0, _131=0, _132=0, _133=0, _134=0, _135=0, _136=0, _137=0, _138=0,

In [25]:

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

train, test = train_test_split(features, test_size = 0.1)

cols = train.columns[:-1]

lr = LogisticRegression()
gnb = MultinomialNB()

models = [lr,gnb]

for model in models:
    model.fit(train[cols], train['freshness'])
    y_pred = model.predict(test[cols])

    print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
          .format(test.shape[0], (test["freshness"] != y_pred).sum(),
                  100*(1-(test["freshness"] != y_pred).sum()/test.shape[0]))
         )

Number of mislabeled points out of a total 500 points : 139, performance 72.20%
Number of mislabeled points out of a total 500 points : 147, performance 70.60%


AttributeError: 'LogisticRegression' object has no attribute 'name'