In [19]:
# Import Files
import pandas as pd
import numpy as np
import nltk
import string

# Download files
nltk.download('punkt')
nltk.download('stopwords')

# Spark Environment
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import pyspark

number_cores = 4
memory_gb = 16
conf = (
    pyspark.SparkConf()
        .setMaster('local[{}]'.format(number_cores))
        .set('spark.driver.memory', '{}g'.format(memory_gb))
)
sc = pyspark.SparkContext.getOrCreate(conf=conf)
print(sc)

<SparkContext master=local[4] appName=pyspark-shell>


[nltk_data] Downloading package punkt to /home/faculty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/faculty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# get the context
spark = pyspark.sql.SparkSession.builder.getOrCreate()
print(spark) 

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

<pyspark.sql.session.SparkSession object at 0x7fd198312240>


In [56]:
# Read the data (Pandas)

reviews = pd.read_csv('/project/weaklabel_sampledata.csv', header=0, encoding='unicode_escape')
reviews.head()

# Read the data (Spark)

spark_reviews = sqlContext.read.csv('/project/weaklabel_sampledata.csv', header=True)
spark_reviews_rdd = spark_reviews.rdd
spark_reviews_rdd.take(5)

[Row(Freshness='0', Review=' Parental Content Review'),
 Row(Freshness='1', Review=' Director Wayne Wang proves with Maid in Manhattan that even predictable movies can leave you feeling warm and fuzzy.'),
 Row(Freshness='0', Review=" the lack of dramatic development doesn't leave much for the actors to work with"),
 Row(Freshness='1', Review=' A riveting documentary that explains education and the violent changes that are rocking Pakistan.'),
 Row(Freshness='0', Review=" The problem with The Informant!, aside from that overcoaxing exclamation mark, is that it's not especially funny.")]

In [63]:
# (Pandas)

# Turn 'Review' column into list
reviews_list = reviews['Review'].astype(str).tolist()

# Clean the data and tokenize it
reviews_string = ''
reviews_string = reviews_string.join(reviews_list).lower()
reviews_string = reviews_string.replace('"', '').replace("'", '').replace('\n','').replace(',','').replace('[','').replace(']','')
tokens = reviews_string.split()

# Check how many words we have
len(tokens)

# (Spark)

#Turn 'Review' column from Spark Dataframe into List
reviews_list = spark_reviews.select("Review").rdd.flatMap(lambda x: x).collect()

# Clean the data and tokenize it
reviews_string = ''
reviews_string = reviews_string.join(reviews_list).lower()
reviews_string = reviews_string.replace('"', '').replace("'", '').replace('\n','').replace(',','').replace('[','').replace(']','')
tokens = reviews_string.split()
len(tokens)

105564

In [29]:
from nltk.corpus import stopwords

# Remove punctuation from each token
table = str.maketrans('', '', string.punctuation)
tokens = [w.translate(table) for w in tokens]

# Remove remaining tokens that are not alphabetic
tokens = [word for word in tokens if word.isalpha()]

# Filter out stop words
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if not w in stop_words]

# Filter out short tokens
tokens = [word for word in tokens if len(word) > 1]

len(tokens)

60915

In [30]:
from operator import itemgetter
from collections import Counter

# Count how many times each word appears
count = Counter(tokens).items()
sorted_count = sorted(count, key = itemgetter(1))
sorted_count.reverse()

In [31]:
from nltk import LancasterStemmer

# initialize Lancaster Stemmer
LS = LancasterStemmer()
lemmatized = []
for l in tokens: lemmatized.append(LS.stem(l))

# Count how many times each word appears
count = Counter(lemmatized).items()
sorted_count = sorted(count, key = itemgetter(1))
sorted_count.reverse()

# Select 5000 most frequent words
top5000 = [i[0] for i in sorted_count[:5000]]
top7000 = [i[0] for i in sorted_count[:7000]]

In [64]:

from nltk.tokenize import word_tokenize

review = []

for sentence in reviews_list :
    sentence = sentence.lower()
    sentence = sentence.replace('.', '').replace("'", '').replace('\n','').replace(',','')
    token_sentence = word_tokenize(sentence)
    
    token_words = []
    for token_word in token_sentence:
        token_word = LS.stem(token_word)
        token_words.append(token_word)
    review.append(token_words)
len(review)

5000

In [38]:
# (Pandas)

word_matrix = []

for i in review: word_matrix.append([1 if j in i else 0 for j in top5000])
features = pd.DataFrame(word_matrix, columns = top5000, index = reviews.index)
features['freshness']=reviews['Freshness']
features.head(5)

Unnamed: 0,film,movy,lik,on,mak,act,ev,story,charact,feel,...,showbo,scrappy,withinsweaty,minorkey,glasss,medy,yel,denzel,behemo,freshness
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
# (Spark)

#spark_word_matrix = []
rdd = spark.sparkContext.parallelize(word_matrix)
spark_features_rdd = sqlContext.createDataFrame(rdd, schema=top5000).rdd
spark_features_rdd.take(1)

[Row(film=0, movy=0, lik=0, on=0, mak=0, act=0, ev=0, story=0, charact=0, feel=0, real=0, much=0, tim=0, direct=0, perform=0, good=0, get=0, com=0, tak=0, doesnt=0, us=0, way=0, comedy=0, nev=0, work=0, man=0, end=0, ful=0, look=0, watch=0, enough=0, lov=0, giv=0, gre=0, littl=0, funny=0, best=0, ther=0, that=0, fin=0, mad=0, lif=0, entertain=0, thing=0, play=0, first=0, hum=0, audy=0, would=0, seem=0, fun=0, tal=0, see=0, wel=0, bet=0, vis=0, could=0, may=0, sens=0, bad=0, two=0, emot=0, year=0, dram=0, cast=0, mom=0, review=1, every=0, might=0, stil=0, plot=0, dont=0, new=0, isnt=0, set=0, many=0, view=0, world=0, beauty=0, want=0, cre=0, far=0, long=0, big=0, tru=0, also=0, find=0, becom=0, mat=0, laugh=0, someth=0, right=0, bit=0, rom=0, scen=0, hor=0, del=0, cinem=0, hard=0, less=0, though=0, star=0, car=0, filmmak=0, part=0, turn=0, thriller=0, anoth=0, interest=0, nee=0, peopl=0, heart=0, say=0, quit=0, script=0, without=0, know=0, sery=0, yet=0, pow=0, show=0, minut=0, span=0, 

In [25]:

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

train, test = train_test_split(features, test_size = 0.1)

cols = train.columns[:-1]

lr = LogisticRegression()
gnb = MultinomialNB()

models = [lr,gnb]

for model in models:
    model.fit(train[cols], train['freshness'])
    y_pred = model.predict(test[cols])

    print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
          .format(test.shape[0], (test["freshness"] != y_pred).sum(),
                  100*(1-(test["freshness"] != y_pred).sum()/test.shape[0]))
         )

Number of mislabeled points out of a total 500 points : 139, performance 72.20%
Number of mislabeled points out of a total 500 points : 147, performance 70.60%


AttributeError: 'LogisticRegression' object has no attribute 'name'