In [0]:
/FileStore/tables/positive_words_txt.txt
File uploaded to /FileStore/tables/negative_words_txt.txt

In [0]:
import pandas as pd

In [0]:
import pyspark

In [0]:
########Predicting food poisoning with Yelp data#####################


from pyspark.sql.functions import *
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

from pyspark.mllib.classification import SVMModel, SVMWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors as MLLibVectors
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import * 
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder


from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

import pandas as pd
import string 
import re

In [0]:
sparkDF1 = spark.read.csv("/FileStore/tables/negative_words_txt.txt")

In [0]:
sparkDF = spark.read.csv("/FileStore/tables/positive_words_txt.txt")

In [0]:

positive_list=(sparkDF.select('_c0').rdd.flatMap(lambda x: x).collect())

In [0]:
positive_list

Out[37]: ['abound',
 'abounds',
 'abundance',
 'abundant',
 'accessable',
 'accessible',
 'acclaim',
 'acclaimed',
 'acclamation',
 'accolade',
 'accolades',
 'accommodative',
 'accomodative',
 'accomplish',
 'accomplished',
 'accomplishment',
 'accomplishments',
 'accurate',
 'accurately',
 'achievable',
 'achievement',
 'achievements',
 'achievible',
 'acumen',
 'adaptable',
 'adaptive',
 'adequate',
 'adjustable',
 'admirable',
 'admirably',
 'admiration',
 'admire',
 'admirer',
 'admiring',
 'admiringly',
 'adorable',
 'adore',
 'adored',
 'adorer',
 'adoring',
 'adoringly',
 'adroit',
 'adroitly',
 'adulate',
 'adulation',
 'adulatory',
 'advanced',
 'advantage',
 'advantageous',
 'advantageously',
 'advantages',
 'adventuresome',
 'adventurous',
 'advocate',
 'advocated',
 'advocates',
 'affability',
 'affable',
 'affably',
 'affectation',
 'affection',
 'affectionate',
 'affinity',
 'affirm',
 'affirmation',
 'affirmative',
 'affluence',
 'affluent',
 'afford',
 'affordable',
 '

In [0]:

negative_list=(sparkDF1.select('_c0').rdd.flatMap(lambda x: x).collect())

In [0]:
negative_list

Out[40]: ['abnormal',
 'abolish',
 'abominable',
 'abominably',
 'abominate',
 'abomination',
 'abort',
 'aborted',
 'aborts',
 'abrade',
 'abrasive',
 'abrupt',
 'abruptly',
 'abscond',
 'absence',
 'absent-minded',
 'absentee',
 'absurd',
 'absurdity',
 'absurdly',
 'absurdness',
 'abuse',
 'abused',
 'abuses',
 'abusive',
 'abysmal',
 'abysmally',
 'abyss',
 'accidental',
 'accost',
 'accursed',
 'accusation',
 'accusations',
 'accuse',
 'accuses',
 'accusing',
 'accusingly',
 'acerbate',
 'acerbic',
 'acerbically',
 'ache',
 'ached',
 'aches',
 'achey',
 'aching',
 'acrid',
 'acridly',
 'acridness',
 'acrimonious',
 'acrimoniously',
 'acrimony',
 'adamant',
 'adamantly',
 'addict',
 'addicted',
 'addicting',
 'addicts',
 'admonish',
 'admonisher',
 'admonishingly',
 'admonishment',
 'admonition',
 'adulterate',
 'adulterated',
 'adulteration',
 'adulterier',
 'adversarial',
 'adversary',
 'adverse',
 'adversity',
 'afflict',
 'affliction',
 'afflictive',
 'affront',
 'afraid',
 'ag

In [0]:
#s3://shivaniproj/parquets/yelp_academic_dataset_review/
review = spark.read.load("s3://shivaniproj/parquets/yelp_academic_dataset_review/",format="parquet")

In [0]:
review.cache()

Out[123]: DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: double, text: string, useful: bigint, user_id: string]

In [0]:
review.show(2)

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|y7Ssng5Bnl75QiZUE...|   0|2016-03-06 08:49:08|    0|lyjloY7uVRsaOZObr...|  4.0|Ever wanted too j...|     0|Vu1P84ifxzFeQ9qRF...|
|4fzjAjzLCnjWfQP1s...|   0|2013-12-04 22:37:18|    0|GfPzojSq1cin_fOdf...|  5.0|Dry-aged bone-in ...|     0|J1n84Z7tRI8Y7nhFU...|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
only showing top 2 rows



In [0]:
# remove punctuation
def remove_punct(text):
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text) 
    return nopunct
    
# binarize rating
def convert_rating(rating):
    rating = int(rating)
    if rating <2: return 0
    else: return 1

# udf
punct_remover = udf(lambda x: remove_punct(x))
rating_convert = udf(lambda x: convert_rating(x))
review_df = review.select('review_id', punct_remover('text'), rating_convert('stars'))
review_df = review_df.withColumnRenamed('<lambda>(text)', 'text')\
                     .withColumn('label', review_df["<lambda>(stars)"].cast(IntegerType()))\
                     .drop('<lambda>(stars)')\
                     .limit(500)

In [0]:
review_df.show(5)

+--------------------+--------------------+-----+
|           review_id|                text|label|
+--------------------+--------------------+-----+
|lyjloY7uVRsaOZObr...|Ever wanted too j...|    1|
|GfPzojSq1cin_fOdf...|Dry aged bone in ...|    1|
|lFphcMI6ZN2uFA8wg...|I lived in Boston...|    1|
|oQDDv49nI55lHzsI3...|Some good things ...|    1|
|78ZDs404elxc0f1i-...|Great place to gr...|    1|
+--------------------+--------------------+-----+
only showing top 5 rows



In [0]:
review_df_itr=review_df.toPandas()
texts=[]
for review in review_df_itr.text:
    texts.append(review)

In [0]:
texts

Out[91]: ['Ever wanted too just take a bite out of a fully grown hogs gut  That is what the five guys experience is about for me  The burgers are unreal and extremely good  Also they are generous with the fries that overflow into the bag  but the amount of fries used to be better  But the one thing that is distracting from the experience is all of the peanut corpses littered around the place  It is nut genocide  Makes me feel very uncomfortable ',
 'Dry aged bone in ribeye  Best steak I ve ever eaten   The corn pudding was also delicious and exceeded expectations  We loved the lobster Napoleon  but I wouldn t order it again unless we were splurging  The steak is  so delicious and filling that you won t need an appetizer  The truffle mashed potatoes were delicious  but they were also what we expected from this quality of restaurant  The mac and cheese was good  but not our favorite kind  It tasted like it was trying too hard   I personally would pass on the cheesecake  It was good  but 

In [0]:
def tokenize(texts):
    tokenizer = nltk.RegexpTokenizer(r'\w+')

    texts_tokens = []
    for i, val in enumerate(texts):
        text_tokens = tokenizer.tokenize(val.lower())

        for i in range(len(text_tokens) - 1, -1, -1):
            if len(text_tokens[i]) < 4:
                del(text_tokens[i])

        texts_tokens.append(text_tokens)
        
    return texts_tokens

In [0]:
import nltk
texts_tokens = tokenize(texts)

In [0]:
texts_tokens

Out[26]: [['ever',
  'wanted',
  'just',
  'take',
  'bite',
  'fully',
  'grown',
  'hogs',
  'that',
  'what',
  'five',
  'guys',
  'experience',
  'about',
  'burgers',
  'unreal',
  'extremely',
  'good',
  'also',
  'they',
  'generous',
  'with',
  'fries',
  'that',
  'overflow',
  'into',
  'amount',
  'fries',
  'used',
  'better',
  'thing',
  'that',
  'distracting',
  'from',
  'experience',
  'peanut',
  'corpses',
  'littered',
  'around',
  'place',
  'genocide',
  'makes',
  'feel',
  'very',
  'uncomfortable'],
 ['aged',
  'bone',
  'ribeye',
  'best',
  'steak',
  'ever',
  'eaten',
  'corn',
  'pudding',
  'also',
  'delicious',
  'exceeded',
  'expectations',
  'loved',
  'lobster',
  'napoleon',
  'wouldn',
  'order',
  'again',
  'unless',
  'were',
  'splurging',
  'steak',
  'delicious',
  'filling',
  'that',
  'need',
  'appetizer',
  'truffle',
  'mashed',
  'potatoes',
  'were',
  'delicious',
  'they',
  'were',
  'also',
  'what',
  'expected',
  'from',


In [0]:
def removeSW(texts_tokens):
    stopWords = set(stopwords.words('english'))
    texts_filtered = []

    for i, val in enumerate(texts_tokens):
        text_filtered = []
        for w in val:
            if w not in stopWords:
                text_filtered.append(w)
        texts_filtered.append(text_filtered)
        
    return texts_filtered

In [0]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Out[28]: True

In [0]:
stopwords.words('english')

Out[30]: ['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both

In [0]:
texts_filtered = removeSW(texts_tokens)

In [0]:
label=[]
for sentence in texts_filtered:
    for tokens in sentence:
        lp=[]
        ln=[]
        if tokens in positive_list:
            lp.append(tokens)
        if tokens in negative_list:
            ln.append(tokens)
    if len(lp)>len(ln):
        label.append("P")
    else:
        label.append("N")

In [0]:
final_label=label

In [0]:
type(texts_filtered)


Out[59]: list

In [0]:
dataframe = spark.createDataFrame(zip(texts,label), ["text","label"])

In [0]:
dataframe.show(10,truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(texts).toarray()


In [0]:
df10=dataframe.toPandas()

In [0]:
y=df10.iloc[:,-1].values

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [0]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

Out[104]: SVC(kernel='linear', random_state=0)

In [0]:
import numpy as np
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[['P' 'N']
 ['N' 'P']
 ['N' 'N']
 ['N' 'N']
 ['N' 'P']
 ['N' 'N']
 ['N' 'N']
 ['P' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['P' 'N']
 ['P' 'N']
 ['N' 'N']
 ['N' 'N']
 ['P' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'P']
 ['N' 'N']
 ['N' 'N']
 ['N' 'P']
 ['N' 'N']
 ['N' 'N']
 ['P' 'P']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['P' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['P' 'P']
 ['N' 'N']
 ['N' 'P']
 ['N' 'N']
 ['P' 'N']
 ['P' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'P']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'P']
 ['N' 'N']
 ['P' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['P' 'P']
 ['N' 'P']
 ['P' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['P' 'N']
 ['N' 'P']
 ['N' 'N']
 ['P' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['N' 'N']
 ['P' 'N']
 ['N' 'N']
 ['P' 'N']
 ['P' 'P']
 ['N' 'N']
 ['N' 'N']
 ['P' 'N']

In [0]:

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[70 16]
 [ 9  5]]
Out[106]: 0.75

In [0]:
new_review = 'It s like the Walmart of sports merchandise  The prices aren t that great  it s out in the boonies  it s crowded  and Nike gear isn t that great  Some people obviously like it  but it was not for me '
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

['N']


In [0]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha=0.8)
mnb.fit(X_train,y_train)

Out[113]: MultinomialNB(alpha=0.8)

In [0]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB(alpha=0.5)
bnb.fit(X_train,y_train)
y_pred_bnb=mnb.predict(X_test)

In [0]:
y_pred_bnb

Out[115]: array(['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'P', 'N', 'P',
       'N', 'N', 'N', 'P', 'N', 'P', 'N', 'P', 'N', 'N', 'N', 'N', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'P', 'N', 'N', 'N', 'P',
       'N', 'P', 'N', 'N', 'P', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'P',
       'N', 'N', 'P', 'N', 'N', 'N', 'N', 'N', 'N'], dtype='<U1')

In [0]:
accuracy_score(y_test, y_pred_bnb)

Out[117]: 0.79

In [0]:
dataframe.show(10)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|Ever wanted too j...|    N|
|Dry aged bone in ...|    N|
|I lived in Boston...|    N|
|Some good things ...|    N|
|Great place to gr...|    N|
|It s like the Wal...|    P|
|The food was incr...|    N|
|Haven t been so h...|    P|
|These guys helped...|    N|
|great service and...|    P|
+--------------------+-----+
only showing top 10 rows



In [0]:
df11=review_df.filter(review_df.label=="0")

In [0]:
df11.show(5)

+--------------------+--------------------+-----+
|           review_id|                text|label|
+--------------------+--------------------+-----+
|Lis1PdpSAKYO2Xphi...|It s like the Wal...|    0|
|HtlSBk4RQAVPqfW7Z...|Well this place h...|    0|
|rE4fSpd05jpGMbe4S...|I went to this pl...|    0|
|xLRe59g6MnWxXaJny...|Ordered a fish sa...|    0|
|xmaEZy2Add1VpUnF1...|The fact they got...|    0|
+--------------------+--------------------+-----+
only showing top 5 rows



In [0]:
review_df.show(5)

+--------------------+--------------------+-----+
|           review_id|                text|label|
+--------------------+--------------------+-----+
|lyjloY7uVRsaOZObr...|Ever wanted too j...|    1|
|GfPzojSq1cin_fOdf...|Dry aged bone in ...|    1|
|lFphcMI6ZN2uFA8wg...|I lived in Boston...|    1|
|oQDDv49nI55lHzsI3...|Some good things ...|    1|
|78ZDs404elxc0f1i-...|Great place to gr...|    1|
+--------------------+--------------------+-----+
only showing top 5 rows



In [0]:
dataframe.show(10)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|Ever wanted too j...|    N|
|Dry aged bone in ...|    N|
|I lived in Boston...|    N|
|Some good things ...|    N|
|Great place to gr...|    N|
|It s like the Wal...|    P|
|The food was incr...|    N|
|Haven t been so h...|    P|
|These guys helped...|    N|
|great service and...|    P|
+--------------------+-----+
only showing top 10 rows



In [0]:
df.withColumn("bonus_percent").show()