In [0]:
from pyspark.sql.functions import *
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

from pyspark.mllib.classification import SVMModel, SVMWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors as MLLibVectors
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import * 
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder


from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

import pandas as pd
import string 
import re 

In [0]:
# Load the Yelp review JSON file from S3 into a Spark DataFrame.
review = spark.read.format("json").load(
    "s3://yelp-project-special-26/data-source/yelp_academic_dataset_review.json"
)

In [0]:
# Create the DBFS directory that the install script is written to below.
dbutils.fs.mkdirs("dbfs:/databricks/scripts/")

In [0]:
# Write the shell script that installs NLTK and downloads its data.
# The shebang has to be the very first bytes of the file and every command
# needs its own line: with everything on one line after '#', the whole script
# is a single shell comment and does nothing.
dbutils.fs.put(
    "/databricks/scripts/nltk-install.sh",
    """#!/bin/bash
python -m pip install nltk
python -m pip install --upgrade pip
python -m nltk.downloader all
""",
    True,  # overwrite the file if it already exists
)

In [0]:
# List the script file that was just written, to confirm it exists.
display(dbutils.fs.ls("/databricks/scripts/nltk-install.sh"))

In [0]:
# NOTE(review): unpinned install; s3fs is not imported in the visible cells -
# presumably needed for S3 access from pandas. Confirm and pin a version.
!pip install s3fs

In [0]:
# Mark the review DataFrame for caching so later actions can reuse it.
review.cache()

In [0]:
# remove punctuation
# Compiled once at definition time (instead of on every call): matches any
# ASCII punctuation character, digit, carriage return, tab or newline.
_PUNCT_DIGIT_WS_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def remove_punct(text):
    """Replace every punctuation character, digit, CR, tab and newline with a space.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` (e.g. a null column value) is passed through.

    Returns
    -------
    str or None
        ``text`` with each matched character replaced by a single space;
        the length of the string is unchanged.
    """
    if text is None:
        return None
    return _PUNCT_DIGIT_WS_RE.sub(" ", text)
    
# binarize rating
def convert_rating(rating):
    """Return 1 when the rating, truncated to an int, is below 2; otherwise 0."""
    return 1 if int(rating) < 2 else 0

# udf
# Declare each UDF's return type and alias the output columns directly,
# rather than relying on the default string return type plus a cast and on
# Spark's auto-generated '<lambda>(col)' column names.
punct_remover = udf(remove_punct, 'string')
rating_convert = udf(convert_rating, IntegerType())

# Resulting columns: review_id, text (cleaned), label (1 = rating below 2, else 0).
# Only 5000 rows are kept.
review_df = review.select(
    'review_id',
    punct_remover('text').alias('text'),
    rating_convert('stars').alias('label'),
).limit(5000)

In [0]:
# Print the first rows of the cleaned, labelled sample.
review_df.show()

In [0]:
# Keep only the rows labelled 1 (rating below 2).
df = review_df.where(review_df["label"] == '1')

In [0]:
#df.show(10)

In [0]:
# Collect the filtered rows to the driver as a pandas DataFrame.
df_badreviews=df.toPandas()

In [0]:
# Plain Python list of the review texts.
# Built without a loop variable named `review`: that name is the Spark
# DataFrame loaded at the top of the notebook, and rebinding it to a string
# here breaks later cells that call review.show() / review.toPandas().
texts = list(df_badreviews.text)

In [0]:
# Show a handful of texts rather than dumping the whole list into the output.
texts[:5]

In [0]:
#Cleaning techniques #1 : Tokenization

In [0]:
import pandas as pd
import json
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

In [0]:
def tokenize(texts, min_length=4):
    """Lower-case each text, split it into word tokens and drop short tokens.

    Parameters
    ----------
    texts : iterable of str
        Documents to tokenize. Pass a list even for a single document.
    min_length : int, default 4
        Tokens with fewer characters than this are discarded. The default
        matches the threshold that was previously hard-coded.

    Returns
    -------
    list of list of str
        One token list per input document, in input order.
    """
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    return [
        [token for token in tokenizer.tokenize(text.lower()) if len(token) >= min_length]
        for text in texts
    ]

In [0]:
# Tokenize every review text.
texts_tokens = tokenize(texts)

# Show the token list of the first document.
texts_tokens[:1]

In [0]:
# NOTE(review): nltk is already imported and used by the cells above, so on a
# fresh environment this install presumably has to run before them - confirm
# ordering and pin a version.
!pip install nltk

In [0]:
#Cleaning techniques #2 : Removing stopwords

In [0]:
def removeSW(texts_tokens):
    """Drop English stop words from every token list.

    Takes a list of token lists and returns a new list of token lists in the
    same order, keeping only tokens not found in NLTK's English stop-word list.
    """
    english_stopwords = set(stopwords.words('english'))
    return [
        [token for token in tokens if token not in english_stopwords]
        for tokens in texts_tokens
    ]

In [0]:
# Fetch the NLTK stop-word corpus that removeSW() reads.
nltk.download('stopwords')

In [0]:
# Remove stop words from every tokenized document and show the first result.
texts_filtered = removeSW(texts_tokens)
texts_filtered[:1]

In [0]:
# Number of documents after stop-word removal.
len(texts_filtered)

In [0]:
#Cleaning techniques #3 : Lemma

In [0]:
def lemma(texts_filtered):
    """Lemmatize every token of every document with WordNet, as a verb (pos="v").

    Takes a list of token lists and returns a new list of token lists with the
    same shape and order.
    """
    lemmatizer = WordNetLemmatizer()

    def as_verb(word):
        return lemmatizer.lemmatize(word, pos="v")

    return [list(map(as_verb, tokens)) for tokens in texts_filtered]

In [0]:
# Fetch the WordNet data used by WordNetLemmatizer in lemma().
nltk.download('wordnet')

In [0]:
# Lemmatize the stop-word-filtered documents.
texts_lem = lemma(texts_filtered)

# Show the first lemmatized document.
texts_lem[:1]

In [0]:
#BIGRAMS AND TRIGRAMS


In [0]:
# NOTE(review): 'wordnet' is already downloaded by an earlier cell; this
# repeat looks redundant - confirm it can be removed.
nltk.download('wordnet')

In [0]:
# Number of lemmatized documents.
len(texts_lem)

In [0]:
#Joining the tokens of each document back into a single string

In [0]:
# One space-separated string per document, as the scikit-learn vectorizers expect.
# Written without a variable called `string`: that name would shadow the
# `string` module imported at the top of the notebook.
texts_string = [' '.join(tokens) for tokens in texts_lem]

In [0]:
# NOTE(review): 'stopwords' is already downloaded by an earlier cell; this
# repeat looks redundant - confirm it can be removed.
nltk.download('stopwords')

In [0]:
# Preview the first 10 joined documents.
texts_string[:10]

In [0]:
#LDA analysis with Sklearn

In [0]:
import pandas as pd
import json
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

In [0]:
def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart per topic showing its highest-weighted terms.

    Modified from the scikit-learn topic-extraction example.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``components_`` with one row of term weights per topic.
    feature_names : sequence of str
        Maps a column index of ``components_`` to the term it represents.
    n_top_words : int
        Number of terms shown for each topic.
    title : str
        Figure-level title.
    """
    n_cols = 5
    topic_count = len(model.components_)
    # Size the grid from the model instead of assuming exactly 10 topics
    # (a fixed 2x5 grid raises IndexError for an 11th topic). Ten topics still
    # give the original 2x5 layout at 15x7 inches.
    n_rows = max(1, -(-topic_count // n_cols))  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 3.5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, largest first.
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[idx] for idx in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 15})
        ax.invert_yaxis()  # strongest term at the top
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in ('top', 'right', 'left'):
            ax.spines[side].set_visible(False)
        ax.tick_params(bottom=False)
        ax.set(xticklabels=[])

    # Hide grid cells that have no topic to show.
    for unused_ax in axes[topic_count:]:
        unused_ax.set_visible(False)

    fig.suptitle(title, fontsize=15)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [0]:
# Bag-of-words vectorizer with document-frequency bounds max_df=0.90 and min_df=5.
vectorizer = CountVectorizer(max_df=0.90, min_df=5)


In [0]:
# Document-term count matrix for the joined documents.
X = vectorizer.fit_transform(texts_string)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# (n_documents, n_terms) - read straight from the sparse matrix; densifying it
# just to get the shape wastes memory.
X.shape

In [0]:
n_topics = 10

# Fixed seed so the topics are reproducible between runs, as the NMF model
# further down already is (random_state=1).
lda = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=12,
        random_state=1
)

# Fit the model; the returned document-topic matrix is shown as the cell output.
lda.fit_transform(X)

In [0]:
# Iteration count reported by the fitted LDA model.
lda.n_iter_

In [0]:
#Displaying result of the analysis.

In [0]:
def display_topics(model, feature_names, no_top_words):
    """Print, for each row of ``model.components_``, its top-weighted terms.

    Each topic produces two lines: a "Topic <index>:" header and the
    ``no_top_words`` highest-weighted terms, strongest first, joined by spaces.
    """
    for topic_idx, topic in enumerate(model.components_):
        # Term indices ordered by descending weight, cut to the requested count.
        ranked_indices = topic.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[term_idx] for term_idx in ranked_indices]
        print("Topic {}:".format(topic_idx))
        print(" ".join(top_terms))

# Print the 10 strongest terms of each scikit-learn LDA topic.
no_top_words = 10
display_topics(lda, feature_names, no_top_words)

In [0]:
# Bar charts of the top terms per scikit-learn LDA topic.
plot_top_words(lda, feature_names, no_top_words,'Topics in LDa')

In [0]:
#NMF analysis with Sklearn

In [0]:
# TF-IDF features with the same document-frequency bounds as the count
# vectorizer, plus scikit-learn's built-in English stop-word list.
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, 
    min_df=5,  
    stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(texts_string)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [0]:
from sklearn.decomposition import NMF


In [0]:
no_topics = 10

# Run NMF
# Factorise the TF-IDF matrix into `no_topics` components with a fixed seed
# and NNDSVD initialisation.
# NOTE(review): the `alpha` keyword was replaced by `alpha_W` / `alpha_H` in
# newer scikit-learn releases - confirm against the installed version.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd')
nmf.fit_transform(tfidf)

In [0]:
# Print the 10 strongest terms of each NMF topic.
no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)

In [0]:
# Bar charts of the top terms per NMF topic.
plot_top_words(nmf, tfidf_feature_names, no_top_words,'Topics in NMF')

In [0]:
#LDA BoW and LDA TF-IDF with gensim

In [0]:

import gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [0]:
# NOTE(review): pyLDAvis is imported by the cell above, so on a fresh
# environment this install presumably has to run first - confirm ordering and
# pin a version.
!pip install pyLDAvis

In [0]:
# NOTE(review): gensim is imported two cells above, so on a fresh environment
# this upgrade presumably has to run first - confirm ordering and pin a version.
!pip install -U gensim

In [0]:
#pip install --upgrade pip

In [0]:
# Build the gensim token <-> id dictionary from the lemmatized documents.
dictionary = gensim.corpora.Dictionary(texts_lem)
# Size of the dictionary's collection-frequency table.
len(dictionary.cfs)

In [0]:
# Prune the dictionary in place with no_below=15 and no_above=0.5, then show
# the size of the collection-frequency table again for comparison.
dictionary.filter_extremes(no_below=15, no_above=0.5)
len(dictionary.cfs)

In [0]:
# Convert every lemmatized document into its bag-of-words representation.
bow_corpus = [dictionary.doc2bow(doc) for doc in texts_lem]

In [0]:
# Report the vocabulary size and the number of documents in the BoW corpus.
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(bow_corpus))

In [0]:
# Fit a gensim TF-IDF model on the BoW corpus and apply it to that corpus.
# Note: this rebinds `tfidf`, which an earlier cell used for the scikit-learn
# TF-IDF matrix.
tfidf = gensim.models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [0]:
#LDA using BoW

In [0]:
# Train one BoW LDA model for each topic count from 2 to 14 and record its
# u_mass coherence, in that order.
lda_model_coherence = []
for i in range (2,15):
    lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=i, id2word=dictionary, passes=2, workers=4)
    cm = gensim.models.CoherenceModel(model=lda_model, corpus=bow_corpus, coherence='u_mass')
    coherence = cm.get_coherence()
    lda_model_coherence.append(coherence)

In [0]:
# Coherence of the BoW LDA models against the number of topics (2..14).
plt.plot(range(2, 15),lda_model_coherence)
plt.xlabel('Number of topics')
plt.ylabel('Coherence score')
plt.title('How many topics ? (Closer to 0 = better)')
plt.show()

In [0]:
# Train the BoW LDA model kept for the rest of this section, with 8 topics.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=8, id2word=dictionary, passes=2, workers=4)

In [0]:
# Print every topic of the BoW model with its index.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

In [0]:
# Prepare and display the pyLDAvis visualisation of the BoW model.
vis = gensimvis.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

In [0]:
#LDA using TF-IDF

In [0]:
# Train one LDA model on the TF-IDF corpus for each topic count from 2 to 14
# and record its u_mass coherence; coherence is computed against bow_corpus.
lda_model_tfidf_coherence = []
for i in range (2,15):
    lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=i, id2word=dictionary, passes=2, workers=4)
    cm = gensim.models.CoherenceModel(model=lda_model_tfidf, corpus=bow_corpus, coherence='u_mass')
    coherence = cm.get_coherence()
    lda_model_tfidf_coherence.append(coherence)

In [0]:
# Coherence of the TF-IDF LDA models against the number of topics (2..14).
# Plot the TF-IDF scores computed in the cell above; the BoW list
# `lda_model_coherence` was plotted here by mistake.
plt.plot(range(2, 15), lda_model_tfidf_coherence)
plt.xlabel('Number of topics')
plt.ylabel('Coherence score')
plt.title('How many topics ? (Closer to 0 = better)')
plt.show()

In [0]:
# Train the 8-topic LDA model on the TF-IDF corpus and print every topic.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=8, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

In [0]:
# Prepare and display the pyLDAvis visualisation of the TF-IDF model.
vis = gensimvis.prepare(topic_model=lda_model_tfidf, corpus=corpus_tfidf, dictionary=dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

In [0]:
#Scoring topics on documents

In [0]:
# For the first three documents, print every topic the BoW model assigns,
# highest score first, with that topic's top 5 terms.
for i in range(0,3):
    print('### Scoring the document', i)
    for index, score in sorted(lda_model[bow_corpus[i]], key=lambda tup: -1*tup[1]):
        print("Score: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 5)), '\n')

In [0]:
# Same report for the TF-IDF model.
# NOTE(review): lda_model_tfidf was trained on corpus_tfidf but is queried
# here with raw BoW vectors - confirm this is intended.
for i in range(0,3):
    print('### Scoring the document', i)
    for index, score in sorted(lda_model_tfidf[bow_corpus[i]], key=lambda tup: -1*tup[1]):
        print("Score: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 5)), '\n')

In [0]:
#New reviews testing

In [0]:
def preprocess(raw_text):
    """Run the cleaning pipeline: tokenize, then remove stop words, then lemmatize.

    ``raw_text`` is handed to tokenize() unchanged; the result of lemma() is returned.
    """
    return lemma(removeSW(tokenize(raw_text)))

In [0]:
unseen_document = ['I had to wait 2 hours. It was so long. I will never come back here.', 'Food was horrible. My pizza was burn and was late.', 'It tasted bad. It is not good quality. Who is cooking here ?']
preprocessed_doc = preprocess(unseen_document)
# Keep the unseen documents under their own name: rebinding `bow_corpus` here
# would replace the training corpus that other cells read.
unseen_bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_doc]

print('There is', len(preprocessed_doc), 'reviews in the unseen document. We are going to calculate the score for the best topic of each of them.\n---------------\n')

for i, unseen_bow in enumerate(unseen_bow_corpus):
    topic_scores = lda_model[unseen_bow]
    if not topic_scores:
        # The model assigned no topic to this document; nothing to report.
        continue
    # Highest-scoring topic; on ties max() keeps the first one.
    index, score = max(topic_scores, key=lambda tup: tup[1])
    print("# Review", i, ": best score: {}\t For topic: {}".format(score, lda_model.print_topic(index, 5)), '\n')

In [0]:
# NOTE(review): called with no arguments, so no specific resource is requested;
# 'stopwords' and 'wordnet' are already downloaded by earlier cells. Presumably
# this opens NLTK's interactive downloader, which would block an unattended
# run - confirm whether this call is still needed.
nltk.download()

In [0]:
# Print the first 10 rows of `review`.
review.show(10)

In [0]:
# Collect `review` to the driver as a pandas DataFrame.
# NOTE(review): no limit or filter is applied to `review` here, so this may be
# very large - confirm it fits in driver memory. It also rebinds `df`, which an
# earlier cell used for the filtered Spark DataFrame.
df=review.toPandas()

In [0]:
from gensim import models

In [0]:
#BIGRAMS AND TRIGRAMS
# Fit the bigram phrase model on the lemmatized documents, then fit the
# trigram phrase model on the bigram-transformed documents.
bigrams_phrases=gensim.models.Phrases(texts_lem,min_count=5,threshold=100)
trigram_phrases=gensim.models.Phrases(bigrams_phrases[texts_lem],threshold=100)

# Wrap both fitted models in Phraser objects, used by make_bigrams / make_trigrams.
bigram=gensim.models.phrases.Phraser(bigrams_phrases)
trigram=gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Apply the fitted ``bigram`` phraser to every document in ``texts``.

    Returns a list (one transformed document per input document) rather than
    a single-use generator, so the result can be inspected or iterated more
    than once without silently coming back empty.
    """
    return [bigram[doc] for doc in texts]

def make_trigrams(texts):
    """Apply the ``bigram`` phraser, then the ``trigram`` phraser, to every document.

    Returns a list (one transformed document per input document) rather than
    a single-use generator, so the result can be inspected or iterated more
    than once without silently coming back empty.
    """
    return [trigram[bigram[doc]] for doc in texts]

# Phrase-transform the lemmatized documents.
# NOTE(review): `data_bigrams` is not referenced again in the visible cells -
# confirm whether it is needed.
data_bigrams = make_bigrams(texts_lem)
data_bigrams_trigrams = make_trigrams(texts_lem)

#print(data_bigrams_trigrams)





In [0]:
# Materialise the phrase-transformed documents as a list.
data_bigrams_trigrams = list(data_bigrams_trigrams)
# Show a few documents rather than dumping the whole corpus into the output.
data_bigrams_trigrams[:5]



In [0]:
#TF-IDF REMOVAL
from gensim.models import TfidfModel
from gensim import corpora

# Dictionary built from the phrase-transformed documents.
id2word=corpora.Dictionary(data_bigrams_trigrams)

# Note: rebinds `texts`, which earlier held the raw review strings.
texts = data_bigrams_trigrams

# Bag-of-words corpus over the phrase-transformed documents.
corpus =[id2word.doc2bow(text) for text in texts]

# First 20 (token id, count) pairs of the first document.
print(corpus[0][0:20])

# Note: rebinds `tfidf` to a TF-IDF model fitted on this corpus.
tfidf=TfidfModel(corpus,id2word=id2word)

# Terms scoring below this TF-IDF value are removed from each document below.
low_value = 0.03

# Terms dropped from any document (as strings), collected for inspection.
words = []

# Filter each document's bag-of-words in place, removing low-value terms and
# terms that the TF-IDF model leaves out.
for i, bow in enumerate(corpus):
    doc_tfidf = tfidf[bow]  # score the document once and reuse the result
    tfidf_ids = {term_id for term_id, _ in doc_tfidf}
    low_value_words = [term_id for term_id, value in doc_tfidf if value < low_value]
    # Terms with a TF-IDF score of 0 are missing from the model output.
    # Compute them BEFORE building the drop list, so each document is handled
    # with its own missing terms rather than the previous document's.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    # Set membership keeps the per-document filter linear in its length.
    drop_ids = set(drops)
    corpus[i] = [entry for entry in bow if entry[0] not in drop_ids]


In [0]:
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=30,
#                                            random_state=100,
#                                            update_every=1,
#                                            chunksize=100,
#                                            passes=100,
#                                            alpha="auto")

In [0]:
# Train a 10-topic LDA model on the filtered n-gram corpus with a fixed seed.
# Note: this rebinds `lda_model`, which previously held the 8-topic BoW model.
lda_model = gensim.models.LdaMulticore(corpus, num_topics=10, id2word=id2word, passes=100,random_state=100, workers=4,chunksize=100)

In [0]:
# Print every topic of the current `lda_model` with its index.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

In [0]:
import copy

# filter_extremes() prunes a dictionary in place and reassigns token ids.
# `corpus` and `lda_model` were built with the current ids of `id2word`, so
# pruning `id2word` itself would leave them out of step with the dictionary
# used for visualisation. Inspect the effect of pruning on a copy instead.
id2word_filtered = copy.deepcopy(id2word)
id2word_filtered.filter_extremes(no_below=15, no_above=0.5)
# Report the size of the dictionary that was just pruned; the unrelated
# `dictionary` was measured here before.
len(id2word_filtered.cfs)

In [0]:
# Prepare and display the pyLDAvis visualisation of the n-gram LDA model.
vis = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

In [0]:
# A gensim Dictionary is built from tokenized documents (lists of strings).
# `corpus` holds bag-of-words (id, count) tuples, which Dictionary cannot
# consume, so build it from the phrase-transformed token lists instead.
dictionary = gensim.corpora.Dictionary(data_bigrams_trigrams)


In [0]:
# NOTE(review): pyLDAvis is already installed by an earlier cell and has been
# used above; this repeat looks redundant - confirm it can be removed.
!pip install pyLDAvis

In [0]:
!pip uninstall pandas
!pip install pandas==1.1.5

In [0]:
#from pyLDAvis import gensim
# Prepare the visualisation with mds='mmds' and R=30, and show it as the cell output.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word, mds='mmds' , R=30)
vis

In [0]:
# Same visualisation, shown through pyLDAvis.display().
# The dictionary is passed with the `dictionary=` keyword, as in the other
# prepare() calls in this notebook. The previous call put a positional
# argument after keyword arguments (a SyntaxError) and used `id2word=`, which
# is not the keyword prepare() uses for the dictionary.
vis = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word, mds="mmds", R=30)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)