In [None]:
%load_ext autoreload
%autoreload 2

import ast
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim.test.utils import get_tmpfile
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import json

%matplotlib inline

pd.set_option('display.max_columns', 500)

import pandas as pd 
import numpy as np 
import os
import re


## Load scraped data

In [None]:
# Function to read a .jl (JSON Lines) file
def read_jl_file(file_name):
    """Load a JSON Lines file into a DataFrame, one row per JSON record.

    Parameters
    ----------
    file_name : str or path-like
        Path to a file holding one JSON document per line.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_name, 'rb') as f:
        for line in f:
            line = line.strip()
            # Blank lines are skipped rather than treated as end-of-file, so
            # records that follow one are no longer silently dropped.
            if line:
                records.append(json.loads(line))
    return pd.DataFrame(records)

# Reading file
df = read_jl_file('./all_data2.jl')

In [None]:
df.head()

## Cleaning Utilities

In [None]:
characters_to_remove = ["@", "/", "#", ".", ",", "!", "?", "(", ")", "-", "_", "\"", ":"]
transformation_dict = {initial:" " for initial in characters_to_remove}

with_accent = ['é', 'è', 'ê','à','ù','ô']
without_accent = ['e', 'e','e', 'a','u','o']
accent_dict = {before:after for before, after in zip(with_accent, without_accent)}

contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}


def _get_contractions(contraction_dict):
    contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_dict, contraction_re
contractions, contractions_re = _get_contractions(contraction_dict)

def replace_contractions(text):
    """Return ``text`` with every match of ``contractions_re`` swapped for its entry in ``contractions``."""
    return contractions_re.sub(lambda hit: contractions[hit.group(0)], text)

## Preprocessing

In [None]:
def filter(x):
    """Normalise one raw review string into lower-case, punctuation-free text.

    Steps, in order: lower-case; replace the characters listed in
    ``transformation_dict`` with spaces; strip accents via ``accent_dict``;
    replace every run of characters other than ASCII letters, digits and
    apostrophes with one space; expand contractions.

    Note: the name shadows the built-in ``filter``; it is kept to preserve
    the existing interface.
    """
    # Lower-case first: the accent table only lists lower-case letters and the
    # contraction keys are lower-case (the "I..." keys have lower-case twins),
    # so inputs such as "Don't" or "É" were previously left unexpanded/blanked.
    lowered = x.lower()
    no_punct = lowered.translate(str.maketrans(transformation_dict))
    no_accent = no_punct.translate(str.maketrans(accent_dict))
    alnum_only = re.sub('[^A-Za-z0-9\']+', ' ', no_accent)
    return replace_contractions(alnum_only)
df['Review_filtered'] = df.Content.apply(filter)

In [None]:
# Build the tokenizer once instead of once per review.
# NOTE(review): TweetTokenizer is not imported anywhere in the visible notebook
# (presumably nltk.tokenize.TweetTokenizer) — confirm and import it in the first cell.
tweet_tokenizer = TweetTokenizer()
df['Review_split'] = df.Review_filtered.apply(lambda x: tweet_tokenizer.tokenize(str(x)))

In [None]:
# NOTE(review): nltk, WordNetLemmatizer and wordnet are not imported anywhere in
# the visible notebook (presumably `import nltk`, `nltk.stem.WordNetLemmatizer`
# and `nltk.corpus.wordnet`) — confirm and import them in the first cell.
lemmatizer = WordNetLemmatizer()
def stem(words):
    """Lemmatise a list of tokens using their part-of-speech tags.

    Each token is tagged with ``nltk.pos_tag`` and lemmatised with the WordNet
    part of speech matching the tag prefix: NN -> noun, VB -> verb,
    JJ -> adjective, RB -> adverb. Tokens with any other tag are returned
    unchanged.

    Parameters
    ----------
    words : list of str
        Tokens of one review.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    lemmas = []
    for word, tag in nltk.pos_tag(words):
        if tag.startswith("NN"):
            lemmas.append(lemmatizer.lemmatize(word, wordnet.NOUN))
        elif tag.startswith('VB'):
            lemmas.append(lemmatizer.lemmatize(word, wordnet.VERB))
        elif tag.startswith('JJ'):
            # Adjectives were previously lemmatised as nouns.
            lemmas.append(lemmatizer.lemmatize(word, wordnet.ADJ))
        elif tag.startswith('RB'):
            # Adverbs were previously lemmatised as adjectives.
            lemmas.append(lemmatizer.lemmatize(word, wordnet.ADV))
        else:
            lemmas.append(word)
    return lemmas
df['Review_Stem'] = df.Review_split.apply(stem)

In [None]:
from stop_words import get_stop_words
stop_words = get_stop_words('en')
my_stop_words = []
all_stop_words =  stop_words + my_stop_words

In [None]:
def filter_stopwords(words, stop_words=None):
    """Return the tokens of ``words`` that are not stop words, order preserved.

    Parameters
    ----------
    words : iterable of str
        Tokens of one review.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``all_stop_words``
        is used, as before.

    Returns
    -------
    list of str
    """
    # A set gives constant-time membership tests; the original scanned the
    # stop-word list once per token.
    excluded = set(all_stop_words if stop_words is None else stop_words)
    return [word for word in words if word not in excluded]
df['Review-stopword'] = df.Review_Stem.apply(lambda x:filter_stopwords(x))

In [None]:
df.head()

## Checkpoint of preprocessing: persist to HardDisk

In [None]:
df.to_csv("Preprocessed_reviews.csv")

## Load preprocessed df from HDD

In [None]:
df = pd.read_csv("Preprocessed_reviews.csv")

In [None]:
# Shuffle the rows. A fixed random_state keeps the row order, and every feature
# matrix / label vector derived from it, reproducible across kernel restarts.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)

In [None]:
df = df.loc[:,['Review_filtered', 'Rating', 'Review-stopword']]

In [None]:
# Each cell holds the repr of a token list (the list column was written out by
# to_csv). literal_eval parses it properly; splitting on "', '" by hand merged
# neighbouring tokens whenever one of them contained an apostrophe, because
# such tokens are repr'd with double quotes.
reviews = [ast.literal_eval(review) for review in df['Review-stopword']]

In [None]:
def finalize_cleaning(reviews):
    """Strip leftover punctuation from each token and drop very short tokens.

    Back-ticks, dots, apostrophes, spaces and double quotes are removed from
    every token; tokens with fewer than two characters afterwards are dropped.

    Parameters
    ----------
    reviews : iterable of str
        Tokens of a single review (despite the parameter name).

    Returns
    -------
    list of str
    """
    # One character class replaces the former chain of substitutions. The
    # separate r"\.\w" pass could never match because all dots had already
    # been removed, and the apostrophe pass was applied twice.
    cleaned = [re.sub(r"[`.' \"]", "", word) for word in reviews]
    return [word for word in cleaned if len(word) > 1]

final_reviews = [finalize_cleaning(review) for review in reviews]

In [None]:
# Drop tokens that are purely numeric (tokens that merely contain digits stay)
# and tokens that are only one character long.
final_reviews = [
    [token for token in doc if not token.isnumeric() and len(token) > 1]
    for doc in final_reviews
]

In [None]:
# dimensionality check
print(len(final_reviews))
print(final_reviews[0])

## ALTERNATIVE SOURCE (Capgemini)

In [None]:
df = pd.read_csv("clean_full.csv.gzip", ### path to your dataset
                       compression='gzip',
                       low_memory=False,
                       parse_dates=['review_date', 'review_date_diner'])

In [None]:
# Keep only the rows whose grp is "cap", then only the three review columns.
# NOTE(review): this rebinds `df`, which later cells also use for the scraped
# reviews (df['Rating'], df['dominant_topic']) — presumably this section is
# meant to be run instead of, not after, the scraped-data path; confirm.
df = df[df.grp == "cap"]
df = df.loc[:, ['review_rating', 'review_content', 'review_content_clean']]

In [None]:
reviews = df['review_content_clean']

In [None]:
corpus = reviews.map(lambda review: ast.literal_eval(review)).tolist()

In [None]:
def finalize_cleaning(reviews):
    """Strip back-ticks, dots and apostrophes from each token and drop short tokens.

    Tokens with fewer than two characters after stripping are dropped.

    NOTE(review): this redefines the ``finalize_cleaning`` declared in the
    scraped-data section; unlike that one it leaves spaces and double quotes
    in place. Confirm the difference is intended.

    Parameters
    ----------
    reviews : iterable of str
        Tokens of a single review (despite the parameter name).

    Returns
    -------
    list of str
    """
    # One character class replaces the former chain of substitutions; the
    # r"\.\w" pass could never match once all dots had been removed.
    cleaned = [re.sub(r"[`.']", "", word) for word in reviews]
    return [word for word in cleaned if len(word) > 1]
corpus = list(map(lambda review: finalize_cleaning(review), corpus))

In [None]:
print(len(corpus))
print(corpus[0])
print(df.iloc[0,1])

## data ready:
### 1. our own data: final_reviews
### 2. Capgemini data: corpus

## BOW REPRESENTATION

In [None]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(final_reviews)

# Filter out words that occur in fewer than 5 documents, or in more than 50% of the documents.
dictionary.filter_extremes(no_below=5, no_above=0.5)

In [None]:
# Bag-of-words representation of the documents.
bow_corpus = [dictionary.doc2bow(doc) for doc in final_reviews]

In [None]:
print('Capacity of VOCAB: %d' % len(dictionary))
print('Number of documents: %d' % len(bow_corpus))

## TF-IDF

In [None]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [None]:
len(corpus_tfidf)

## LDA (Incomplete/topic extraction)

In [None]:
# Train LDA model.
from gensim.models import LdaModel,LdaMulticore

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# random_state pins the initialisation so the learned topics (and the
# dominant-topic labels derived from them) are the same on every run.
model = LdaModel(
    corpus=corpus_tfidf,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    random_state=1,
    eval_every=eval_every)

In [None]:
from pprint import pprint
pprint(model.print_topics())

In [None]:
# Dominant topic per document: the id of the topic with the highest weight.
# A document for which the model returns no topic (or no positive weight) gets
# the sentinel id `num_topics`. Tying the sentinel to num_topics instead of a
# literal 10 keeps it from colliding with a real topic id if the topic count
# is changed.
vector = model[corpus_tfidf]
topic_list = []
for topics in vector:
    dt = num_topics
    score = 0
    for topic_id, weight in topics:
        if weight > score:
            score = weight
            dt = topic_id
    topic_list.append(dt)

In [None]:
model.get_document_topics(corpus_tfidf[2])

In [None]:
df['dominant_topic'] = topic_list

In [None]:
df['dominant_topic'].hist()

In [None]:
# Draw one word cloud per topic on a grid of subplots.
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# Rebuilds the same stop-word list that the preprocessing section defines.
from stop_words import get_stop_words
stop_words = get_stop_words('en')
my_stop_words = []
all_stop_words =  stop_words + my_stop_words

# color_func reads the global `i` when it is called, not when the lambda is
# defined, so it picks up the index of the loop further down.
cloud = WordCloud(stopwords=all_stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

# formatted=False returns (topic_id, [(word, weight), ...]) style entries; only
# element [1] of each entry is used below.
topics = model.show_topics(formatted=False)

# A 2x2 grid has four axes, so only topics[0] .. topics[3] are drawn.
fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

## Feature extraction Method 1: LSI


In [None]:
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel

temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

#LSImodel = LsiModel(corpus_tfidf, id2word=id2word, num_topics=300)
LSImodel = LsiModel.load(r'LSI.model')
LSI_corpus = LSImodel[corpus_tfidf]

In [None]:
len(LSI_corpus)

In [None]:
# One row per document and one column per LSI topic, sized from the corpus and
# the model rather than from hard-coded numbers.
LSI_features = np.zeros([len(LSI_corpus), LSImodel.num_topics])

In [None]:
# Copy the sparse (topic_id, weight) pairs of every document into the dense
# LSI_features matrix. The loop previously wrote to `topic_weights`, a name
# that is never defined in the notebook.
for i, row_list in enumerate(LSI_corpus):
    if i % 2000 == 0:
        print(i)
    for j, w in row_list:
        LSI_features[i, j] = w

In [None]:
LSI_features.shape

### CHECK POINT (Save model and features on HDD)

In [None]:
#LSImodel.save(r'LSI.model')
np.save(r'LSI_features', LSI_features)

In [None]:
# np.int was only an alias of the built-in int and has been removed from
# NumPy; the built-in gives the same dtype.
topic_num = df['Rating'].values.astype(int)

In [None]:
topic_num.shape

In [None]:
np.save(r'y', topic_num)

## Feature extraction Method 2: WORD2VEC

In [None]:
import ast
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# Train a 300-dimensional Word2Vec model on final_reviews and save it.
from gensim.test.utils import get_tmpfile
# `path` is not used below; the model is saved to "word2vec.model" instead.
path = get_tmpfile("word2vec.model")

# Rebinds `model`, the name the LDA section also uses for its model.
# NOTE(review): `size` and `iter` look like the gensim 3.x keyword names —
# presumably the notebook is pinned to gensim < 4; confirm.
model = gensim.models.Word2Vec(size=300, window=3, min_count=5, workers=4, seed=1, iter=50)

model.build_vocab(final_reviews)
model.train(final_reviews, total_examples=model.corpus_count, epochs=model.iter)
model.save("word2vec.model")

In [None]:
model = gensim.models.Word2Vec.load("word2vec.model")

In [None]:
model.wv.most_similar("good", topn=10)

In [None]:
# Map every vocabulary word to its vector; the frame has one column per word.
embedding_matrix = pd.DataFrame(
    {word: list(model.wv[word]) for word in model.wv.vocab.keys()}
)

In [None]:
# Build one vector per review: the element-wise sum of the vectors of its words.
vectors = []
for review_content in final_reviews:
    review_vector = []
    for word in review_content:
        try:
            review_vector.append(list(model.wv[word]))
        except KeyError:
            # Word has no vector in the model; skip it.
            pass            
    # zip(*...) groups the values dimension by dimension; a review with no
    # known word yields an empty list here.
    vectors.append([sum(i) for i in zip(*review_vector)])

In [None]:
# Turn the per-review summed vectors into a frame, scale it, and name the 300 columns.
# NOTE(review): the sums are divided by 300 (the embedding size), not by the
# number of words in each review — if a per-review mean was intended, this is
# not it; confirm.
review_embedding = pd.DataFrame(vectors)
review_embedding = review_embedding/300
review_embedding.columns = ["Dimension_"+str(i) for i in range(300)]

### CHECK POINT (Save model and features on HDD)

In [None]:
np.save(r'WV_features', review_embedding.fillna(0).values)

## Feature extraction Method 3: FAST TEXT

In [None]:
from gensim.models.fasttext import FastText as FT_gensim
from gensim.test.utils import datapath

In [None]:
modelFT = FT_gensim(size=300)

# build the vocabulary
modelFT.build_vocab(sentences=final_reviews)

# train the model
# The epoch count and corpus statistics now come from modelFT itself. They
# were previously read from `model`, i.e. whichever object last bound that
# name in the kernel (the Word2Vec model when the notebook is run top to bottom).
# NOTE(review): this makes training use FastText's own epoch setting instead of
# the Word2Vec one; pass iter=... to FT_gensim if more epochs were intended.
modelFT.train(
    sentences=final_reviews, epochs=modelFT.epochs,
    total_examples=modelFT.corpus_count, total_words=modelFT.corpus_total_words
)

In [None]:
modelFT = FT_gensim.load(r'FT.model')

In [None]:
# One row per review in final_reviews (the fill loop indexes rows by position
# in that list), instead of a hard-coded row count.
FT_features = np.zeros([len(final_reviews), 300])

In [None]:
# Fill FT_features row by row with the scaled sum of each review's FastText word vectors.
for I, review_content in enumerate(final_reviews):
    # Progress indicator every 1000 reviews.
    if I % 1000 == 0:
        print(I)
    features = np.zeros([300])
    for word in review_content:
        try:
            features += list(modelFT.wv[word])
        except KeyError:
            # No vector available for this word; skip it.
            pass            
    # NOTE(review): divides by 300 (the vector size), not by the number of
    # words in the review — confirm this scaling is intended.
    FT_features[I,:] = features/300

### CHECK POINT (Save model and features on HDD)

In [None]:
np.save(r'FT_features', FT_features)

In [None]:
modelFT.save(r'FT.model')