In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# Import dataset
airbnb_model = pd.read_csv("C:\\Users\\RexPC\\Capstone Project\\original_imbalanced_predictive_model.csv")

In [3]:
airbnb_model.head()

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,W1,Target
0,0.011593,0.029103,0.007662,0.449472,0.012213,0.481985,0.005999,0.001974,459,1
1,0.011845,0.156081,0.048334,0.712077,0.039587,0.025124,0.005244,0.001709,583,1
2,0.019813,0.337741,0.012855,0.425495,0.015977,0.036216,0.07607,0.075832,424,1
3,0.018419,0.087486,0.01133,0.814291,0.017732,0.038456,0.009247,0.003039,332,1
4,0.018382,0.043339,0.012416,0.398184,0.01876,0.496244,0.009541,0.003134,337,1


In [4]:
len(airbnb_model)

799

In [5]:
X_train = airbnb_model.drop('Target', axis=1)

In [6]:
y_train = airbnb_model.Target

In [7]:
import pickle
# load testing set
with open('C:\\Users\\RexPC\\Capstone Project\\LDA Model Training\\X_test.pkl', 'rb') as f:
    X_test = pickle.load(f)
    
with open('C:\\Users\\RexPC\\Capstone Project\\LDA Model Training\\y_test.pkl', 'rb') as f:
    y_test = pickle.load(f)

In [8]:
len(X_train)

799

In [9]:
len(y_train)

799

In [10]:
len(X_test)

200

In [11]:
len(y_test)

200

### Resampling Techniques 
1. Oversample minority class
2. Undersample majority class
3. Generate synthetic sample Smote 

We will use smote technique - which the minority class is over-sampled by creating “synthetic” examples rather than by over-sampling with replacement

In [12]:
from imblearn.over_sampling import SMOTE 
sm = SMOTE(random_state = 2) 
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel()) 

Using TensorFlow backend.


In [13]:
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1))) 
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0))) 

Before OverSampling, counts of label '1': 755
Before OverSampling, counts of label '0': 44 



In [14]:
y_train = y_train_res
X_train = X_train_res

In [15]:
print("After OverSampling, counts of label '1': {}".format(sum(y_train == 1))) 
print("After OverSampling, counts of label '0': {} \n".format(sum(y_train == 0))) 

After OverSampling, counts of label '1': 755
After OverSampling, counts of label '0': 755 



In [16]:
X = np.array(X_train)

In [17]:
y = np.array(y_train)

### Logistic Regression - Predictive model

In [18]:
from sklearn.linear_model import LogisticRegression#create an instance and fit the model 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score

import pandas as pd

from sklearn import datasets, linear_model
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score # To testify result due to imbalanced problem

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [19]:
kf = KFold(5, shuffle=True, random_state=42)
lr_model_F1, lr_model_accuracy, lr_model_precision, lr_model_recall, lr_model_roc  = [], [], [], [], []
for train_ind, val_ind in kf.split(X, y):
    # Assign CV IDX
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind]
    
    # Scale Data
    scaler = StandardScaler()
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)
    
    # Logisitic Regression 
    # though, sklearn handles regularization by default, i will implement newton-cg to further handle L2 or no penalty
    logistic_reg_model = LogisticRegression(solver='newton-cg', fit_intercept=True).fit(X_train_scale, y_train)
    logistic_classifier = logistic_reg_model.predict(X_val_scale)
    
    lr_model_F1.append(f1_score(y_val, logistic_classifier, average='binary'))
    lr_model_accuracy.append(metrics.accuracy_score(y_val, logistic_classifier))
    lr_model_precision.append(metrics.precision_score(y_val, logistic_classifier, average='binary'))
    lr_model_recall.append(metrics.recall_score(y_val, logistic_classifier, average='binary'))
    lr_model_roc.append(roc_auc_score(y_val, logistic_classifier))

In [20]:
print("Logistic Regression Accuracy: %.3f (+/- %.3f)" % (np.mean(lr_model_accuracy), np.std(lr_model_accuracy)))
print("Logistic Regression Precision: %.3f (+/- %.3f)" % (np.mean(lr_model_precision), np.std(lr_model_precision)))
print("Logistic Regression Recall: %.3f (+/- %.3f)" % (np.mean(lr_model_recall), np.std(lr_model_recall)))
print("Logistic Regression f1 score: %.3f (+/- %.3f)" % (np.mean(lr_model_F1), np.std(lr_model_F1)))
print("Logistic Regression Roc: %.3f (+/- %.3f)" % (np.mean(lr_model_roc), np.std(lr_model_roc)))


Logistic Regression Accuracy: 0.857 (+/- 0.014)
Logistic Regression Precision: 0.871 (+/- 0.021)
Logistic Regression Recall: 0.837 (+/- 0.021)
Logistic Regression f1 score: 0.854 (+/- 0.017)
Logistic Regression Roc: 0.857 (+/- 0.014)


In [21]:
# Save the Modle to file in the current working directory
Pkl_Filename_1 = "3-LogisticRegression_Model_Smote.pkl"  

In [22]:
with open(Pkl_Filename_1, 'wb') as file:  
    pickle.dump(logistic_reg_model, file)

### Rerun model with unseen data

In [23]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['etc','give','go','to','the','this','not','of'])

In [24]:
def tokenize_word(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
        
def tokenize_corpus(data):
    data_words = list(tokenize_word(data))
    return data_words

In [25]:
data_words_test = tokenize_corpus(X_test.review_text.values.tolist())

In [26]:
# Build the biagram and trigram model with gensim Phrases()
bigram = gensim.models.Phrases(data_words_test, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words_test], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words_test[0]]])

['the', 'apartment', 'is', 'pretty', 'much', 'as', 'described', 'convenient', 'position', 'right', 'near', 'klcc', 'and', 'petronas_towers', 'little', 'walk', 'from', 'the', 'lrt', 'or', 'monorail', 'but', 'not', 'too', 'far', 'there', 'is', 'ample', 'food', 'and', 'restaurants', 'close', 'by', 'the', 'photos', 'were', 'more', 'or', 'less', 'accurate', 'maybe', 'there', 'was', 'bad', 'cleaner', 'before', 'came', 'but', 'did', 'not', 'feel', 'remotely', 'cozy', 'the', 'sheets', 'were', 'stained', 'and', 'had', 'long', 'black', 'hairs', 'inside', 'did', 'they', 'even', 'change', 'the', 'sheets', 'the', 'towels', 'had', 'lots', 'of', 'dirt', 'and', 'is', 'smelly', 'has', 'some', 'odour', 'on', 'it', 'the', 'bathroom', 'was', 'cleaned', 'really', 'well', 'and', 'clean', 'comfortable', 'the', 'kitchen', 'had', 'food', 'on', 'the', 'cooktop', 'and', 'drink', 'rings', 'on', 'the', 'bench', 'had', 'to', 'see', 'in', 'the', 'photos', 'as', 'the', 'bench', 'was', 'shiny', 'opened', 'the', 'kettl

In [27]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    p_stemmer = PorterStemmer()
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [28]:
from nltk.stem.porter import PorterStemmer
import spacy
nlp = spacy.load("en")
def preprocess_data(data_words):
    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    return data_lemmatized

In [29]:
data_lemmatized_test = preprocess_data(data_words_test)
print(preprocess_data(data_words_test)[:1])

[['apartment', 'pretty', 'much', 'describe', 'convenient', 'position', 'right', 'klcc', 'little', 'walk', 'far', 'ample', 'food', 'restaurant', 'close', 'photo', 'less', 'accurate', 'maybe', 'bad', 'cleaner', 'come', 'feel', 'remotely', 'cozy', 'sheet', 'stain', 'long', 'black', 'hair', 'even', 'change', 'sheet', 'towel', 'lot', 'dirt', 'smelly', 'odour', 'bathroom', 'clean', 'really', 'well', 'clean', 'comfortable', 'kitchen', 'food', 'drink', 'ring', 'bench', 'see', 'photo', 'bench', 'open', 'kettle', 'boil', 'water', 'corrode', 'rusty', 'warm', 'water', 'dispenser', 'boiling', 'also', 'feel', 'sit', 'couch', 'mark', 'leg', 'sit', 'minute', 'couch', 'totally', 'fine', 'really', 'weird', 'terrible', 'location', 'great', 'lot', 'walk', 'around', 'gym', 'pool', 'level', 'quite', 'usable', 'checking', 'procedure', 'easy', 'security', 'staff', 'crazy', 'feel', 'safe', 'enough', 'street', 'dark', 'night', 'worried']]


In [30]:
def prepare_corpus(doc_clean):
    """
    Input  : clean document
    Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
    Output : term dictionary and Document Term Matrix
    """
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # generate LDA model
    return dictionary,doc_term_matrix

# Create Lsa model
def create_gensim_lsa_model(doc_clean,number_of_topics,words):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    # generate LSA model
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel

# Create Lda model
def create_gensim_lda_model(doc_clean,number_of_topics,words):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    # generate LSA model
    ldamodel = gensim.models.LdaModel(corpus = doc_term_matrix, num_topics=number_of_topics, id2word = dictionary,
                                          random_state=100, 
                                          update_every=1,
                                          chunksize=100,
                                          passes=10,
                                          alpha='auto',
                                          per_word_topics=True)  # train model
    
    print(ldamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return ldamodel

def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """
    Input   : dictionary : Gensim dictionary
              corpus : Gensim corpus
              texts : List of input texts
              stop : Max num of topics
    purpose : Compute c_v coherence for various number of topics
    Output  : model_list : List of LSA topic models
              coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # generate LSA model
        model = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

def plot_graph(doc_clean, start, stop, step):
    """
    Input   : corpus : Gensim corpus
              start : Min num of topics
              stop : Max num of topics
    purpose : To plot coherence score
    Output  : coherence_values : Coherence values corresponding to the model with respective number of topics
    """
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    model_list, coherence_values = compute_coherence_values(dictionary, doc_term_matrix,doc_clean,
                                                            stop, start, step)
    # Show graph
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()

In [31]:
dictionary_test, doc_term_matrix_test=prepare_corpus(data_lemmatized_test)

### Reload optimal LDA model

In [32]:
from gensim import corpora, models, similarities

In [33]:
optimal_model =  models.LdaModel.load('C:\\Users\\RexPC\\Capstone Project\\LDA Model Training\\lda_optimal.model')

### Build predictive input on testing set

In [34]:
test_vecs = []
for i in range(len(X_test)):
    top_topics = optimal_model.get_document_topics(doc_term_matrix_test[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(8)]
    topic_vec.extend([len(X_test.iloc[i].review_text)]) # review word count
    test_vecs.append(topic_vec)

In [35]:
test_vecs[2]

[0.022659054,
 0.21237245,
 0.052898217,
 0.5533336,
 0.023146022,
 0.120000556,
 0.011740915,
 0.003849185,
 199]

In [36]:
X2 = np.array(test_vecs)
y2 = np.array(y_test)

In [37]:
len(X2)

200

In [38]:
len(y2)

200

### load built Machine Learning model from pickle library

In [39]:
with open('C:\\Users\\RexPC\\Capstone Project\\LDA Model Training\\model\\3-LogisticRegression_Model_Smote.pkl', 'rb') as f:
    log_model1 = pickle.load(f)

### Load built classifier with unseen data

In [46]:
kf = KFold(5, shuffle=True, random_state=42)
lr_model_F1, lr_model_accuracy, lr_model_precision, lr_model_recall, lr_model_roc  = [], [], [], [], []

for train_ind, val_ind in kf.split(X2, y2):
    # Assign CV IDX
    X_train, y_train = X2[train_ind], y2[train_ind]
    X_val, y_val = X2[val_ind], y2[val_ind]
    
    # Scale Data
    scaler = StandardScaler()
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)
    
    # Logisitic Regression 
    # though, sklearn handles regularization by default, i will implement newton-cg to further handle L2 or no penalty
    logistic_reg_model = log_model1.fit(X_train_scale, y_train)
    logistic_classifier = logistic_reg_model.predict(X_val_scale)
    
    lr_model_F1.append(f1_score(y_val, logistic_classifier, average='binary'))
    lr_model_accuracy.append(metrics.accuracy_score(y_val, logistic_classifier))
    lr_model_precision.append(metrics.precision_score(y_val, logistic_classifier, average='binary'))
    lr_model_recall.append(metrics.recall_score(y_val, logistic_classifier, average='binary'))
    lr_model_roc.append(roc_auc_score(y_val, logistic_classifier))

In [47]:
print("Logistic Regression Accuracy: %.3f (+/- %.3f)" % (np.mean(lr_model_accuracy), np.std(lr_model_accuracy)))
print("Logistic Regression Precision: %.3f (+/- %.3f)" % (np.mean(lr_model_precision), np.std(lr_model_precision)))
print("Logistic Regression Recall: %.3f (+/- %.3f)" % (np.mean(lr_model_recall), np.std(lr_model_recall)))
print("Logistic Regression f1 score: %.3f (+/- %.3f)" % (np.mean(lr_model_F1), np.std(lr_model_F1)))
print("Logistic Regression Roc: %.3f (+/- %.3f)" % (np.mean(lr_model_roc), np.std(lr_model_roc)))

Logistic Regression Accuracy: 0.940 (+/- 0.037)
Logistic Regression Precision: 0.940 (+/- 0.037)
Logistic Regression Recall: 1.000 (+/- 0.000)
Logistic Regression f1 score: 0.969 (+/- 0.020)
Logistic Regression Roc: 0.500 (+/- 0.000)
