<a href="https://colab.research.google.com/github/GaoangLiu/AA_ipynb/blob/master/Sentiment_Analysis_on_Movie_Reviews_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Sentiment Analysis on Movie Reviews Bayes:
[https://github.com/GaoangLiu/AA_ipynb/blob/master/Sentiment_Analysis_on_Movie_Reviews_Naive_Bayes.ipynb](https://github.com/GaoangLiu/AA_ipynb/blob/master/Sentiment_Analysis_on_Movie_Reviews_Naive_Bayes.ipynb)

## import packages

In [0]:
import math
import re
import os
import timeit
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import logging
import time
import smart_open
import importlib

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
logging.basicConfig(format='[%(asctime)s %(levelname)8s] %(message)s', level=logging.INFO, datefmt='%m-%d %H:%M:%S')

import keras
from keras import layers, Input
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential, Model, load_model
from keras.layers import Flatten, Dense, Embedding, Dropout, LSTM, GRU, Bidirectional
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import gensim.downloader as api

from tqdm.notebook import tqdm
import tensorflow_hub as tfh

## Download files 

In [0]:
! rm *.tsv *.zip *.csv
! wget -O movie.zip ali.140714.xyz:8000/sentiment_analysis.zip 
! wget -O b7.py ali.140714.xyz:8000/boost117.py
! unzip movie.zip 
! ls

In [0]:
train = pd.read_csv('train.tsv', sep='\t')
train.Phrase.str.len().hist()
train.Sentiment.value_counts()

## Tune models


In [37]:
class Classifier():
  def __init__(self):
    self.train = None
    self.test = None 
    self.model = None
    
  def load_data(self, train_file='train.csv', test_file='test.csv'):
      """ Load train, test csv files and return pandas.DataFrame
      """
      self.train = pd.read_csv('train.tsv', sep="\t")
      self.train.rename({'Phrase': 'text', 'Sentiment': 'target'}, axis='columns', inplace=True)
      self.test = pd.read_csv('test.tsv', sep="\t")
      self.test.rename({'Phrase': 'text', 'Sentiment': 'target'}, axis='columns', inplace=True)
      logging.info('TSV data loaded')
  
  def save_predictions(self, y_preds):
      sub = pd.read_csv(f"sampleSubmission.csv")
      sub['Sentiment'] = y_preds 
      sub.to_csv(f"submission_{self.__class__.__name__}.csv", index=False)
      logging.info(f'Prediction exported to submission_{self.__class__.__name__}.csv')
  

class C_NN(Classifier):
    def __init__(self, max_features=100000, embed_size=128, max_len=300):
        self.max_features=max_features
        self.embed_size=embed_size
        self.max_len=max_len
    
    def tokenize_text(self, text_train, text_test):
        '''@para: max_features, the most commenly used words in data set
        @input are vector of text
        '''
        tokenizer = Tokenizer(num_words=self.max_features)
        text = pd.concat([text_train, text_test])
        tokenizer.fit_on_texts(text)

        sequence_train = tokenizer.texts_to_sequences(text_train)
        tokenized_train = pad_sequences(sequence_train, maxlen=self.max_len)
        logging.info('Train text tokeninzed')

        sequence_test = tokenizer.texts_to_sequences(text_test)
        tokenized_test = pad_sequences(sequence_test, maxlen=self.max_len)
        logging.info('Test text tokeninzed')
        return tokenized_train, tokenized_test, tokenizer
      
    def build_model(self, embed_matrix=[]):
        text_input = Input(shape=(self.max_len, ))
        embed_text = layers.Embedding(self.max_features, self.embed_size)(text_input)
        if len(embed_matrix) > 0:
            embed_text = layers.Embedding(self.max_features, self.embed_size, \
                                          weights=[embed_matrix], trainable=False)(text_input)
            
        branch_a = layers.Bidirectional(layers.GRU(32, return_sequences=True))(embed_text)
        branch_b = layers.GlobalMaxPool1D()(branch_a)

        x = layers.Dense(64, activation='relu')(branch_b)
        x = layers.Dropout(0.2)(x)

        x = layers.Dense(32, activation='relu')(branch_b)
        x = layers.Dropout(0.2)(x)
        branch_z = layers.Dense(5, activation='softmax')(x)
        
        model = Model(inputs=text_input, outputs=branch_z)
        self.model = model

        return model
        
    def embed_word_vector(self, word_index, model='glove-wiki-gigaword-100'):
        glove = api.load(model) # default: wikipedia 6B tokens, uncased
        zeros = [0] * self.embed_size
        matrix = np.zeros((self.max_features, self.embed_size))
          
        for word, i in word_index.items(): 
            if i >= self.max_features or word not in glove: continue # matrix[0] is zeros, that's also why >= is here
            matrix[i] = glove[word]

        logging.info('Matrix with embedded word vector created')
        return matrix

    def run(self, x_train, y_train):
        checkpoint = ModelCheckpoint('weights_base_best.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')
        early = EarlyStopping(monitor="val_acc", mode="max", patience=3)

        self.model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
        X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.8, random_state=2020)
        BATCH_SIZE = max(16, 2 ** int(math.log(len(X_tra) / 100, 2)))
        logging.info(f"Batch size is set to {BATCH_SIZE}")
        history = self.model.fit(X_tra, y_tra, epochs=30, batch_size=BATCH_SIZE, validation_data=(X_val, y_val), \
                              callbacks=[checkpoint, early], verbose=1)

        return history


c = C_NN(max_features=15000, embed_size=300, max_len=250)
c.load_data()  
labels = keras.utils.to_categorical(c.train.target, num_classes=5)      
vector_train, vector_test, tokenizer = c.tokenize_text(c.train.text, c.test.text)

embed = c.embed_word_vector(tokenizer.word_index, 'fasttext-wiki-news-subwords-300')
c.build_model(embed_matrix=embed)
c.run(vector_train, labels)
# vector_train, labels


[05-17 16:34:51     INFO] TSV data loaded
[05-17 16:34:57     INFO] Train text tokeninzed
[05-17 16:34:58     INFO] Test text tokeninzed
[05-17 16:34:58     INFO] loading projection weights from /root/gensim-data/fasttext-wiki-news-subwords-300/fasttext-wiki-news-subwords-300.gz
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
[05-17 16:40:09     INFO] loaded (999999, 300) matrix from /root/gensim-data/fasttext-wiki-news-subwords-300/fasttext-wiki-news-subwords-300.gz
[05-17 16:40:09     INFO] Matrix with embedded word vector created
[05-17 16:40:09     INFO] Batch size is set to 1024


Train on 124848 samples, validate on 31212 samples
Epoch 1/30

Epoch 00001: val_acc improved from -inf to 0.57100, saving model to weights_base_best.hdf5
Epoch 2/30

Epoch 00002: val_acc improved from 0.57100 to 0.60095, saving model to weights_base_best.hdf5
Epoch 3/30

Epoch 00003: val_acc improved from 0.60095 to 0.61502, saving model to weights_base_best.hdf5
Epoch 4/30

Epoch 00004: val_acc improved from 0.61502 to 0.62050, saving model to weights_base_best.hdf5
Epoch 5/30

Epoch 00005: val_acc improved from 0.62050 to 0.62681, saving model to weights_base_best.hdf5
Epoch 6/30

Epoch 00006: val_acc improved from 0.62681 to 0.62905, saving model to weights_base_best.hdf5
Epoch 7/30

Epoch 00007: val_acc improved from 0.62905 to 0.63033, saving model to weights_base_best.hdf5
Epoch 8/30

Epoch 00008: val_acc improved from 0.63033 to 0.63078, saving model to weights_base_best.hdf5
Epoch 9/30

Epoch 00009: val_acc improved from 0.63078 to 0.63460, saving model to weights_base_best.hdf

<keras.callbacks.callbacks.History at 0x7f6ccbbb5da0>

In [38]:
# Make predictions

model = load_model('weights_base_best.hdf5')
y_preds = model.predict(vector_test)
print("DONE Good Morning")


DONE Good Morning


In [39]:
# Export submissions to csv file
probs = np.argmax(y_preds, axis=1)
sub = pd.read_csv('sampleSubmission.csv')
sub['Sentiment'] = probs
sub['Sentiment'].value_counts()

export_file = 'submission_gru.csv'
sub.to_csv(export_file, index=False)
import b7 
b7.Files().upload_vps(export_file)
b7.Files().upload_vps('weights_base_best.hdf5')
print("DONE Good Morning")

[05-17 17:55:09     INFO] submission_gru.csv was uploaded
[05-17 17:55:13     INFO] weights_base_best.hdf5 was uploaded


DONE Good Morning
