<a href="https://colab.research.google.com/github/GaoangLiu/AA_ipynb/blob/master/Sentiment_Analysis_on_Movie_Reviews_Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The dataset is comprised of tab-separated files with phrases from the Rotten Tomatoes dataset. The train/test split has been preserved for the purposes of benchmarking, but the sentences have been shuffled from their original order. Each Sentence has been parsed into many phrases by the Stanford parser. Each phrase has a PhraseId. Each sentence has a SentenceId. Phrases that are repeated (such as short/common words) are only included once in the data.

train.tsv contains the phrases and their associated sentiment labels. We have additionally provided a SentenceId so that you can track which phrases belong to a single sentence.

test.tsv contains just phrases. You must assign a sentiment label to each phrase.

The sentiment labels are:

- 0 - negative
- 1 - somewhat negative
- 2 - neutral
- 3 - somewhat positive
- 4 - positive

Main page [https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data?select=sampleSubmission.csv](https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data?select=sampleSubmission.csv)


In [0]:
import math
import re
import os
import timeit
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import logging
import time
import smart_open
import importlib

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
logging.basicConfig(format='[%(asctime)s %(levelname)8s] %(message)s', level=logging.INFO, datefmt='%m-%d %H:%M:%S')

from keras import layers, Input
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential, Model, load_model
from keras.layers import Flatten, Dense, Embedding, Dropout, LSTM, GRU, Bidirectional
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import gensim.downloader as api

from tqdm.notebook import tqdm
import tensorflow_hub as tfh

In [0]:
! rm *.tsv *.zip
! wget -O movie.zip ali.140714.xyz:8000/sentiment_analysis.zip 
! wget -O b7.py ali.140714.xyz:8000/boost117.py
! unzip movie.zip 
! ls

# Explore data

In [0]:
# Widen the column display so long phrases are not truncated in the rendered table.
pd.set_option('max_colwidth', 300)
# Read the training phrases and lower-case every column name in a single chain.
train = pd.read_csv('train.tsv', sep="\t").rename(columns=str.lower)
# Peek at ten random rows.
train.sample(10)

## Getting hands-on

NB score: 0.58055

In [0]:
class Classifier():
    """Shared bag-of-words pipeline: load TSV data, vectorize text, fit a model, export predictions.

    Subclasses supply the estimator by overriding ``build_model`` and assigning
    ``self.model``; everything else is inherited.
    """

    def __init__(self):
        self.train = None
        self.test = None
        self.model = None
        # Filled in by countvectorize() and train_model(); declared here so the
        # attributes exist on every instance.
        self.vector_train = None
        self.vector_test = None
        self.X_val = None
        self.y_val = None

    def load_data(self, train_file='train.tsv', test_file='test.tsv'):
        """Read the tab-separated train and test files into ``self.train`` / ``self.test``.

        The ``Phrase`` column is renamed to ``text`` and ``Sentiment`` to ``target``
        (columns that are absent are simply left alone). Nothing is returned.

        Args:
            train_file: Path of the tab-separated training file.
            test_file: Path of the tab-separated test file.
        """
        renames = {'Phrase': 'text', 'Sentiment': 'target'}
        self.train = pd.read_csv(train_file, sep="\t").rename(columns=renames)
        self.test = pd.read_csv(test_file, sep="\t").rename(columns=renames)
        logging.info('TSV data loaded')

    def countvectorize(self):
        """Turn the train and test text into token-count matrices.

        The vocabulary is fitted on train and test text combined; the results are
        stored in ``self.vector_train`` and ``self.vector_test``.
        """
        # Alternative vectorizer kept for reference (disabled):
        #   tv = TfidfVectorizer(ngram_range=(1,5), token_pattern=r'\w{1,}',
        #            min_df=2, max_df=0.9, strip_accents='unicode',
        #            smooth_idf=1, sublinear_tf=1, max_features=5000)
        tv = CountVectorizer()
        tv.fit(pd.concat([self.train.text, self.test.text]))
        self.vector_train = tv.transform(self.train.text)
        self.vector_test = tv.transform(self.test.text)
        logging.info("Train & test text tokenized")

    def build_model(self):
        """Create the estimator and store it in ``self.model``; subclasses must override this."""
        raise NotImplementedError(
            f"{self.__class__.__name__} must implement build_model() and set self.model")

    def train_model(self):
        """Fit ``self.model`` on 80% of the vectorized training data.

        The remaining 20% (fixed ``random_state=2020``) is kept on the instance as
        ``self.X_val`` / ``self.y_val`` for later evaluation.

        Returns:
            The fitted model.
        """
        logging.info(f"{self.__class__.__name__} starts running.")
        labels = self.train.target
        X_train, X_val, y_train, y_val = train_test_split(
            self.vector_train, labels, test_size=0.2, random_state=2020)
        self.model.fit(X_train, y_train)

        self.X_val, self.y_val = X_val, y_val
        return self.model

    def save_predictions(self, y_preds):
        """Write ``y_preds`` into the sample submission's ``Sentiment`` column and save it.

        The output file is ``submission_<ClassName>.csv`` in the working directory.
        """
        out_path = f"submission_{self.__class__.__name__}.csv"
        sub = pd.read_csv("sampleSubmission.csv")
        sub['Sentiment'] = y_preds
        sub.to_csv(out_path, index=False)
        logging.info(f'Prediction exported to {out_path}')

    def pipeline(self):
        """Run load -> vectorize -> build -> train and log the elapsed time."""
        # time.clock() was removed in Python 3.8; perf_counter() is the replacement
        # for measuring elapsed intervals.
        s_time = time.perf_counter()
        self.load_data()
        self.countvectorize()
        self.build_model()
        self.train_model()
        logging.info(f"Program running for {time.perf_counter() - s_time} seconds")

class C_Bayes(Classifier):
    """Classifier variant whose estimator is a multinomial Naive Bayes model."""

    def build_model(self):
        """Create a default ``MultinomialNB``, store it on ``self.model`` and return it."""
        nb_model = MultinomialNB()
        self.model = nb_model
        return nb_model

class C_SVM(Classifier):
    """Classifier variant whose estimator is a support-vector classifier."""

    def build_model(self):
        """Create an ``svm.SVC`` with verbose output, store it on ``self.model`` and return it."""
        # SVC's constructor has no n_jobs parameter; passing one raises TypeError
        # before any training starts, so it is not passed here.
        self.model = svm.SVC(verbose=True)
        return self.model

class C_LR(Classifier):
    """Classifier variant whose estimator is logistic regression."""

    def build_model(self):
        """Create the ``LogisticRegression`` model, store it on ``self.model`` and return it."""
        # lbfgs solver, 10 parallel jobs, solver progress printed at verbosity 1.
        lr_options = dict(n_jobs=10, verbose=1, solver='lbfgs')
        self.model = LogisticRegression(**lr_options)
        return self.model

Try SVM

Result: the SVM runs too slowly, so this attempt was abandoned.

In [0]:
def transfrom(text_train, text_test):
    """Embed every train and test text with the Universal Sentence Encoder (large, v5).

    The encoder is loaded from TF Hub once, then called on one text at a time;
    each result is flattened to a 1-D numpy array.

    Args:
        text_train: Iterable of training texts.
        text_test: Iterable of test texts.

    Returns:
        A pair ``(vector_train, vector_test)`` of lists of 1-D numpy arrays, in
        the same order as the inputs.
    """
    large_use = 'https://tfhub.dev/google/universal-sentence-encoder-large/5'
    embed = tfh.load(large_use)

    def _embed_all(texts):
        # One encoder call per text, with a tqdm progress bar over the iterable.
        flat_vectors = []
        for line in tqdm(texts):
            flat_vectors.append(tf.reshape(embed([line]), [-1]).numpy())
        return flat_vectors

    # Train texts are embedded before test texts.
    vector_train = _embed_all(text_train)
    vector_test = _embed_all(text_test)
    return vector_train, vector_test

# base = Classifier()
# base.load_data()
# vector_train, vector_test = transfrom(base.train.text, base.test.text)    

Try logistic regression (LR)
- The first run produced a score of 0.61260, which is not bad.

In [0]:
# Train logistic regression through the shared pipeline.
c = C_LR()
c.pipeline()

# Score the validation split that train_model() stored on the instance.
# accuracy_score is documented as (y_true, y_pred), so the labels go first.
val_preds = c.model.predict(c.X_val)
print('Accuracy score', accuracy_score(c.y_val, val_preds))

# Predict the test set and export the submission file.
y_preds = c.model.predict(c.vector_test)
c.save_predictions(y_preds)
y_preds


Naive Bayes

In [0]:
# Fit the Naive Bayes baseline end to end.
c = C_Bayes()
c.pipeline()

# Predict the vectorized test set, export the submission, and display the predictions.
test_matrix = c.vector_test
y_preds = c.model.predict(test_matrix)
c.save_predictions(y_preds)
y_preds

# results 0.58055

In [0]:
!ls
import b7
b7.Files().upload_vps('submission_C_LR.csv')