[View in Colaboratory](https://colab.research.google.com/github/Naveen131/Machine-learning/blob/master/Movie_sentiment_analysis_using_fasttext.ipynb)

In [0]:
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalAveragePooling1D,GlobalMaxPooling1D
from keras.models import Model
from keras.preprocessing import text, sequence
from keras import initializers, regularizers, constraints, optimizers, layers

from keras.layers import Conv1D, MaxPooling1D, Activation, GRU,LSTM
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D,concatenate
from keras.callbacks import Callback
import warnings
warnings.filterwarnings('ignore')

from keras.models import Sequential

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [4]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving sampleSubmission.csv to sampleSubmission.csv
Saving test.tsv to test.tsv
Saving train.tsv to train.tsv
User uploaded file "sampleSubmission.csv" with length 596647 bytes
User uploaded file "test.tsv" with length 3367149 bytes
User uploaded file "train.tsv" with length 8481022 bytes


In [0]:
train = pd.read_table("train.tsv")
test = pd.read_table("test.tsv")

In [8]:
!wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki-news-300d-1M-subword.vec.zip

--2018-08-28 17:42:50--  https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki-news-300d-1M-subword.vec.zip
Resolving s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)... 52.219.20.17
Connecting to s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)|52.219.20.17|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 587879973 (561M) [application/zip]
Saving to: ‘wiki-news-300d-1M-subword.vec.zip’


2018-08-28 17:43:53 (9.01 MB/s) - ‘wiki-news-300d-1M-subword.vec.zip’ saved [587879973/587879973]



In [9]:
!unzip wiki-news-300d-1M-subword.vec.zip

Archive:  wiki-news-300d-1M-subword.vec.zip
  inflating: wiki-news-300d-1M-subword.vec  


In [10]:
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [0]:
import re,codecs,tqdm
def normalize(s):
    """
    Given a text, cleans and normalizes it. Feel free to add your own stuff.
    """
    s = s.lower()
    # Isolate punctuation
    s = re.sub(r'([\'\"\.\(\)\!\?\-\\\/\,])', r' \1 ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Replace numbers and symbols with language
    s = s.replace('&', ' and ')
    s = s.replace('@', ' at ')
    return s
  

In [0]:
train['Phrase'] = train['Phrase'].apply(lambda x : normalize(x))
test['Phrase'] = test['Phrase'].apply(lambda x : normalize(x))

In [0]:
embeddings = "wiki-news-300d-1M-subword.vec"


In [0]:
from keras.utils import to_categorical
target = train.Sentiment.values

X_train = train['Phrase']
X_test = test['Phrase']
Y_train = to_categorical(target)


In [37]:
print(X_train.shape)
print(Y_train.shape)

(156060,)
(156060, 5)


In [0]:
max_features = 30000
maxlen = 100
embed_size = 300

tokenizer = text.Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(X_train))
tokenizer.fit_on_texts(list(X_test))

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_train = sequence.pad_sequences(X_train,maxlen=maxlen)
X_test = sequence.pad_sequences(X_test,maxlen=maxlen)

In [0]:
def get_coefs(word, *arr): 
  return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(embeddings))


In [0]:
word_index = tokenizer.word_index
num_words = max(max_features,len(word_index))
embedding_matrix = np.zeros((num_words,embed_size))
for word,i in word_index.items():
  if i >= max_features:
    continue;
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector
    

In [0]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


In [41]:
def get_model():
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(LSTM(128, return_sequences = True))(x)
    x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(5, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

model = get_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 100, 300)     9000000     input_3[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_3 (SpatialDro (None, 100, 300)     0           embedding_3[0][0]                
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) (None, 100, 256)     329472      spatial_dropout1d_3[0][0]        
__________________________________________________________________________________________________
conv1d_3 (

In [0]:
X_train, X_val, y_train, y_val = train_test_split(X_train, Y_train, train_size=0.95, random_state=233)


In [0]:
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)


In [58]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
BATCH_SIZE = 256
EPOCHS = 5

file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, 
                             mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early,RocAuc]
model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,  verbose=1, 
          validation_data=(X_val,y_val),shuffle=False, callbacks=callbacks_list)

Train on 148257 samples, validate on 7803 samples
Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.31253, saving model to weights_base.best.hdf5

 ROC-AUC - epoch: 1 - score: 0.882027 

Epoch 2/5

Epoch 00002: val_loss did not improve from 0.31253

 ROC-AUC - epoch: 2 - score: 0.876846 

Epoch 3/5

Epoch 00003: val_loss did not improve from 0.31253

 ROC-AUC - epoch: 3 - score: 0.873044 

Epoch 4/5

Epoch 00004: val_loss did not improve from 0.31253

 ROC-AUC - epoch: 4 - score: 0.869096 

Epoch 5/5

Epoch 00005: val_loss did not improve from 0.31253

 ROC-AUC - epoch: 5 - score: 0.864345 



<keras.callbacks.History at 0x7fdcf6719278>

In [59]:
!ls

predictions.csv       test.tsv		      wiki-news-300d-1M-subword.vec
sample_data	      train.tsv		      wiki-news-300d-1M-subword.vec.zip
sampleSubmission.csv  weights_base.best.hdf5


In [0]:
pred = model.predict(X_test, batch_size = 1024)


In [0]:
predictions = np.round(np.argmax(pred, axis=1)).astype(int)
sub = pd.read_csv('sampleSubmission.csv', sep=",")

sub['Sentiment'] = predictions

sub.to_csv("predictions.csv", index=False)

In [62]:
!ls

predictions.csv       test.tsv		      wiki-news-300d-1M-subword.vec
sample_data	      train.tsv		      wiki-news-300d-1M-subword.vec.zip
sampleSubmission.csv  weights_base.best.hdf5


In [0]:
from google.colab import files
files.download('predictions.csv')