# Primary Modules

In [4]:
import pandas as pd
#Train_Test Splitting
from sklearn.model_selection import train_test_split
#Count_Vec
from sklearn.feature_extraction.text import CountVectorizer
#Seq Model
from keras.models import Sequential
#NN Layers
from keras import layers
import matplotlib.pyplot as plt
#Tokenizing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Data Set Addressing

In [9]:
# Load the three labelled-sentence data sets into a single DataFrame `df`.
print ("------------Choosing a Data Set and put it in Dataframe------------")

# Source name -> path of its tab-separated file (one "sentence<TAB>label" per line).
filepath_dict = {'yelp':   'data/yelp_labelled.txt',
                 'amazon': 'data/amazon_cells_labelled.txt',
                 'imdb':   'data/imdb_labelled.txt'}

# One DataFrame per source is collected here and concatenated once at the end.
df_list = []

for source, filepath in filepath_dict.items():
    # names= supplies the column labels, so the first line of each file is
    # read as data rather than as a header.
    source_df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    source_df['source'] = source  # remember which data set each row came from
    df_list.append(source_df)

# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# every source restarts at 0 and index labels are duplicated across sources.
df = pd.concat(df_list, ignore_index=True)

# Report success only after every file has actually been read.
print('data added successfully')
print('------------------------')

------------Choosing a Data Set and put it in Dataframe------------
data added successfully
------------------------


# Data Frame Schema

In [8]:
# Preview the first rows of the combined frame.
#   label  : 1 = positive sentiment, 0 = negative sentiment
#   source : which of the three data sets the row was read from
df.head(n=5)

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [13]:
# Baseline model: work on a single source (yelp) only.
print ("------------Defining a Baseline Model------------")

# Boolean mask over the 'source' column keeps only the yelp rows.
is_yelp = df['source'] == 'yelp'
df_yelp = df[is_yelp]

# Model inputs (the raw sentences) and targets (the labels) as arrays.
sentences = df_yelp['sentence'].values
y = df_yelp['label'].values

------------Defining a Baseline Model------------


# Train, Test data splitting, 75:25

In [15]:
# Hold out 25% of the sentences (and their labels) for testing: a 75:25 split.
# random_state fixes the shuffle so the same split is produced on every run.
split = train_test_split(sentences, y, test_size=0.25, random_state=1000)
sentences_train, sentences_test, y_train, y_test = split

# Network Definition, Training and Testing Steps:
### 1) Select the model type
### 2) Add layers to the model (one or more)
### 3) Compile the model
### 4) Fit the model
### 5) Evaluate the model

# Training phase based on a Deep Neural Network

In [16]:
print ("create sentence and label object")
print ("split sentences_train and sentences_test")

print ("------------A Primer on (Deep) Neural Networks------------")
print ("analyze neural network parameters")
print ("analyze neural network layers")

print ("------------First Keras Model with vectorize------------")
print ("vectorize data")

# The network cannot consume raw text: each sentence is first turned into a
# feature vector of token counts, and those vectors are what the input layer
# receives. The network's own weights and biases are learned later, in fit().
vectorizer = CountVectorizer()

# Learn the vocabulary from the training sentences only and encode them in the
# same step (this fits the vectorizer, not the neural network).
X_train = vectorizer.fit_transform(sentences_train)

# Encode the test sentences with the vocabulary learned from the training set.
X_test = vectorizer.transform(sentences_test)

create sentence and label object
split sentences_train and sentences_test
------------A Primer on (Deep) Neural Networks------------
analyze neural network parameters
analyze neural network layers
------------First Keras Model with vectorize------------
vectorize data


### Types of Activation Functions in a DNN:
### ReLU (for hidden layers),
### Sigmoid (for a binary classification output),
### Softmax (for a multi-class classification output)

### Loss Function: cross-entropy

# Model Def

### Keras offers two ways to build a model: 1) Functional 2) Sequential
### A Sequential model is a plain stack of layers, each feeding the next, which is all this network needs

## First: the model's input dimension must be defined
### The input dimension is the length of each feature vector (the number of features)
## Second: the model's layers
### Layers must be defined in the order the data flows through them

In [17]:
print ("network layers modeling")

# Width of the input layer: the number of feature columns in X_train.
input_dim = X_train.shape[1]

# Sequential model: the layers are applied in the order listed.
model = Sequential([
    # Hidden layer with 10 units (10 is the layer width, not a batch size).
    layers.Dense(10, input_dim=input_dim, activation='relu'),
    # Output layer: a single sigmoid unit for the binary label.
    layers.Dense(1, activation='sigmoid'),
])

# Configure the learning process: optimizer, loss to minimise, and the metric
# reported during training/evaluation (accuracy here; precision or recall
# could be used instead).
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

network layers modeling


In [18]:
# Display the input width that was taken from X_train.shape[1].
input_dim 

1714

# Model Architecture

In [19]:
# Print the model's layers with their output shapes and parameter counts.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 10)                17150     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 11        
Total params: 17,161
Trainable params: 17,161
Non-trainable params: 0
_________________________________________________________________


# Training phase Starts
### Learning process is iterative

In [20]:
# Train the model; the returned History object is kept in `history`.
history = model.fit(
    X_train, y_train,
    epochs=100,                        # number of full passes over the training data
    batch_size=10,                     # number of samples per gradient update
    validation_data=(X_test, y_test),  # scored after each epoch, never trained on
    verbose=False,                     # suppress per-epoch progress output
)

Training Accuracy: 1.0000
Testing Accuracy:  0.7960


# Model Evaluation

In [None]:
# Report accuracy on the training split first, then on the test split.
# evaluate() yields the loss followed by the compiled metric (accuracy).
for template, features, labels in (
        ("Training Accuracy: {:.4f}", X_train, y_train),
        ("Testing Accuracy:  {:.4f}", X_test, y_test)):
    loss, accuracy = model.evaluate(features, labels, verbose=False)
    print(template.format(accuracy))

# Word Embedding
## Word embeddings (WE): data with fewer dimensions but more information
#### The word vectors are dense: every dimension carries information
### Well suited to describing non-numeric values that have a sequence order
#### Semantic meaning is mapped into a geometric space (the embedding space)

# Two Approaches to Implementing Word Embeddings
## 1) Learn the embedding inside the neural network, as one of its layers
## 2) Use a pre-trained embedding (computed in a separate step before training)
### After that, it can be used directly in the model

In [21]:
print ("------------Second Keras Model with Word Embeddings------------")

# Build the word -> integer index from the training sentences only.
# num_words is the maximum number of words the tokenizer keeps.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

# Index 0 is reserved (it is the padding value below), hence the +1.
vocab_size = len(tokenizer.word_index) + 1

# Encode every sentence as a list of word indices.
train_sequences = tokenizer.texts_to_sequences(sentences_train)
test_sequences = tokenizer.texts_to_sequences(sentences_test)

# Show one training sentence next to its integer encoding.
print(sentences_train[2])
print(train_sequences[2])

# Index assigned to a few sample words (raises KeyError for a word that does
# not occur in the training sentences).
for word in ['new', 'all', 'happy', 'sad']:
    print('{}: {}'.format(word, tokenizer.word_index[word]))

# Bring every sequence to the same length by appending zeros ('post' padding).
# NOTE: X_train / X_test are rebound here. They held the CountVectorizer
# matrices before, so the earlier bag-of-words cells must not be re-run after
# this one without re-running the vectorizer cell first.
maxlen = 100
X_train = pad_sequences(train_sequences, padding='post', maxlen=maxlen)
X_test = pad_sequences(test_sequences, padding='post', maxlen=maxlen)

# First padded training sequence.
print(X_train[0])


------------Second Keras Model with Word Embeddings------------
Of all the dishes, the salmon was the best, but all were great.
[11, 43, 1, 171, 1, 283, 3, 1, 47, 26, 43, 24, 22]
new: 313
all: 43
happy: 320
sad: 450
[  1  10   3 282 739  25   8 208  30  64 459 230  13   1 124   5 231   8
  58   5  67   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]


# pre-trained Word Embeddings
### e.g. Word2Vec, based on a neural network (fastest and most precise)
### GloVe, based on a co-occurrence matrix (easier and moderate)

In [22]:
# Third model: an Embedding layer whose weights are supplied up front.
print ("------------Third Keras Model with pre-trained Word Embeddings------------")

# Length of each word vector.
embedding_dim = 50

# Initial embedding weights: one row per vocabulary index, embedding_dim columns.
# The shape is derived from vocab_size / embedding_dim instead of a hard-coded
# (1747, 50), so the cell keeps working when the vocabulary or the embedding
# size changes; the Embedding layer requires exactly this shape.
# NOTE: these values are random, not loaded from a pre-trained embedding file.
# They only stand in for one and are fine-tuned because trainable=True.
numpy_matrix = np.random.rand(vocab_size, embedding_dim)

model = Sequential()
# Maps each of the maxlen word indices of a sentence to its embedding vector.
model.add(layers.Embedding(vocab_size, embedding_dim,
                           weights=[numpy_matrix],
                           input_length=maxlen,
                           trainable=True))
# Collapses the sequence axis: per embedding dimension, keep the maximum value.
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
# Single sigmoid unit for the binary label.
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

------------Third Keras Model with pre-trained Word Embeddings------------


# Model Architecture

In [23]:
# Print the model's layers with their output shapes and parameter counts.
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 50)           87350     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 50)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 10)                510       
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 11        
Total params: 87,871
Trainable params: 87,871
Non-trainable params: 0
_________________________________________________________________


# Training and Testing Accuracy

In [25]:
# Train the model; the returned History object is kept in `history`.
history = model.fit(
    X_train, y_train,
    epochs=50,                         # number of full passes over the training data
    batch_size=10,                     # number of samples per gradient update
    validation_data=(X_test, y_test),  # scored after each epoch, never trained on
    verbose=False,                     # suppress per-epoch progress output
)

# Report accuracy on the training split first, then on the test split.
# evaluate() yields the loss followed by the compiled metric (accuracy).
for template, features, labels in (
        ("Training Accuracy: {:.4f}", X_train, y_train),
        ("Testing Accuracy:  {:.4f}", X_test, y_test)):
    loss, accuracy = model.evaluate(features, labels, verbose=False)
    print(template.format(accuracy))


Training Accuracy: 1.0000
Testing Accuracy:  0.7960
