# Building a baseline model using Keras and scikit-learn

In [1]:
import pandas as pd

In [2]:
# One tab-separated file per review source; each row is a sentence followed by
# its label, and the files carry no header row (column names are supplied here).
filepath_dict = {'yelp':   'data/yelp_labelled.txt',
                 'amazon': 'data/amazon_cells_labelled.txt',
                 'imdb':   'data/imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    # Use a dedicated name for the per-file frame so it does not shadow the
    # combined `df` built below.
    source_df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    source_df['source'] = source  # Tag every row with the file it came from
    df_list.append(source_df)

# ignore_index=True gives the combined frame a single 0..N-1 index instead of
# repeating each file's own 0..n-1 row labels.
df = pd.concat(df_list, ignore_index=True)
print(df.iloc[0:5])

                                            sentence  label source
0                           Wow... Loved this place.      1   yelp
1                                 Crust is not good.      0   yelp
2          Not tasty and the texture was just nasty.      0   yelp
3  Stopped by during the late May bank holiday of...      1   yelp
4  The selection on the menu was great and so wer...      1   yelp


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# NOTE: the per-source loop further down rebinds `vectorizer` to a fresh
# CountVectorizer() before fitting, so this instance is never fitted.
# min_df=1 keeps every token that occurs in at least one document, which is
# all an integer min_df of 0 could ever select; recent scikit-learn releases
# reject integer min_df values below 1 when the vectorizer is fitted.
vectorizer = CountVectorizer(min_df=1, lowercase=False)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
# Train and score one bag-of-words logistic-regression baseline per source.
# After the loop, X_train / X_test / y_train / y_test still hold the split of
# the last source processed.
for source in df["source"].unique():
    source_mask = df["source"] == source
    df_model = df[source_mask]

    sentences = df_model["sentence"].values
    y = df_model["label"].values

    # Fixed seed, 80% train / 20% test.
    sen_train, sen_test, y_train, y_test = train_test_split(
        sentences,
        y,
        train_size=0.8,
        test_size=0.2,
        random_state=500,
    )

    # The vocabulary is learned from the training sentences only; both splits
    # are then encoded against it.
    vectorizer = CountVectorizer().fit(sen_train)
    X_train = vectorizer.transform(sen_train)
    X_test = vectorizer.transform(sen_test)

    classify = LogisticRegression().fit(X_train, y_train)
    score = classify.score(X_test, y_test)
    print(score)
    

0.8
0.81
0.7666666666666667


In [12]:
from keras.models import Sequential
from keras import layers

In [13]:
from keras.backend import clear_session
# Per the Keras documentation, clear_session() resets the global state Keras
# has accumulated; it is called here before the model is built in the next cell.
clear_session()

In [15]:
# Size the input from the bag-of-words matrix. X_train is whatever the
# per-source loop left behind, i.e. the training split of the last source it
# processed.
input_dim = X_train.shape[1]

# One hidden layer of 10 ReLU units feeding a single sigmoid output unit.
model = Sequential([
    layers.Dense(10, input_dim=input_dim, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

In [16]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# tracked alongside the loss. The summary prints the layer/parameter table.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 10)                23060     
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
                                                                 
Total params: 23,071
Trainable params: 23,071
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Train silently for 50 epochs in mini-batches of 10. The test split doubles
# as the validation data, and the object returned by fit() is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=10,
    validation_data=(X_test, y_test),
    verbose=False,
)

In [85]:
# Report accuracy on the training split first, then on the test split.
# `loss` and `accuracy` end up holding the test-split values.
for template, features, targets in (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    loss, accuracy = model.evaluate(features, targets, verbose=False)
    print(template.format(accuracy))

Training Accuracy: 1.0000
Testing Accuracy:  0.7800
