In [3]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout

import nltk
import re
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,classification_report

In [4]:
# Thin wrapper that remembers where the CSV lives and reads it on demand
class LoadData:
    """Hold a CSV location and load it into a pandas DataFrame."""

    def __init__(self, path):
        # Stored untouched and handed to pd.read_csv when load_data() is called.
        self.path = path

    def load_data(self):
        """Read the CSV at ``self.path`` and return it as a DataFrame."""
        return pd.read_csv(self.path)

In [11]:
# create a class for preprocessing the data
class PreprocessData:
    """Convert raw tweets into padded integer sequences plus binary labels.

    The input frame must contain an ``airline_sentiment`` column (values
    ``'positive'`` / ``'negative'``) and a ``text`` column.
    """

    def __init__(self, data):
        """Store the frame and make sure the NLTK stopword corpus is present.

        Args:
            data: DataFrame with ``airline_sentiment`` and ``text`` columns.
        """
        self.data = data
        # initialize X and y; both stay None until preprocessing() has run
        self.X_final = None
        self.y_final = None
        nltk.download('stopwords')

    @staticmethod
    def splitting(data):
        """Derive the label column and return ``(X, y)``.

        Declared as a staticmethod so it can be called both on the class
        (``PreprocessData.splitting(df)``) and on an instance.

        Side effect: adds a ``label`` column to ``data`` in place.
        Sentiment values other than ``'positive'`` / ``'negative'`` are not in
        the mapping and therefore become NaN in ``label``.

        Args:
            data: DataFrame with ``airline_sentiment`` and ``text`` columns.

        Returns:
            Tuple ``(X, y)``: the ``text`` Series and the ``label`` Series.
        """
        data['label'] = data['airline_sentiment'].map({'positive': 1, 'negative': 0})
        X = data['text']
        y = data['label']
        return X, y

    def preprocessing(self, voc_size=5000, sent_length=20, test_size=0.2, random_state=42):
        """Clean, stem, hash-encode and pad the tweets, then split train/test.

        Also stores the full encoded arrays on ``self.X_final`` / ``self.y_final``.

        Args:
            voc_size: Size of the hashing space passed to ``one_hot``.
            sent_length: Length every sequence is padded/truncated to.
            test_size: Fraction of rows held out for the test split.
            random_state: Seed for ``train_test_split``.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``.
        """
        X, y = self.splitting(self.data)

        ps = PorterStemmer()
        # Build the stopword lookup once; it is invariant across tweets and a
        # set makes each membership test O(1).
        stop_words = set(stopwords.words('english'))

        corpus = []
        # Iterate over values rather than indexing by label so frames whose
        # index is not 0..n-1 (e.g. after filtering) are handled as well.
        for message in X:
            # Keep letters only, lowercase, and tokenise on whitespace.
            tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
            stemmed = [ps.stem(word) for word in tokens if word not in stop_words]
            corpus.append(' '.join(stemmed))

        # NOTE(review): Keras documents one_hot as a wrapper around
        # hashing_trick using Python's built-in hash, which is not stable
        # across interpreter runs; confirm before reusing a trained model in
        # a different process.
        onehot_repr = [one_hot(words, voc_size) for words in corpus]

        # Left-pad (and truncate) so every row has exactly sent_length ids.
        embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)

        self.X_final = np.array(embedded_docs)
        self.y_final = np.array(y)

        X_train, X_test, y_train, y_test = train_test_split(
            self.X_final, self.y_final, test_size=test_size, random_state=random_state
        )
        return X_train, X_test, y_train, y_test

In [6]:
# model class
class Model:
    """Builds the bidirectional-LSTM binary classifier."""

    def __init__(self):
        # Set by create_model(); None until a network has been built.
        self.model = None

    def create_model(self, voc_size=5000, embedding_vector_features=40, sent_length=20):
        """Build and compile the network, store it on ``self.model``, return it.

        Architecture: Embedding -> Dropout(0.3) -> Bidirectional(LSTM(100))
        -> Dropout(0.3) -> Dense(1, sigmoid), compiled with binary
        cross-entropy, the Adam optimizer and an accuracy metric.

        Args:
            voc_size: Number of distinct token ids the Embedding accepts; it
                has to match the vocabulary size used to encode the text.
            embedding_vector_features: Width of each embedding vector.
            sent_length: Length of the padded input sequences.

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential()
        model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
        model.add(Dropout(0.3))
        model.add(Bidirectional(LSTM(100)))
        model.add(Dropout(0.3))
        # Single sigmoid unit: output is a probability-like score in [0, 1].
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        self.model = model
        return self.model

In [7]:
# create a class for training the model which inherits from the model class
class TrainModel(Model):
    """Fit the network built by ``Model.create_model`` on the given arrays."""

    def __init__(self, X_train, X_test, y_train, y_test):
        """Keep the four split arrays for later use by ``train_model``.

        Args:
            X_train: Training features.
            X_test: Features passed to ``fit`` as validation data.
            y_train: Training labels.
            y_test: Labels passed to ``fit`` as validation data.
        """
        super().__init__()
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def train_model(self, epochs=5, batch_size=64):
        """Build a fresh model, fit it, and return it.

        Args:
            epochs: Number of passes over the training data.
            batch_size: Mini-batch size used by ``fit``.

        Returns:
            The fitted model (also available as ``self.model``).
        """
        # Go through the instance so a subclass overriding create_model()
        # is honoured instead of being bypassed by a hard-coded class lookup.
        model = self.create_model()
        # X_test / y_test are handed to fit() as validation_data, i.e. they are
        # used for the per-epoch validation metrics reported during training.
        model.fit(
            self.X_train,
            self.y_train,
            validation_data=(self.X_test, self.y_test),
            epochs=epochs,
            batch_size=batch_size,
        )
        return model

In [13]:
# create a class for evaluating the model
class EvaluateModel:
    """Print classification metrics for a fitted model on a held-out set."""

    def __init__(self, model, X_test, y_test):
        """Store the model and the evaluation data.

        Args:
            model: Object exposing ``predict(X)`` that returns scores.
            X_test: Features to score.
            y_test: True labels matching ``X_test``.
        """
        self.model = model
        self.X_test = X_test
        self.y_test = y_test

    def evaluate_model(self):
        """Print the confusion matrix, classification report and accuracy.

        Scores strictly greater than 0.5 are labelled 1, everything else 0.
        Returns nothing; the results are only printed.
        """
        # Use the data stored on this instance. Reading a module-level X_test
        # only works when the notebook happens to define one, and would
        # silently evaluate on the wrong data if it differs.
        y_pred = (self.model.predict(self.X_test) > 0.5).astype("int32")
        cm = confusion_matrix(self.y_test, y_pred)
        print(cm)
        print(classification_report(self.y_test, y_pred))
        print(accuracy_score(self.y_test, y_pred))

In [17]:
# implement inference class
class Inference:
    """Turn model scores into hard 0/1 predictions."""

    def __init__(self, model, X_test):
        """Store the model and the features to predict on.

        Args:
            model: Object exposing ``predict(X)`` that returns scores.
            X_test: Features to score.
        """
        self.model = model
        self.X_test = X_test

    def predict(self):
        """Return int32 class labels for ``self.X_test``.

        Scores strictly greater than 0.5 map to 1, all others to 0. The result
        has the same shape as the array returned by ``model.predict``.
        """
        # Predict on the instance's own data, not on whatever global X_test
        # the notebook may (or may not) currently hold.
        y_pred = (self.model.predict(self.X_test) > 0.5).astype("int32")
        return y_pred

In [18]:
# Run the whole pipeline: load -> preprocess -> train -> evaluate -> predict
if __name__ == '__main__':
    # Read the raw CSV straight into a DataFrame.
    data = LoadData('data/airline_sentiment_analysis.csv').load_data()

    # Encode the tweets and carve out the train / test arrays.
    preprocess = PreprocessData(data)
    X_train, X_test, y_train, y_test = preprocess.preprocessing()

    # Build and fit the network.
    train = TrainModel(X_train, X_test, y_train, y_test)
    model = train.train_model()

    # Print confusion matrix, classification report and accuracy.
    evaluate = EvaluateModel(model, X_test, y_test)
    evaluate.evaluate_model()

    # Hard 0/1 predictions for the test features.
    inference = Inference(model, X_test)
    y_pred = inference.predict()
    print(y_pred)

[[1768   94]
 [ 140  307]]
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1862
           1       0.77      0.69      0.72       447

    accuracy                           0.90      2309
   macro avg       0.85      0.82      0.83      2309
weighted avg       0.90      0.90      0.90      2309

0.8986574274577739
[[0]
 [1]
 [0]
 ...
 [0]
 [0]
 [0]]
