## Imports

In [1]:
import nltk

In [3]:
nltk.download('all')

In [4]:
import numpy as np
import pandas as pd

In [5]:
import re

In [6]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
from sklearn.model_selection import train_test_split

## Dataset prepearing

1 - positive class \
0 - negative class

In [10]:
dataset = pd.read_csv('https://raw.githubusercontent.com/futurexskill/ml-model-deployment/main/Restaurant_Reviews.tsv.txt', delimiter="\t", quoting=3)

In [11]:
ps = PorterStemmer()

In [12]:
corpus = []
for i in range(0,1000):
  customer_review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i].lower())
  customer_review = customer_review.split()
  clean_review = [ps.stem(word) for word in customer_review if not word in set(stopwords.words("english"))]
  clean_review = ' '.join(clean_review)
  corpus.append(clean_review)

In [13]:
vectorizer = TfidfVectorizer(max_features=1500, min_df=3, max_df=0.6)

In [14]:
X = vectorizer.fit_transform(corpus).toarray()

In [15]:
y = dataset.iloc[:, 1].values

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Torch Modelling

In [17]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [18]:
import torch.optim as optim

In [19]:
Xtrain_ = torch.from_numpy(X_train).float()
Xtest_ = torch.from_numpy(X_test).float()

In [20]:
ytrain_ = torch.from_numpy(y_train)
ytest_ = torch.from_numpy(y_test)

In [21]:
Xtrain_.shape, ytrain_.shape

(torch.Size([800, 467]), torch.Size([800]))

In [22]:
Xtest_.shape, ytest_.shape

(torch.Size([200, 467]), torch.Size([200]))

In [23]:
input_size = Xtrain_.shape[1] # 467
output_size = 2 # 2 classes
hidden_size = 500

In [24]:
class Net(nn.Module):
  def __init__(self):
    super(Net, self).__init__()
    self.fc1 = torch.nn.Linear(input_size, hidden_size)
    self.fc2 = torch.nn.Linear(hidden_size, hidden_size)
    self.fc3 = torch.nn.Linear(hidden_size, output_size)

  def forward(self, X):
    X = torch.relu((self.fc1(X)))
    X = torch.relu((self.fc2(X)))
    X = self.fc3(X)

    return F.log_softmax(X, dim=1)

In [25]:
model = Net()

In [26]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.NLLLoss()

In [27]:
epochs = 100

In [28]:
for epoch in range(epochs):
  optimizer.zero_grad()
  Ypred = model(Xtrain_)
  loss = loss_fn(Ypred, ytrain_)
  loss.backward()
  optimizer.step()
  print(f"Epoch: {epoch}, loss: {loss.item()}")

Epoch: 0, loss: 0.6931762099266052
Epoch: 1, loss: 0.6676128506660461
Epoch: 2, loss: 0.518437385559082
Epoch: 3, loss: 0.33773723244667053
Epoch: 4, loss: 0.2016667127609253
Epoch: 5, loss: 0.1341266930103302
Epoch: 6, loss: 0.10079258680343628
Epoch: 7, loss: 0.07605332881212234
Epoch: 8, loss: 0.0645371824502945
Epoch: 9, loss: 0.049057699739933014
Epoch: 10, loss: 0.04135501757264137
Epoch: 11, loss: 0.047503501176834106
Epoch: 12, loss: 0.03359709680080414
Epoch: 13, loss: 0.03727592155337334
Epoch: 14, loss: 0.036070212721824646
Epoch: 15, loss: 0.036413565278053284
Epoch: 16, loss: 0.02981877326965332
Epoch: 17, loss: 0.03217986226081848
Epoch: 18, loss: 0.034409862011671066
Epoch: 19, loss: 0.02999388985335827
Epoch: 20, loss: 0.031606193631887436
Epoch: 21, loss: 0.03145970404148102
Epoch: 22, loss: 0.03134901449084282
Epoch: 23, loss: 0.028621504083275795
Epoch: 24, loss: 0.030525164678692818
Epoch: 25, loss: 0.030605239793658257
Epoch: 26, loss: 0.028411507606506348
Epoch: 2

## Check score

In [29]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [30]:
Ytestpred = model(Xtest_)

In [31]:
y_test_pred = np.array([int(torch.argmax(Ytestpred[i])) for i in range(len(Ytestpred))])

In [32]:
y_test_pred, y_test

(array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
        0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
        0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
        0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
        0, 1]),
 array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
        1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
        1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0,
        1, 0, 0, 1, 0,

In [33]:
cmknn = confusion_matrix(y_test, y_test_pred)
cmknn

array([[70, 27],
       [24, 79]])

In [34]:
print(f'Torch accuracy: {accuracy_score(y_test, y_test_pred)}')

Torch accuracy: 0.745


## Check on samples

In [35]:
sample = ["Good batting by England"]

In [36]:
sample = vectorizer.transform(sample).toarray()

In [37]:
sentiment = model(torch.from_numpy(sample).float())
sentiment

tensor([[-1.5953, -0.2267]], grad_fn=<LogSoftmaxBackward>)

In [38]:
np.array([int(torch.argmax(sentiment[i])) for i in range(len(sentiment))])

array([1])

In [39]:
sample2 = ["bad perfomance by India in the match"]
sample2 = vectorizer.transform(sample2).toarray()
sentiment2 = model(torch.from_numpy(sample2).float())
sentiment2

tensor([[  0.0000, -43.9535]], grad_fn=<LogSoftmaxBackward>)

In [40]:
np.array([int(torch.argmax(sentiment2[i])) for i in range(len(sentiment2))])

array([0])

## Save model

In [41]:
torch.save(model.state_dict(), 'text_classifier_pytorch')