<a href="https://colab.research.google.com/github/Madushani-Weerasekara/Colloborative-Filtering-Recommendation-Engine/blob/main/nlp_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# import needed libraries

In [47]:
# import necesary libraries
! pip install torchtext
! pip install --upgrade torchtext

import pandas as pd
import numpy as np
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler




# mount google drive

In [48]:
# mount google drive at /content/drive so files stored there can be read by path
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# load the dataset

In [49]:
# define the file path of the reviews CSV (absolute path under the /content/drive mount)
file_path = '/content/drive/MyDrive/Colab Notebooks/GitHub Projects/Colloborative-Filtering-Recommendation-Engine/Data/user_reviews_edited.csv'

In [50]:
# load the dataset from the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)


In [51]:
df.head(5)

Unnamed: 0,Review ID,User ID,Location,Rating,Review
0,1,1,Yala,5,Amazing wildlife experience
1,2,1,Horton Plains,4,Beautiful hiking trails
2,3,2,Kandy,4,Rich history and culture
3,4,2,Sigiriya,5,Stunning ancient rock fortress
4,5,3,Hikkaduwa,3,Nice beaches but can get crowded


In [52]:
df.shape

(10, 5)

In [53]:
df.isnull().sum()

Unnamed: 0,0
Review ID,0
User ID,0
Location,0
Rating,0
Review,0


In [54]:
df.columns

Index(['Review ID', 'User ID', 'Location', 'Rating', 'Review '], dtype='object')

In [55]:
# New frame without the 'Rating' column; `df` itself is left untouched.
df_drop = df.drop('Rating', axis='columns')



In [56]:
df_drop.columns

Index(['Review ID', 'User ID', 'Location', 'Review '], dtype='object')

In [57]:
# Preview of the column names with surrounding whitespace stripped.
# The result is not assigned back, so df_drop keeps its original column names.
df_drop.columns.str.strip()

Index(['Review ID', 'User ID', 'Location', 'Review'], dtype='object')

In [58]:
# Copy the reviews from 'Review ' (trailing space) into a column named 'Review'
df_drop['Review'] = df['Review ']

In [59]:
df_drop.columns

Index(['Review ID', 'User ID', 'Location', 'Review ', 'Review'], dtype='object')

In [60]:
# New frame without the column whose name carries a trailing space.
new_df = df_drop.drop('Review ', axis='columns')

In [61]:
new_df.columns

Index(['Review ID', 'User ID', 'Location', 'Review'], dtype='object')

In [62]:
# Replace each review text with its lower-cased version
new_df['Review'] = new_df['Review'].str.lower()

In [63]:
new_df['Review']

Unnamed: 0,Review
0,amazing wildlife experience
1,beautiful hiking trails
2,rich history and culture
3,stunning ancient rock fortress
4,nice beaches but can get crowded
5,lovely beach and great food
6,incredible safari and animal sightings
7,beautiful scenery and tea plantations
8,a great place for nature lovers
9,interesting elephant orphanage


In [64]:
# label the reviews as good (1) or bad (0)
# A review gets label 1 when it contains at least one of the keywords below, otherwise 0.
# define regex pattern for the keywords
regex_pattern = r'amazing|beautiful|nice|lovely|incredible|great|interesting|stunning'
# na=False makes a missing review evaluate to False instead of NaN; NaN is
# truthy for np.where, so without it a missing review would be labelled 1.
has_keyword = new_df['Review'].str.contains(regex_pattern, case=False, na=False)
new_df['label'] = np.where(has_keyword, 1, 0)

# create a custom dataset class

In [65]:
# Dataset class for handling the reviews and labels
class ReviewDataset(Dataset):
    """Pair each review with its label so a DataLoader can iterate over them.

    Items are returned exactly as stored; no conversion is applied.

    Args:
        reviews: Sized, indexable collection with one entry per sample.
        labels: Sized, indexable collection aligned position-by-position
            with ``reviews``.

    Raises:
        ValueError: If ``reviews`` and ``labels`` have different lengths.
    """

    def __init__(self, reviews, labels):
        # Fail early: a length mismatch would otherwise surface later as an
        # IndexError mid-epoch, or silently ignore the surplus labels.
        if len(reviews) != len(labels):
            raise ValueError(
                f"reviews and labels must have the same length, "
                f"got {len(reviews)} and {len(labels)}"
            )
        self.reviews = reviews
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the ``(review, label)`` pair stored at position ``idx``."""
        return self.reviews[idx], self.labels[idx]


# Tokenize the reviews and create vocabulary

In [67]:
# tokenization and vocabulary

tokenizer = get_tokenizer('basic_english')

# Tokenize every review once; the token lists feed both the counter and the vocabulary.
tokenized_reviews = [tokenizer(review) for review in new_df['Review']]

# Word frequencies over all reviews
counter = Counter()
for tokens in tokenized_reviews:
  counter.update(tokens)

#create a vocabulary from the tokenized reviews
# build_vocab_from_iterator expects an iterable that yields lists of tokens.
# Iterating the Counter itself yields its keys (single strings), and each string
# would then be consumed character by character, giving a character-level
# vocabulary in which whole words fall back to <unk>.
vocab = build_vocab_from_iterator(tokenized_reviews, specials=["<unk>"]) # <unk> for unknown words.

# Set the default index for unknown words
vocab.set_default_index(vocab["<unk>"])

def encode_review(review):
  """Tokenize ``review`` and return the vocabulary index of each token, in order."""
  token_ids = []
  for word in tokenizer(review):
    token_ids.append(vocab[word])
  return token_ids


# Prepare data for training

In [69]:
# encode reviews and prepare dataset
encoded_reviews = [encode_review(review) for review in new_df['Review']]
max_length = max(len(review) for review in encoded_reviews)

# Right-pad every encoded review with 0 up to the length of the longest one.
# NOTE(review): 0 looks like it is also the index of <unk> (the only special
# token in the vocabulary), so padding and unknown words are indistinguishable
# - consider a dedicated padding token.
padded_reviews = [review + [0] * (max_length - len(review)) for review in encoded_reviews]

labels = new_df['label'].values
# NOTE(review): samples are stored as plain Python lists; confirm the training
# loop expects the batch structure DataLoader's default collate builds from lists.
dataset = ReviewDataset(padded_reviews, labels)

#split the dataset (80% train, remainder test)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
# A seeded generator makes the train/test membership identical on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size], generator=split_generator
)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)