# Review Classification

## Batching and Model Training

### Import Libraries

In [6]:
import os

In [7]:
# Show the current working directory: the relative paths used later in this notebook
# ("Reviews.csv", "./train_test_data/...") are resolved against it.
os.getcwd()

'D:\\Jupyter work\\Natural Language processing\\Review Classification'

In [1]:
import pandas as pd # Loading data
import numpy as np
import warnings
from sklearn.model_selection import train_test_split # train test splits

# Silence all warnings for the rest of the session to keep cell output clean.
# NOTE(review): this is a blanket filter, so pandas and deprecation warnings that
# may point at real problems are hidden as well.
warnings.filterwarnings('ignore')

### Data Loading and Processing

We will first do all the necessary pre-processing before creating batches and training the model. All the steps are explained in the notebook named `Text Cleaning.ipynb`.

In [2]:
# Read the raw reviews dataset
data = pd.read_csv("Reviews.csv")
# Drop duplicate reviews (rows sharing the same user, profile name, timestamp and text)
new_data = data.drop_duplicates(subset=['UserId', 'ProfileName', 'Time', 'Text'])
# Keep only the useful columns. .copy() makes an independent frame, so adding the
# helper column below does not write into a slice of new_data (which is what
# triggers pandas' SettingWithCopyWarning and may or may not propagate).
useful_data = new_data[['Text', 'Score']].copy()
# Approximate the length of each review as its whitespace-separated word count
# (no tokenizer involved)
useful_data['sudo_length'] = useful_data.Text.str.split().str.len()
# Keep only reviews with more than 20 and fewer than 100 words
useful_data = useful_data[(useful_data.sudo_length > 20) & (useful_data.sudo_length < 100)]
# Remove the helper length column
useful_data = useful_data.drop(['sudo_length'], axis = 1)
# Display the first 5 rows
useful_data.head()

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


#### Create Train and Test sets

In [8]:
# Hold out 20% of the filtered reviews. A fixed random_state makes the split, and
# therefore the CSV files written below, identical on every Restart & Run All.
train, test = train_test_split(useful_data, test_size = 0.2, random_state = 42)
# Make sure the output folder exists: to_csv does not create missing directories.
os.makedirs("./train_test_data", exist_ok=True)
# Persist both splits without the DataFrame index so the files contain exactly the
# Text and Score columns.
train.to_csv("./train_test_data/train.csv", index=False)
test.to_csv("./train_test_data/test.csv", index=False)

In [10]:
import torchtext
from torchtext.data import TabularDataset, Field, BucketIterator
import spacy

In [11]:
# Load spaCy's 'en_core_web_sm' English pipeline; tokenize_en below uses it to
# split review text into tokens.
tok = spacy.load('en_core_web_sm')

In [16]:
def tokenize_en(sent):
    """Split an English sentence into a list of token strings.

    Only spaCy's tokenizer (``tok.tokenizer``) is applied rather than the whole
    pipeline: the tagger/parser/NER annotations were never used here, and
    skipping them makes tokenizing a large corpus much cheaper while producing
    the same tokens.

    Args:
        sent (str): Text to tokenize.

    Returns:
        list[str]: The text of every token, in order of appearance.
    """
    return [token.text for token in tok.tokenizer(sent)]

In [17]:
# Sanity-check the tokenizer on a short sentence containing punctuation and a
# contraction.
sent = "hello their, why don't u have a seat?"
tokenize_en(sent)

['hello', 'their', ',', 'why', 'do', "n't", 'u', 'have', 'a', 'seat', '?']

In [20]:
# Review text: a sequential field whose raw string is split by tokenize_en.
SENT_FIELD = Field(tokenize=tokenize_en, sequential=True)

# Review score: a single non-sequential value with no vocabulary and no pad/unk
# tokens -- presumably the numeric score is used directly as the label (confirm).
LABEL_FIELD = Field(
    sequential=False,
    use_vocab=False,
    pad_token=None,
    unk_token=None,
)

# (column name, field) pairs, listed in the same order as the CSV columns.
data_fields = [('Text', SENT_FIELD), ('Score', LABEL_FIELD)]

In [None]:
# Build torchtext datasets from the CSV files saved earlier. The header row is
# skipped and the columns are parsed with data_fields.
# NOTE(review): `train` is rebound here from the pandas DataFrame to a torchtext
# dataset, and the held-out 'test.csv' file is the one loaded as the validation set.
train, val = TabularDataset.splits(
    path='./train_test_data', train='train.csv', validation='test.csv',
    format='csv', skip_header=True, fields=data_fields,
)