In [32]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
import torch
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator
import re # for using regular expressions(regex)
import string # Using this for preprocessing and cleaning the text (first time using it;))
import pandas as pd

#### Preprocessing

In [8]:
# Loading dataset
total_df = pd.read_csv(r'C:\Users\Lenovo\Desktop\my_random_projects\RNN\Comments\CommentsApril2017.csv', low_memory=False)
total_df.head()

Unnamed: 0,approveDate,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,editorsSelection,parentID,...,userLocation,userTitle,userURL,inReplyTo,articleID,sectionName,newDesk,articleWordCount,printPage,typeOfMaterial
0,1491245186,This project makes me happy to be a 30+ year T...,22022598.0,22022598,<br/>,comment,1491237000.0,1,False,0.0,...,"Riverside, CA",,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
1,1491188619,Stunning photos and reportage. Infuriating tha...,22017350.0,22017350,,comment,1491180000.0,1,False,0.0,...,<br/>,,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
2,1491188617,Brilliant work from conception to execution. I...,22017334.0,22017334,<br/>,comment,1491179000.0,1,False,0.0,...,Raleigh NC,,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
3,1491167820,NYT reporters should provide a contributor's l...,22015913.0,22015913,<br/>,comment,1491150000.0,1,False,0.0,...,"Missouri, USA",,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
4,1491167815,Could only have been done in print. Stunning.,22015466.0,22015466,<br/>,comment,1491147000.0,1,False,0.0,...,"Tucson, Arizona",,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News


In [9]:
# Only keeping the comment column
df = total_df[['commentBody']]
df.dtypes

commentBody    object
dtype: object

#### Data Cleaning

In [10]:
# Checking for null values
df['commentBody'].isna().sum()
# No null values;)

np.int64(0)

In [11]:
# lowercasing 
df.loc[:,'commentBody'] = df['commentBody'].str.lower() # : tells to assign it to all rows for the commentBody

In [12]:
# removing punctuations
df['commentBody'] = df['commentBody'].str.replace(f"[{string.punctuation}]", "", regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['commentBody'] = df['commentBody'].str.replace(f"[{string.punctuation}]", "", regex=True)


In [13]:
df

Unnamed: 0,commentBody
0,this project makes me happy to be a 30 year ti...
1,stunning photos and reportage infuriating that...
2,brilliant work from conception to execution iv...
3,nyt reporters should provide a contributors li...
4,could only have been done in print stunning
...,...
243827,sorry but pudding has nothing to do with it de...
243828,while it would be quite punny to spell it dess...
243829,see above comments deserts is the proper
243830,john rubinstein had two brief scenes with joan...


In [14]:
# removing numbers
df['commentBody'] = df['commentBody'].apply(lambda x: re.sub(r'\b\d+\b', '<NUM>', str(x))) # r'\b\d+\b':This matches digits only when they stand alone.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['commentBody'] = df['commentBody'].apply(lambda x: re.sub(r'\b\d+\b', '<NUM>', str(x))) # r'\b\d+\b':This matches digits only when they stand alone.


In [15]:
df

Unnamed: 0,commentBody
0,this project makes me happy to be a <NUM> year...
1,stunning photos and reportage infuriating that...
2,brilliant work from conception to execution iv...
3,nyt reporters should provide a contributors li...
4,could only have been done in print stunning
...,...
243827,sorry but pudding has nothing to do with it de...
243828,while it would be quite punny to spell it dess...
243829,see above comments deserts is the proper
243830,john rubinstein had two brief scenes with joan...


#### Train & Test split

In [16]:
# split off test set first
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# split remaining into train and validation (e.g., 80/20 of remaining)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

#### Tokenization

In [17]:
tokenizer = get_tokenizer("basic_english")

In [26]:
# tokenize each row
df['tokens'] = df['commentBody'].apply(tokenizer)

# build vocab
vocab = build_vocab_from_iterator(df['tokens'], specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])  # unknown words map to <unk>

# numericalize each token list
df['ids'] = df['tokens'].apply(lambda tokens: [vocab[token] for token in tokens])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens'] = df['commentBody'].apply(tokenizer)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ids'] = df['tokens'].apply(lambda tokens: [vocab[token] for token in tokens])


In [36]:
# Padding, since LSTM (RNN-based models requires fixed-length inputs.)
seqs = [torch.tensor(x) for x in df['ids']]  # Convert each list to a tensor

padded_seqs = pad_sequence(seqs, batch_first=True, padding_value=0)

In [37]:
padded_seqs

tensor([[   14,  1900,   284,  ...,     0,     0,     0],
        [ 5001,  3487,     3,  ...,     0,     0,     0],
        [ 1770,   125,    39,  ...,     0,     0,     0],
        ...,
        [  100,   846,   522,  ...,     0,     0,     0],
        [ 1231, 87208,    71,  ...,     0,     0,     0],
        [10994,  2781,     3,  ...,     0,     0,     0]])