# <font color = 'dodgerblue'>**Tokenization Approaches with spaCy - Real Dataset**

# <font color = 'dodgerblue'>**Install/Import Libraries**

In [None]:
!pip install -U spacy



In [None]:
# Mount Google Drive only when running on Colab. On a local machine the
# google.colab package is not installed, so the mount is skipped instead of
# stopping the notebook with an import error.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from pathlib import Path
import tarfile
import pandas as pd
import spacy

In [None]:
# show the installed spaCy version
spacy.__version__

'3.4.1'

# <font color = 'dodgerblue'>**Specify Data Folders**

In [None]:
# Root location for everything this notebook downloads and extracts.
# When running on Colab, use the Drive location below instead:
#base_path = '/content/drive/MyDrive/datasets'
base_path = '/home/harpreet/Insync/google_drive_shaannoor/data'
base_folder = Path(base_path)

# data_folder receives the extracted dataset, archive_folder the downloaded tarballs
data_folder = base_folder.joinpath('datasets')
archive_folder = base_folder.joinpath('archive')

# <font color = 'dodgerblue'>**Download Data**

## <font color = 'dodgerblue'>**Step1: use wget to download data files from URL**

In [None]:
# complete data link: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

url='https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
file = archive_folder/'aclImdb_v1.tar.gz'
!wget {url} -P {archive_folder} -O {file}

--2022-08-28 11:39:50--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘/home/harpreet/Insync/google_drive_shaannoor/datasets/archive/aclImdb_v1.tar.gz’


2022-08-28 11:39:54 (21.0 MB/s) - ‘/home/harpreet/Insync/google_drive_shaannoor/datasets/archive/aclImdb_v1.tar.gz’ saved [84125825/84125825]



## <font color = 'dodgerblue'>**Step2: check content of folder where data was downloaded**

In [None]:
# print the name of every entry in the archive folder whose name contains 'tar'
for archive_name in (entry.name for entry in archive_folder.iterdir() if 'tar' in entry.name):
    print(archive_name)


20news-bydate.tar.gz
scale_whole_review.tar.gz
aclImdb_v1.tar.gz
review_polarity.tar.gz


## <font color = 'dodgerblue'>**Step3: Check content of zipped/tar folder**

In [None]:
# create a pathlib object pointing at the downloaded archive we want to inspect and untar
file = archive_folder /'aclImdb_v1.tar.gz'

In [None]:
# Read the member names from the archive using the tarfile library.
# Nothing is written to disk here; getnames() only returns the list of names.
# NOTE(review): the original note said this cell can take up to 25 minutes -
# presumably when the archive sits on a slow/mounted drive; confirm

with  tarfile.open(file, 'r') as tar:
  tar_file_names = tar.getnames()


In [None]:
# preview the first ten member names of the archive
tar_file_names[:10]

['aclImdb',
 'aclImdb/test',
 'aclImdb/train',
 'aclImdb/test/neg',
 'aclImdb/test/pos',
 'aclImdb/train/neg',
 'aclImdb/train/pos',
 'aclImdb/train/unsup',
 'aclImdb/imdbEr.txt',
 'aclImdb/imdb.vocab']

## <font color = 'dodgerblue'>**Step 4: unzip/untar files**

In [None]:
file = archive_folder/'aclImdb_v1.tar.gz'
with tarfile.open(file, 'r') as tar:
    # The archive was downloaded from the network, so extract it with the
    # 'data' extraction filter whenever this Python version provides one;
    # the filter refuses members that would be written outside data_folder.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(path=data_folder, filter='data')
    else:
        # older Python without extraction filters: plain extraction as before
        tar.extractall(path=data_folder)

## <font color = 'dodgerblue'>**Step 5: Understand the structure of unzipped folder**

In [None]:
# we will use rglob which will help us to specify the pattern to search
# ** - Recursively matches zero or more directories that fall under the current directory.
# Only the folder structure is wanted here. Depending on the Python version, a
# pattern ending in '**' can also yield regular files, so keep directories explicitly.

for entries in (data_folder/'aclImdb').rglob('**'):
    if entries.is_dir():
        print(entries)

/home/harpreet/Insync/google_drive_shaannoor/datasets/data/aclImdb
/home/harpreet/Insync/google_drive_shaannoor/datasets/data/aclImdb/train
/home/harpreet/Insync/google_drive_shaannoor/datasets/data/aclImdb/train/neg
/home/harpreet/Insync/google_drive_shaannoor/datasets/data/aclImdb/train/pos
/home/harpreet/Insync/google_drive_shaannoor/datasets/data/aclImdb/train/unsup
/home/harpreet/Insync/google_drive_shaannoor/datasets/data/aclImdb/test
/home/harpreet/Insync/google_drive_shaannoor/datasets/data/aclImdb/test/neg
/home/harpreet/Insync/google_drive_shaannoor/datasets/data/aclImdb/test/pos


## <font color = 'dodgerblue'>**Step 6a: combine all text files and create dataframe**

In [None]:
# Function to combine reviews from multiple text files
# the concepts were covered in the first lecture

def get_reviews(path):
  """Read every ``.txt`` file directly inside ``path`` and return the contents.

  Args:
    path: ``pathlib.Path`` of a folder that holds one review per text file.
      Both absolute and relative paths are supported.

  Returns:
    list of str with one entry per text file, ordered by file name.
  """
  reviews = []
  # iterdir() yields entries in arbitrary order; sort them so that repeated
  # runs produce the reviews in the same order
  for file in sorted(path.iterdir()):
    # keep regular text files only
    if file.suffix == '.txt' and file.is_file():
      # iterdir() already returns paths that include `path`, so `file` is
      # opened directly (joining path/file duplicated the folder part for
      # relative paths). The encoding is given explicitly so the result does
      # not depend on the platform default.
      with open(file, 'r', encoding='utf-8') as f:
        reviews.append(f.read())
  return reviews

In [None]:
# Function to create dataframe from extracted list of files

def make_dataframe(folder):
  """Build a labelled review dataframe from a folder with ``pos`` and ``neg`` subfolders.

  Args:
    folder: ``pathlib.Path`` that contains the ``pos`` and ``neg`` review folders.

  Returns:
    pandas.DataFrame with a ``Reviews`` column (review text) and a ``Labels``
    column of dtype int32 (1 for reviews from ``pos``, 0 for reviews from ``neg``).
  """
  positive_reviews = get_reviews(folder / 'pos')
  negative_reviews = get_reviews(folder / 'neg')
  # The Reviews column holds the positive reviews followed by the negative
  # ones, so the labels are a run of 1s followed by a run of 0s with the
  # matching lengths.
  labels = [1] * len(positive_reviews) + [0] * len(negative_reviews)
  data = pd.DataFrame({'Reviews': positive_reviews + negative_reviews,
                       'Labels': labels})
  # astype returns a new dataframe, so the result must be assigned back for
  # the int32 conversion to take effect
  data = data.astype({'Labels': 'int32'})
  return data

In [None]:
# create the train data set from the pos/neg folders under aclImdb/train
train_data = make_dataframe(data_folder/'aclImdb/train')

In [None]:
# create the test data set from the pos/neg folders under aclImdb/test
test_data = make_dataframe(data_folder/'aclImdb/test')

### <font color = 'dodgerblue'>**Save dataframe to csv file**

In [None]:
# write the train dataframe (including its index) to aclImdb/train.csv
train_data.to_csv(data_folder/'aclImdb'/'train.csv')

In [None]:
# write the test dataframe (including its index) to aclImdb/test.csv
test_data.to_csv(data_folder/'aclImdb'/'test.csv')

In [None]:
# summary of the columns, their non-null counts and dtypes
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Reviews  25000 non-null  object
 1   Labels   25000 non-null  object
dtypes: object(2)
memory usage: 390.8+ KB


# <font color = 'dodgerblue'>**Load csv file**

In [None]:
# reload the train data from the csv file; index_col=0 uses the first csv
# column (the index written by to_csv) as the dataframe index
train_data = pd.read_csv(data_folder/ 'aclImdb'/'train.csv', index_col=0)

In [None]:
# shape of the dataframe as (number of rows, number of columns)
train_data.shape

(25000, 2)

In [None]:
# display first five rows
train_data.head()

Unnamed: 0,Reviews,Labels
0,Ever wanted to know just how much Hollywood co...,1
1,The movie itself was ok for the kids. But I go...,1
2,You could stage a version of Charles Dickens' ...,1
3,this was a fantastic episode. i saw a clip fro...,1
4,and laugh out loud funny in many scenes.<br />...,1


# <font color = 'dodgerblue'>**Import Spacy Model**

In [None]:
# download and install the en_core_web_sm spaCy model
!python -m spacy download en_core_web_sm

2022-08-28 12:33:05.114326: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-28 12:33:05.114996: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-28 12:33:05.115384: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
Collecting en-core-web-sm==3.4.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# <font color = 'dodgerblue'>**Compare tokenization approaches**

In [None]:
# load the en_core_web_sm model that was downloaded in the previous section
# the resulting nlp object is used by all tokenization methods below
nlp = spacy.load('en_core_web_sm')

## <font color = 'dodgerblue'>**Method1 : Typical approach using spacy**

In [None]:
# generating list of tokens
def tokenize(text:str=None):
    """Split ``text`` into a list of token strings using the spaCy pipeline ``nlp``.

    Args:
        text: the string to tokenize. ``None`` (the default) is treated as
            "no text".

    Returns:
        list of str with the text of every token in document order; an empty
        list when ``text`` is ``None``.
    """
    # the signature allows None, so answer with "no tokens" rather than
    # handing None to the pipeline
    if text is None:
        return []
    return [token.text for token in nlp(text)]

In [None]:
%%timeit -r1
# DONOT RUN THIS Cell in the class
# it is only for demonstration purpose that it can take a long time
# as indicated by the output below- 
# it took around 8 minutes on a 128 gb RAM machine
# it took 21 minutes on colab
train_data['tokens_method1'] = train_data['Reviews'].apply(tokenize)

8min 15s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
# first five rows, now including the tokens_method1 column
train_data.head()

Unnamed: 0,Reviews,Labels,tokens_method1,tokens_method2,tokens_method3
0,Ever wanted to know just how much Hollywood co...,1,"[Ever, wanted, to, know, just, how, much, Holl...","[Ever, wanted, to, know, just, how, much, Holl...","[Ever, wanted, to, know, just, how, much, Holl..."
1,The movie itself was ok for the kids. But I go...,1,"[The, movie, itself, was, ok, for, the, kids, ...","[The, movie, itself, was, ok, for, the, kids, ...","[The, movie, itself, was, ok, for, the, kids, ..."
2,You could stage a version of Charles Dickens' ...,1,"[You, could, stage, a, version, of, Charles, D...","[You, could, stage, a, version, of, Charles, D...","[You, could, stage, a, version, of, Charles, D..."
3,this was a fantastic episode. i saw a clip fro...,1,"[this, was, a, fantastic, episode, ., i, saw, ...","[this, was, a, fantastic, episode, ., i, saw, ...","[this, was, a, fantastic, episode, ., i, saw, ..."
4,and laugh out loud funny in many scenes.<br />...,1,"[and, laugh, out, loud, funny, in, many, scene...","[and, laugh, out, loud, funny, in, many, scene...","[and, laugh, out, loud, funny, in, many, scene..."


## <font color = 'dodgerblue'>**Method 2: Using nlp.pipe from Spacy**

In [None]:
%%timeit -r1
## DO NOT Run this cell in the class
# It took around 10 minutes
# spaCy includes built-in support for multiprocessing with nlp.pipe
# this can speed up the processing
# it took 1 min 42 secs on a 128 gb RAM machine with 16 cores
# it took 15 mins on colab (colab has 4 cores)

tokens_method2 = []

for doc in nlp.pipe(train_data.Reviews.values, batch_size=1000, n_process=-1):
    tokens = [token.text for token in doc] 
       
    tokens_method2.append(tokens)

train_data['tokens_method2'] = tokens_method2

1min 36s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
# first five rows, now including the tokens_method2 column
train_data.head()

Unnamed: 0,Reviews,Labels,tokens_method1,tokens_method2
0,Ever wanted to know just how much Hollywood co...,1,"[Ever, wanted, to, know, just, how, much, Holl...","[Ever, wanted, to, know, just, how, much, Holl..."
1,The movie itself was ok for the kids. But I go...,1,"[The, movie, itself, was, ok, for, the, kids, ...","[The, movie, itself, was, ok, for, the, kids, ..."
2,You could stage a version of Charles Dickens' ...,1,"[You, could, stage, a, version, of, Charles, D...","[You, could, stage, a, version, of, Charles, D..."
3,this was a fantastic episode. i saw a clip fro...,1,"[this, was, a, fantastic, episode, ., i, saw, ...","[this, was, a, fantastic, episode, ., i, saw, ..."
4,and laugh out loud funny in many scenes.<br />...,1,"[and, laugh, out, loud, funny, in, many, scene...","[and, laugh, out, loud, funny, in, many, scene..."


## <font color = 'dodgerblue'>**Method 3: Using nlp.pipe and disabling components that are not required**

In [None]:
%%timeit -r1
# in addition to multiprocessing with nlp.pipe
# we can get significant speed improvements if we disable the components that we do not need
# it took around 3 minutes
# it took 26 secs on a 128 gb RAM machine with 16 cores
# 1 min 28 secs on colab

token_list_method3 = []
disabled = nlp.select_pipes(disable= ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])
for doc in nlp.pipe(train_data.Reviews.values, batch_size=1000, n_process=-1):
    tokens = [token.text for token in doc]         
    token_list_method3.append(tokens)
train_data['tokens_method3'] = token_list_method3
disabled.restore()

24.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
# first five rows, now including the tokens_method3 column
train_data.head()

Unnamed: 0,Reviews,Labels,tokens_method1,tokens_method2,tokens_method3
0,Ever wanted to know just how much Hollywood co...,1,"[Ever, wanted, to, know, just, how, much, Holl...","[Ever, wanted, to, know, just, how, much, Holl...","[Ever, wanted, to, know, just, how, much, Holl..."
1,The movie itself was ok for the kids. But I go...,1,"[The, movie, itself, was, ok, for, the, kids, ...","[The, movie, itself, was, ok, for, the, kids, ...","[The, movie, itself, was, ok, for, the, kids, ..."
2,You could stage a version of Charles Dickens' ...,1,"[You, could, stage, a, version, of, Charles, D...","[You, could, stage, a, version, of, Charles, D...","[You, could, stage, a, version, of, Charles, D..."
3,this was a fantastic episode. i saw a clip fro...,1,"[this, was, a, fantastic, episode, ., i, saw, ...","[this, was, a, fantastic, episode, ., i, saw, ...","[this, was, a, fantastic, episode, ., i, saw, ..."
4,and laugh out loud funny in many scenes.<br />...,1,"[and, laugh, out, loud, funny, in, many, scene...","[and, laugh, out, loud, funny, in, many, scene...","[and, laugh, out, loud, funny, in, many, scene..."
