In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -U torchtext==0.6.0
import spacy
import pandas as pd
from torchtext.data import Field, BucketIterator, TabularDataset
from sklearn.model_selection import train_test_split



In [13]:
#Downloading and Cloning INDICNLP
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install Morfessor

# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"/content/indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES="/content/indic_nlp_resources"

Cloning into 'indic_nlp_library'...
remote: Enumerating objects: 1325, done.[K
remote: Counting objects: 100% (147/147), done.[K
remote: Compressing objects: 100% (103/103), done.[K
remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178[K
Receiving objects: 100% (1325/1325), 9.57 MiB | 6.30 MiB/s, done.
Resolving deltas: 100% (688/688), done.
Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126[K
Receiving objects: 100% (133/133), 149.77 MiB | 25.36 MiB/s, done.
Resolving deltas: 100% (51/51), done.
Collecting Morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6


In [14]:
import sys
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))

from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

#Load IndicNLP library
from indicnlp import loader
loader.load()

In [4]:
hindi_txt = open('/content/drive/MyDrive/processed.hi', encoding = 'utf8').read().split('\n')
english_txt = open('/content/drive/MyDrive/processed.en', encoding = 'utf8').read().split('\n')

In [5]:
hindi_txt[:10]

['आदि में परमेश्वर ने आकाश और पृथ्वी की सृष्टि की ।',
 'और पृथ्वी बेडौल और सुनसान पड़ी थी ; और गहरे जल के ऊपर अन्धियारा था : तथा परमेश्वर का आत्मा जल के ऊपर मण्डलाता था ।',
 'तब परमेश्वर ने कहा , उजियाला हो : तो उजियाला हो गया ।',
 'और परमेश्वर ने उजियाले को देखा कि अच्छा है ; और परमेश्वर ने उजियाले को अन्धियारे से अलग किया ।',
 'और परमेश्वर ने उजियाले को दिन और अन्धियारे को रात कहा । तथा सांझ हुई फिर भोर हुआ । इस प्रकार पहिला दिन हो गया । ।',
 'फिर परमेश्वर ने कहा , जल के बीच एक ऐसा अन्तर हो कि जल दो भाग हो जाए ।',
 'तब परमेश्वर ने एक अन्तर करके उसके नीचे के जल और उसके ऊपर के जल को अलग अलग किया ; और वैसा ही हो गया ।',
 'और परमेश्वर ने उस अन्तर को आकाश कहा । तथा सांझ हुई फिर भोर हुआ । इस प्रकार दूसरा दिन हो गया । ।',
 'फिर परमेश्वर ने कहा , आकाश के नीचे का जल एक स्थान में इकट्ठा हो जाए और सूखी भूमि दिखाई दे ; और वैसा ही हो गया ।',
 'और परमेश्वर ने सूखी भूमि को पृथ्वी कहा ; तथा जो जल इकट्ठा हुआ उसको उस ने समुद्र कहा : और परमेश्वर ने देखा कि अच्छा है ।']

In [6]:
english_txt[:10]

['In the beginning God created the heaven and the earth .',
 'And the earth was without form , and void ; and darkness was upon the face of the deep . And the Spirit of God moved upon the face of the waters .',
 'And God said , Let there be light : and there was light .',
 'And God saw the light , that it was good : and God divided the light from the darkness .',
 'And God called the light Day , and the darkness he called Night . And the evening and the morning were the first day .',
 'And God said , Let there be a firmament in the midst of the waters , and let it divide the waters from the waters .',
 'And God made the firmament , and divided the waters which were under the firmament from the waters which were above the firmament : and it was so .',
 'And God called the firmament Heaven . And the evening and the morning were the second day .',
 'And God said , Let the waters under the heaven be gathered together unto one place , and let the dry land appear : and it was so .',
 'And Go

In [7]:
raw_data = {'Hindi' : [line for line in hindi_txt],
            'English' : [line for line in english_txt]}

In [9]:
df = pd.DataFrame(raw_data, columns = ['Hindi', 'English'])
df[:10]

Unnamed: 0,Hindi,English
0,आदि में परमेश्वर ने आकाश और पृथ्वी की सृष्टि की ।,In the beginning God created the heaven and th...
1,और पृथ्वी बेडौल और सुनसान पड़ी थी ; और गहरे जल...,"And the earth was without form , and void ; an..."
2,"तब परमेश्वर ने कहा , उजियाला हो : तो उजियाला ह...","And God said , Let there be light : and there ..."
3,और परमेश्वर ने उजियाले को देखा कि अच्छा है ; औ...,"And God saw the light , that it was good : and..."
4,और परमेश्वर ने उजियाले को दिन और अन्धियारे को ...,"And God called the light Day , and the darknes..."
5,"फिर परमेश्वर ने कहा , जल के बीच एक ऐसा अन्तर ह...","And God said , Let there be a firmament in the..."
6,तब परमेश्वर ने एक अन्तर करके उसके नीचे के जल औ...,"And God made the firmament , and divided the w..."
7,और परमेश्वर ने उस अन्तर को आकाश कहा । तथा सांझ...,And God called the firmament Heaven . And the ...
8,"फिर परमेश्वर ने कहा , आकाश के नीचे का जल एक स्...","And God said , Let the waters under the heaven..."
9,और परमेश्वर ने सूखी भूमि को पृथ्वी कहा ; तथा ज...,And God called the dry land Earth ; and the ga...


In [18]:
train, test = train_test_split(df, test_size = 0.1)
train.to_json('train.json', orient = 'records', lines = True)
test.to_json('test.json', orient = 'records', lines = True)

In [15]:
from indicnlp.tokenize import indic_tokenize  

#Tokenizing hindi sentences
def TokenizeHindi(indic_string):
    return indic_tokenize.trivial_tokenize(indic_string)

In [16]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

#Tokenizing English sentences
def TokenizeEnglish(text):
  return [token for token in word_tokenize(text)]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [21]:
hindi = Field(sequential = True, use_vocab = True, tokenize = TokenizeHindi, lower = True)
english = Field(sequential = True, use_vocab = True, tokenize = TokenizeEnglish, lower = True)

In [22]:
fields = {'Hindi': ('hi', hindi), 'English': ('eng', english)}

In [23]:
train_data, test_data = TabularDataset.splits(
    path = '',
    train = 'train.json',
    test = 'test.json',
    format = 'json',
    fields = fields
)

In [24]:
hindi.build_vocab(train_data, max_size = 10000, min_freq = 2)
english.build_vocab(train_data, max_size = 10000, min_freq = 2)

In [25]:
train_iterator, trat_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size = 64,
    device = 'cuda'
)

In [26]:
for batch in train_iterator:
  print(batch)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

[torchtext.data.batch.Batch of size 64]
	[.hi]:[torch.cuda.LongTensor of size 45x64 (GPU 0)]
	[.eng]:[torch.cuda.LongTensor of size 45x64 (GPU 0)]

[torchtext.data.batch.Batch of size 64]
	[.hi]:[torch.cuda.LongTensor of size 62x64 (GPU 0)]
	[.eng]:[torch.cuda.LongTensor of size 57x64 (GPU 0)]

[torchtext.data.batch.Batch of size 64]
	[.hi]:[torch.cuda.LongTensor of size 52x64 (GPU 0)]
	[.eng]:[torch.cuda.LongTensor of size 46x64 (GPU 0)]

[torchtext.data.batch.Batch of size 64]
	[.hi]:[torch.cuda.LongTensor of size 53x64 (GPU 0)]
	[.eng]:[torch.cuda.LongTensor of size 51x64 (GPU 0)]

[torchtext.data.batch.Batch of size 64]
	[.hi]:[torch.cuda.LongTensor of size 69x64 (GPU 0)]
	[.eng]:[torch.cuda.LongTensor of size 71x64 (GPU 0)]

[torchtext.data.batch.Batch of size 64]
	[.hi]:[torch.cuda.LongTensor of size 52x64 (GPU 0)]
	[.eng]:[torch.cuda.LongTensor of size 51x64 (GPU 0)]

[torchtext.data.batch.Batch of size 64]
	[.hi]