In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%pip install poutyne

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# %pip install --upgrade git+https://github.com/GRAAL-Research/poutyne.git@dev #install poutyne
%pip install --upgrade colorama #install colorama
%pip install --upgrade pymagnitude-light #install pymagnitude-light
%matplotlib inline

import gzip
import os
import pickle
import shutil
import warnings

import requests
import torch
import torch.nn as nn
import torch.optim as optim
from poutyne import set_seeds
from poutyne.framework import Experiment
from pymagnitudelight import Magnitude
from torch.nn.functional import cross_entropy
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pad_sequence
from torch.utils.data import DataLoader

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Unidirectional, single-layer LSTM whose hidden size equals the embedding size.
dimension = 300
num_layer = 1
bidirectional = False

# input_size and hidden_size are passed positionally; both are `dimension`.
lstm_network = nn.LSTM(
    dimension,
    dimension,
    num_layers=num_layer,
    batch_first=True,
    bidirectional=bidirectional,
)

In [None]:
input_dim = dimension  # the output of the LSTM
# One output unit per label. The label dictionaries in this notebook define
# 9 tags (indices 0..8); the former value of 10 left an output class with no
# label, which the index -> tag lookup at inference time cannot resolve.
# NOTE: a checkpoint saved with the previous 10-unit head will not load into
# this layer; remove stale checkpoint files before retraining.
tag_dimension = 9

fully_connected_network = nn.Linear(input_dim, tag_dimension)

In [None]:
input_dim

300

In [None]:
# Use the first GPU when one is available, otherwise fall back to the CPU so
# the notebook still runs on a CPU-only runtime.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Optimisation hyper-parameters shared by the data loaders and the optimizer.
batch_size = 64
lr = 0.1

epoch_number = 10

In [None]:
# Fix the random seed (poutyne helper) for reproducibility.
# NOTE(review): lstm_network and fully_connected_network are created in
# earlier cells, i.e. before this call, so their initial weights are presumably
# not governed by this seed — confirm and move the call up if needed.
set_seeds(42)

In [None]:
# Each pickle holds a list of (address_string, [tag, ...]) tuples.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
# The files are opened with `with` so the handles are closed right after loading.
# The example counts below are the original comments; the batch listings
# printed further down suggest the files actually loaded are much smaller.
with open("/content/labeled_data (1).p", "rb") as train_file:
    train_data = pickle.load(train_file)  # 728,789 examples
with open("/content/labeled_data_valid (1).p", "rb") as valid_file:
    valid_data = pickle.load(valid_file)  # 182,198 examples
# FIXME: this is the same file as the validation split, so the "test" metrics
# merely repeat the validation metrics. Point this at a held-out test file.
with open("/content/labeled_data_valid (1).p", "rb") as test_file:
    test_data = pickle.load(test_file)  # 100,000 examples

In [None]:
def download_from_url(model: str, saving_dir: str, extension: str):
    """
    Download ``{model}.{extension}`` from the deepparse public repository.

    Args:
        model: Base name of the remote file (e.g. ``"fasttext"``).
        saving_dir: Local directory receiving the file; created if missing.
        extension: File extension without the leading dot (e.g. ``"magnitude.gz"``).

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            HTML error page is never saved as if it were the model.
    """
    print("Downloading the model.")
    model_url = "https://graal.ift.ulaval.ca/public/deepparse/{}." + extension
    url = model_url.format(model)

    os.makedirs(saving_dir, exist_ok=True)
    destination = os.path.join(saving_dir, f"{model}.{extension}")

    # Stream the payload to disk chunk by chunk: the file can be several GB,
    # which should not be buffered entirely in memory.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(destination, "wb") as out_file:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                out_file.write(chunk)


def download_fasttext_magnitude_embeddings(saving_dir):
    """
    Make sure the pre-trained fastText model (magnitude format) is on disk.

    If ``<saving_dir>/fasttext.magnitude`` is missing, the gzipped file is
    downloaded, decompressed next to it, and the archive is removed.

    Args:
        saving_dir: Directory in which the model is stored.

    Returns:
        The path of the ``fasttext.magnitude`` file inside ``saving_dir``.
    """
    model = "fasttext"
    extension = "magnitude"
    file_name = os.path.join(saving_dir, f"{model}.{extension}")
    if not os.path.isfile(file_name):
        warnings.warn(
            "The fastText pre-trained word embeddings will be download in magnitude format (2.3 GO), "
            "this process will take several minutes."
        )
        download_from_url(model=model, saving_dir=saving_dir, extension=extension + ".gz")

        # file_name already contains saving_dir; joining saving_dir a second
        # time produced e.g. "data/data/fasttext.magnitude.gz" for a relative
        # directory other than "./".
        gz_file_name = file_name + ".gz"
        partial_file_name = file_name + ".part"
        print("Unzip the model.")
        # Decompress into a temporary name and rename at the end, so an
        # interrupted run never leaves a truncated file that a later call
        # would accept as the finished model.
        with gzip.open(gz_file_name, "rb") as f, open(partial_file_name, "wb") as f_out:
            shutil.copyfileobj(f, f_out)
        os.replace(partial_file_name, file_name)
        os.remove(gz_file_name)
    return file_name


class EmbeddingVectorizer:
    """
    Converts address strings into sequences of fastText word embeddings.

    The class previously defined ``__call__`` twice; the second definition
    (single address) silently replaced the first (list of addresses). The
    effective single-address behaviour is kept on ``__call__`` and the batch
    variant is exposed under its own name.
    """

    def __init__(self, path="./"):
        """
        Load the magnitude fastText model, downloading it into ``path`` first
        if it is not already there.
        """
        file_name = download_fasttext_magnitude_embeddings(saving_dir=path)
        self.embedding_model = Magnitude(file_name)

    def __call__(self, address):
        """
        Convert address to embedding vectors
        :param address: The address to convert (words separated by whitespace)
        :return: A list with one embedding vector per word
        """
        return [self.embedding_model.query(word) for word in address.split()]

    def vectorize_batch(self, addresses):
        """
        Vectorizes a list of addresses.
        :param addresses: An iterable of address strings
        :return: A list holding, for each address, its list of embedding vectors
        """
        return [self(address) for address in addresses]


# Uses the default path "./": the fastText magnitude file is downloaded there
# if it is not already present, then loaded.
embedding_vectorizer = EmbeddingVectorizer()

In [None]:
class DatasetBucket:
    """
    Indexable dataset of ``(address, tags)`` pairs that vectorizes on access.

    ``data`` is a sequence whose items hold the address string at position 0
    and the list of tag names at position 1. ``embedding_vectorizer`` is a
    callable turning an address string into its word vectors.
    """

    def __init__(self, data, embedding_vectorizer):
        self.data = data
        self.embedding_vectorizer = embedding_vectorizer
        # Tag name -> class index, numbered in declaration order.
        # ('building no' used to sit between 'society_name' and 'street'.)
        tag_names = (
            'flat_apartment_number',
            'society_name',
            'street',
            'landmark',
            'sub_locality',
            'area_locality_name',
            'city_town',
            'pincode',
            'unknown',
        )
        self.tags_set = {name: position for position, name in enumerate(tag_names)}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        # Vectorization is deferred until a sample is actually requested.
        return self._item_vectorizing(self.data[item])

    def _item_vectorizing(self, item):
        """Return ``(word_vectors, tag_indices)`` for one raw sample."""
        word_vectors = self.embedding_vectorizer(item[0])
        tag_indices = self._convert_tags_to_idx(item[1])
        return word_vectors, tag_indices

    def _convert_tags_to_idx(self, tags):
        """Map every tag name to its class index (KeyError on an unknown name)."""
        return [self.tags_set[name] for name in tags]


# One dataset wrapper per split; all three share the same embedding_vectorizer
# and vectorize a sample only when it is indexed.
train_dataset_vectorizer = DatasetBucket(train_data, embedding_vectorizer)
valid_dataset_vectorizer = DatasetBucket(valid_data, embedding_vectorizer)
test_dataset_vectorizer = DatasetBucket(test_data, embedding_vectorizer)

In [None]:
# Sanity check on the first training sample; `tag` is unpacked but not used here.
# NOTE: the f-string prints every embedding vector in full, so the cell output
# is very long.
address, tag = train_dataset_vectorizer[0]  # Unpack the first tuple
print(f"The vectorized address is now a list of vectors {address}")

The vectorized address is now a list of vectors [array([ 0.02325456,  0.03707229,  0.0049116 ,  0.03062604,  0.08376793,
        0.00642177,  0.00135967,  0.0208587 ,  0.02357467, -0.29028088,
       -0.02337095, -0.02262662,  0.0039097 , -0.00678048, -0.06434749,
       -0.01164662,  0.02810531, -0.05848467, -0.04093806,  0.04993952,
        0.0456836 ,  0.07107708,  0.0392685 ,  0.02746471,  0.01665277,
        0.00869989,  0.04787574, -0.02994883, -0.01290582,  0.02296908,
        0.00421662, -0.01034104, -0.00485097, -0.03158264, -0.05035931,
        0.05072984, -0.01351666,  0.00195226,  0.02320314, -0.00362392,
        0.01605988, -0.02856024,  0.00963255,  0.04986977, -0.01333941,
        0.14832153,  0.01825037, -0.00837681, -0.02942097, -0.00389294,
        0.01853295,  0.04559087, -0.05780271, -0.04407778,  0.02088004,
       -0.06555015,  0.07033247, -0.02001717,  0.01117153, -0.00516496,
        0.02736741,  0.00383148,  0.00583375,  0.03039943,  0.0100781 ,
        0.11103

In [None]:
train_data[:10] 

[('10/1004  tikunijiwadi road  shubharambh  manpada  thane west    400610',
  ['flat_apartment_number',
   'street',
   'street',
   'society_name',
   'unknown',
   'city_town',
   'unknown',
   'pincode']),
 ('b-4/1  s n 7/8 tatya tope society    opp shivarkar garden  wanowrie  pune 411040  ',
  ['flat_apartment_number',
   'society_name',
   'street',
   'street',
   'society_name',
   'society_name',
   'society_name',
   'landmark',
   'landmark',
   'landmark',
   'area_locality_name',
   'city_town',
   'pincode']),
 ('2  reshma residency  paramhans ngr  paud rd  kothrud  chinchwad  411038',
  ['flat_apartment_number',
   'society_name',
   'society_name',
   'sub_locality',
   'sub_locality',
   'street',
   'street',
   'area_locality_name',
   'unknown',
   'pincode']),
 ('2  riddhiraj society  nursery road  b/h yashoda heights  gohaur baugh  bilimora  gandevi  396321',
  ['flat_apartment_number',
   'society_name',
   'society_name',
   'street',
   'street',
   'landmark',


In [None]:
def pad_collate_fn(batch):
    """
    Collate ``(word_vectors, tag_indices)`` samples into one padded batch.

    Args:
        batch: A list of pairs; the first element of each pair is the
            sequence of word vectors of an address and the second element is
            the matching sequence of tag indices.

    Returns:
        A tuple ``(x, y)``. ``x`` is a ``PackedSequence`` built from the word
        vectors, zero-padded to the longest sequence and ordered from longest
        to shortest. ``y`` is a LongTensor of tag indices in the same order,
        padded with -100 (the value ignored by the cross-entropy loss and the
        accuracy metric).
    """
    # Longest sample first, the ordering pack_padded_sequence expects by default.
    longest_first = sorted(batch, key=lambda sample: len(sample[0]), reverse=True)

    # Three parallel tuples: vector tensors, label tensors, sequence lengths.
    sequences_vectors, sequences_labels, lengths = zip(
        *(
            (torch.FloatTensor(vectors), torch.LongTensor(tags), len(vectors))
            for vectors, tags in longest_first
        )
    )

    padded_vectors = pad_sequence(sequences_vectors, batch_first=True, padding_value=0)
    # Packing lets the LSTM skip the padded time steps.
    packed_vectors = pack_padded_sequence(
        padded_vectors, torch.LongTensor(lengths), batch_first=True
    )

    padded_labels = pad_sequence(sequences_labels, batch_first=True, padding_value=-100)

    return packed_vectors, padded_labels

In [None]:
# Settings shared by the three loaders; only the training loader shuffles.
common_loader_options = dict(
    batch_size=batch_size,
    collate_fn=pad_collate_fn,
    num_workers=2,
)

train_loader = DataLoader(train_dataset_vectorizer, shuffle=True, **common_loader_options)
valid_loader = DataLoader(valid_dataset_vectorizer, **common_loader_options)
test_loader = DataLoader(test_dataset_vectorizer, **common_loader_options)

In [None]:
def __getitem__(self, index):
    """
    NOTE(review): orphaned leftover. This function is defined at module level,
    outside any class, and nothing in the visible notebook calls it. It reads
    ``self.sequences``, ``self.labels`` and ``self.word_embeddings``, none of
    which are set by ``DatasetBucket``. Consider deleting the cell.

    Returns a tuple ``(word_vectors, labels, length)`` for the sample at
    ``index``.
    """
    # get the sequence and label at the specified index
    seq, label = self.sequences[index], self.labels[index]
    
    # convert the sequence to a tensor of word embeddings
    seq_vectors = torch.stack([torch.tensor(self.word_embeddings[word]) for word in seq])
    
    # pad the sequence and label
    # Each call pads a one-element list, so no padding is actually added.
    # Labels would be padded with -1 here, whereas pad_collate_fn uses -100.
    padded_seq = pad_sequence([seq_vectors], batch_first=True)
    padded_label = pad_sequence([torch.LongTensor(label)], batch_first=True, padding_value=-1)
    
    # Debug output, printed on every access.
    print(f"Padded sequences size: {padded_seq.size()}")
    print(f"Padded labels size: {padded_label.size()}")
    
    return padded_seq[0], padded_label[0], len(seq_vectors)


In [None]:
subset = train_data[:2]

In [None]:
subset

[('10/1004  tikunijiwadi road  shubharambh  manpada  thane west    400610',
  ['flat_apartment_number',
   'street',
   'street',
   'society_name',
   'unknown',
   'city_town',
   'unknown',
   'pincode']),
 ('b-4/1  s n 7/8 tatya tope society    opp shivarkar garden  wanowrie  pune 411040  ',
  ['flat_apartment_number',
   'society_name',
   'street',
   'street',
   'society_name',
   'society_name',
   'society_name',
   'landmark',
   'landmark',
   'landmark',
   'area_locality_name',
   'city_town',
   'pincode'])]

In [None]:
# Walk the training loader once and show the (batch, longest_sequence) shape
# of every padded label tensor.
for data, target in train_loader:
    print(target.shape)

torch.Size([64, 23])
torch.Size([64, 18])
torch.Size([64, 18])
torch.Size([64, 20])
torch.Size([64, 22])
torch.Size([64, 19])
torch.Size([64, 17])
torch.Size([64, 19])
torch.Size([64, 20])
torch.Size([64, 17])
torch.Size([64, 19])
torch.Size([17, 17])


In [None]:
# Same inspection for the validation loader: one label-tensor shape per batch.
for data, target in valid_loader:
    print(target.shape)

torch.Size([64, 22])
torch.Size([64, 19])
torch.Size([64, 20])
torch.Size([64, 20])
torch.Size([64, 23])
torch.Size([64, 19])
torch.Size([64, 15])
torch.Size([64, 19])
torch.Size([64, 17])
torch.Size([64, 17])
torch.Size([64, 19])
torch.Size([17, 16])


In [None]:
class RecurrentNet(nn.Module):
    """
    Sequence tagger: an LSTM followed by a linear layer applied at every step.

    Both sub-networks are supplied by the caller. The LSTM's final
    ``(h_n, c_n)`` pair of the most recent forward pass is kept in
    ``hidden_state``.
    """

    def __init__(self, lstm_network, fully_connected_network):
        super().__init__()
        self.hidden_state = None

        self.lstm_network = lstm_network
        self.fully_connected_network = fully_connected_network

    def forward(self, packed_sequences_vectors):
        """
        Compute per-token tag scores for a packed batch.

        Args:
            packed_sequences_vectors: ``PackedSequence`` of word vectors.

        Returns:
            Tensor of shape ``(batch, num_tags, longest_sequence_length)``:
            the class axis is moved to position 1, the layout the
            cross-entropy loss expects for sequence targets.
        """
        packed_output, self.hidden_state = self.lstm_network(packed_sequences_vectors)
        # Back to a dense (batch, time, features) tensor; lengths are not needed.
        padded_output = pad_packed_sequence(packed_output, batch_first=True)[0]

        scores = self.fully_connected_network(padded_output)
        return scores.transpose(1, -1)


# Unidirectional baseline; in the visible notebook it is only referenced by the
# commented-out optimizer/Experiment cells below.
full_network = RecurrentNet(lstm_network, fully_connected_network)

In [None]:
# DONT RUN FOR BIDIRECTIONAL
#  optimizer = optim.SGD(full_network.parameters(), lr)

In [None]:
# DONT RUN FOR BIDIRECTIONAL
# exp = Experiment(
#     "./",
#     full_network,
#     device=device,
#     optimizer=optimizer,
#     loss_function=cross_entropy,
#     batch_metrics=["acc"],
# )

In [None]:
# import torch
# torch.backends.cudnn.benchmark = False


In [None]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [None]:
# exp.train(train_loader, valid_generator=valid_loader, epochs=epoch_number)

In [None]:
# Two-layer bidirectional LSTM tagger; rebinds the names used by the
# unidirectional configuration above.
dimension = 300
num_layer = 2
bidirectional = True

lstm_network = nn.LSTM(
    dimension,
    dimension,
    num_layers=num_layer,
    batch_first=True,
    bidirectional=bidirectional,
)

# Forward and backward hidden states are concatenated, hence twice the width.
input_dim = 2 * dimension

fully_connected_network = nn.Linear(in_features=input_dim, out_features=tag_dimension)

full_network_bi_lstm = RecurrentNet(lstm_network, fully_connected_network)

In [None]:
# List each parameter's name and shape instead of dumping the raw weight
# tensors, which floods the output without being readable.
for parameter_name, parameter in full_network_bi_lstm.named_parameters():
    print(parameter_name, tuple(parameter.shape))


[Parameter containing:
tensor([[-0.0281,  0.0339,  0.0509,  ..., -0.0276, -0.0470,  0.0484],
        [-0.0231,  0.0153, -0.0200,  ..., -0.0223, -0.0112, -0.0491],
        [-0.0367, -0.0094,  0.0438,  ...,  0.0002,  0.0116,  0.0200],
        ...,
        [-0.0541,  0.0203, -0.0274,  ...,  0.0519,  0.0299,  0.0427],
        [ 0.0464,  0.0255, -0.0432,  ..., -0.0105, -0.0439,  0.0035],
        [ 0.0230, -0.0242,  0.0401,  ..., -0.0320,  0.0132,  0.0009]],
       requires_grad=True), Parameter containing:
tensor([[-0.0054, -0.0010, -0.0257,  ..., -0.0513,  0.0387, -0.0502],
        [-0.0502,  0.0305,  0.0139,  ..., -0.0380, -0.0515, -0.0116],
        [ 0.0265, -0.0244,  0.0560,  ...,  0.0133, -0.0458,  0.0145],
        ...,
        [ 0.0076,  0.0117, -0.0122,  ..., -0.0032, -0.0343, -0.0574],
        [ 0.0299, -0.0117,  0.0512,  ...,  0.0185,  0.0126,  0.0026],
        [ 0.0464, -0.0058, -0.0543,  ...,  0.0414,  0.0348,  0.0102]],
       requires_grad=True), Parameter containing:
tensor([-

In [None]:
# Plain SGD over every parameter of the bidirectional tagger.
optimizer = optim.SGD(params=full_network_bi_lstm.parameters(), lr=lr)

In [None]:
# Train the bidirectional tagger with poutyne; the experiment directory is "./".
# NOTE(review): the captured output of this cell reports weights, optimizer and
# random state being loaded from ./checkpoint.* "starting at epoch 11", and
# train() returned an empty list — i.e. with epoch_number = 10 nothing was
# trained in that run; the model came from files already present in "./".
# Use a dedicated, fresh directory per configuration to avoid picking up
# stale checkpoints.
exp_bi_lstm = Experiment(
    "./",
    full_network_bi_lstm,
    device=device,
    optimizer=optimizer,
    loss_function=cross_entropy,
    batch_metrics=["acc"],
)
exp_bi_lstm.train(train_loader, valid_generator=valid_loader, epochs=epoch_number)

Loading weights from ./checkpoint.ckpt and starting at epoch 11.
Loading optimizer state from ./checkpoint.optim and starting at epoch 11.
Loading random states from ./checkpoint.randomstate and starting at epoch 11.
Restoring data from ./checkpoint_epoch_10.ckpt


[]

In [None]:
# Assuming the model is named "full_network_bi_lstm"
# Saves only the state_dict (the weights) to model.pth in the working directory.
# NOTE(review): the reload cell further down reads '/content/complete_model.pt'
# and uses the loaded object directly as a module; no visible cell writes that file.
torch.save(full_network_bi_lstm.state_dict(), 'model.pth')


In [None]:
# exp.test(test_loader)
# NOTE(review): test_data is read from the same pickle as valid_data, and the
# captured test_loss/test_acc equal val_loss/val_acc — this is not an
# independent test score.
exp_bi_lstm.test(test_loader)

Found best checkpoint at epoch: 10
lr: 0.1, loss: 2.11886, acc: 18.0592, val_loss: 2.11161, val_acc: 18.0724
Loading checkpoint ./checkpoint_epoch_10.ckpt
Running test
[35mTest steps: [36m12 [32m20.11s [35mtest_loss:[94m 2.111607[35m test_acc:[94m 18.072392[0m                                               


{'time': 20.114282491999802,
 'test_loss': 2.111607314809516,
 'test_acc': 18.07239246236137}

In [None]:
# Tag a single address with the freshly trained bidirectional model.
full_network_bi_lstm.to(device)
full_network_bi_lstm.eval()
res = []
# Tag name -> class index (same mapping as DatasetBucket.tags_set).
tags_set = {
    'flat_apartment_number' : 0,
    'society_name' : 1,
    # 'building no': 2,
    'street': 2,
    'landmark': 3,
    'sub_locality' : 4,
    'area_locality_name' : 5,
    'city_town': 6,
    'pincode': 7,
    'unknown': 8
        }
# Reverse mapping built once, instead of a linear search per token.
idx_to_tag = {idx: tag for tag, idx in tags_set.items()}

test_sent ='b4 purandhar housing society uruli kanchan pune haveli 412202'
test_sent_vec = embedding_vectorizer(test_sent)
# Shape (1, number_of_words, embedding_size): a batch holding one address.
test_sent_tensor = torch.tensor([test_sent_vec], dtype=torch.float32).to(device)
test_sent_tensor_len = torch.tensor([test_sent_tensor.size()[1]], dtype=torch.long).to(device)


with torch.no_grad():
    # Convert the input tensor to a packed sequence (lengths must live on the CPU)
    test_sent_res = pack_padded_sequence(test_sent_tensor, test_sent_tensor_len.to('cpu'), batch_first=True, enforce_sorted=False)

    test_sent_res = full_network_bi_lstm(test_sent_res)

    # The network returns (batch, num_tags, seq_len), so one sample is
    # (num_tags, seq_len). The best tag of each word is the argmax over the
    # tag axis (dim=0). The previous dim=1 reduced over the words and produced
    # one "prediction" per tag instead of one per word.
    out = test_sent_res.cpu()[0]
    out = torch.argmax(out, dim=0)

    for c in out:
        class_idx = c.item()
        # An index without a label is reported explicitly rather than raising.
        res.append(idx_to_tag.get(class_idx, f"<unmapped:{class_idx}>"))

    print(f'Predicted: {res}')


Predicted: ['flat_apartment_number', 'landmark', 'street', 'sub_locality', 'unknown', 'unknown', 'unknown', 'unknown', 'city_town', 'unknown']


In [None]:
import torch
from torch.nn.utils.rnn import pack_padded_sequence
# from my_model import MyModel

# Rebinds the notebook-wide `device`, falling back to the CPU when CUDA is unavailable.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the saved model
# NOTE(review): this loads a whole pickled module, not the state_dict written
# to model.pth earlier. Unpickling presumably requires the RecurrentNet class
# to be defined in this session, and torch.load on a pickle can execute
# arbitrary code — only load trusted files.
model = torch.load('/content/complete_model.pt', map_location=device)
model.eval()

RecurrentNet(
  (lstm_network): LSTM(300, 300, num_layers=2, batch_first=True, bidirectional=True)
  (fully_connected_network): Linear(in_features=600, out_features=9, bias=True)
)

In [None]:
def unpack_packed_sequence(packed_sequence):
    """
    Return the ``data`` attribute of ``packed_sequence`` unchanged.

    No un-padding or reordering is performed; the object only needs to expose
    a ``data`` attribute.
    """
    underlying_data = packed_sequence.data
    return underlying_data

In [None]:
import numpy as np
# Tag a single address with the model reloaded from disk.
model.to(device)
model.eval()
res = []
# Tag name -> class index (same mapping as DatasetBucket.tags_set).
tags_set = {
    'flat_apartment_number' : 0,
    'society_name' : 1,
    # 'building no': 2,
    'street': 2,
    'landmark': 3,
    'sub_locality' : 4,
    'area_locality_name' : 5,
    'city_town': 6,
    'pincode': 7,
    'unknown': 8
        }
# Reverse mapping built once, instead of a linear search per token.
idx_to_tag = {idx: tag for tag, idx in tags_set.items()}

test_sent ='G-909 Rohan Garima SB Road Shivajinagar Pune 411016'
test_sent_vec = embedding_vectorizer(test_sent)
# Shape (1, number_of_words, embedding_size): a batch holding one address.
test_sent_tensor = torch.tensor([test_sent_vec], dtype=torch.float32).to(device)
test_sent_tensor_len = torch.tensor([test_sent_tensor.size()[1]], dtype=torch.long).to(device)

with torch.no_grad():
    # Convert the input tensor to a packed sequence (lengths must live on the CPU)
    test_sent_res = pack_padded_sequence(test_sent_tensor, test_sent_tensor_len.to('cpu'), batch_first=True, enforce_sorted=False)

    # Pass the packed sequence through the model. The result is already a
    # dense tensor of shape (batch, num_tags, seq_len), so there is nothing to
    # unpack afterwards.
    test_sent_res = model(test_sent_res)

    # Get the original lengths of the input sequences
    lengths = test_sent_tensor_len.cpu().numpy()

    # Iterate over the batch and exclude padded tokens
    for i in range(test_sent_res.size(0)):
        # Keep every tag row and only the real (non-padded) time steps. The
        # previous slice cut the tag axis by the sequence length and took the
        # argmax over the words, yielding one value per tag, not per word.
        sequence = test_sent_res[i, :, :lengths[i]]
        predicted_tags = torch.argmax(sequence, dim=0)

        # Convert the predicted indices to corresponding tags; an index
        # without a label is reported explicitly rather than raising.
        res = [idx_to_tag.get(c.item(), f"<unmapped:{c.item()}>") for c in predicted_tags]

        print(f'Predicted: {res}')


Predicted: ['flat_apartment_number', 'society_name', 'sub_locality', 'landmark', 'sub_locality', 'sub_locality', 'city_town', 'pincode']
