<a href="https://colab.research.google.com/github/MojTabaa4/POS-Tagging-and-Named-Entity-Recognition-with-RNNs/blob/main/POS_NER_with_RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow

In [10]:
import random
import time
from collections import defaultdict
from typing import List, Tuple

import keras
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from gensim.models import KeyedVectors, Word2Vec
from keras.layers import (GRU, LSTM, RNN, Bidirectional, Dense, Dropout,
                          Embedding, Input, Masking, SimpleRNN,
                          SpatialDropout1D, TimeDistributed)
from keras.models import Model, Sequential
from keras.preprocessing.text import Tokenizer
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize
from nltk.tree import Tree
from sklearn.model_selection import train_test_split
from tensorflow.keras import *
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
class PTBPosLoader:
    """
    Loader for the tagged sentences of NLTK's `treebank` corpus.

    On construction the corpus is read into memory and partitioned into
    train / validation / test subsets. Helper methods expose the subsets and
    the vocabulary / tagset observed in the training subset.
    """

    def __init__(self, use_universal_tagset: bool = True, test_size: float = 0.1, val_size: float = 0.1):
        """
        Reads the corpus and immediately splits it.

        Args:
            use_universal_tagset (bool): When True, sentences are requested with
                tagset='universal'; otherwise the corpus' default tags are used.
            test_size (float): Share of the whole corpus held out for testing.
            val_size (float): Share of what remains after the test split that
                is held out for validation.
        """
        tagged_sentences = (
            treebank.tagged_sents(tagset='universal')
            if use_universal_tagset
            else treebank.tagged_sents()
        )
        # Materialise the corpus view so it can be split and re-used freely.
        self.ptb = list(tagged_sentences)

        self.test_size = test_size
        self.val_size = val_size
        self._split_train_val_test_sets()

    def _split_train_val_test_sets(self) -> None:
        """
        Populates `train_set`, `val_set` and `test_set` from `ptb`.

        The test subset is carved out first; the validation subset is then
        carved out of the remainder. Both splits use the same fixed seed.
        """
        seed = 100

        # Stage 1: corpus -> (train + validation, test).
        self.train_set, self.test_set = train_test_split(
            self.ptb, test_size=self.test_size, random_state=seed
        )
        # Stage 2: (train + validation) -> (train, validation).
        self.train_set, self.val_set = train_test_split(
            self.train_set, test_size=self.val_size, random_state=seed
        )

    @staticmethod
    def _flatten(sentences):
        """
        Concatenates the items of every sentence into one flat list, in order.
        """
        flat = []
        for sentence in sentences:
            flat.extend(sentence)
        return flat

    def _extract_all_word_tag_pairs(self) -> None:
        """
        Stores the flattened word-tag pairs of each subset on the instance as
        `train_word_tag_pairs`, `val_word_tag_pairs` and `test_word_tag_pairs`.
        """
        self.train_word_tag_pairs = self._flatten(self.train_set)
        self.val_word_tag_pairs = self._flatten(self.val_set)
        self.test_word_tag_pairs = self._flatten(self.test_set)

    def get_vocab_and_tagset(self) -> Tuple[set, List[str]]:
        """
        Collects the distinct words and distinct tags seen in the training subset.

        The flattened word-tag pair lists of all three subsets are refreshed as
        a side effect of this call.

        Returns:
            A `(vocab, tagset)` tuple: `vocab` is a set of words and `tagset`
            is a sorted list of tags, both taken from the training pairs only.
        """
        self._extract_all_word_tag_pairs()

        words = set()
        tags = set()
        for pair in self.train_word_tag_pairs:
            words.add(pair[0])
            tags.add(pair[1])

        return words, sorted(tags)

    def get_train_val_test_sets(self) -> Tuple[List[List[Tuple[str, str]]], List[List[Tuple[str, str]]], List[List[Tuple[str, str]]]]:
        """
        Gives access to the three subsets produced by the split.

        Returns:
            `(train_set, val_set, test_set)`; each is a list of sentences and
            each sentence is a list of (word, tag) tuples.
        """
        subsets = (self.train_set, self.val_set, self.test_set)
        return subsets