<a href="https://colab.research.google.com/github/Ibraheem101/mlops/blob/main/foundations/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RNN

In [1]:
import os
import re
import json
import math
import nltk
import torch
import gensim
import random
import urllib
import itertools
import collections
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [2]:
# Seed value passed to set_seeds() so that runs are repeatable.
SEED = 1234

In [3]:
def set_seeds(seed=1234):
    """Seed every random number generator this notebook relies on.

    Args:
        seed (int): value handed to Python's ``random``, NumPy and PyTorch
            (CPU plus the CUDA seeding calls). Defaults to 1234.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # Same call order as listed above; each seeder receives the same value.
    for seeder in seeders:
        seeder(seed)

In [4]:
# Seed the Python, NumPy and PyTorch RNGs before any stochastic step runs.
set_seeds(seed = SEED)

In [5]:
# Use the GPU when one is available and `cuda` is enabled; otherwise stay on CPU.
cuda = True
device = torch.device('cuda' if (torch.cuda.is_available() and cuda) else 'cpu')
torch.set_default_tensor_type('torch.FloatTensor')

# Check the *selected* device's type. The previous test compared the
# `torch.device` class itself with a string, which is always False, so the
# CUDA default tensor type was never applied even when a GPU was chosen.
if device.type == 'cuda':
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
print(device)

cpu


### Load Data

In [6]:
# Load the news-title dataset, then shuffle it and renumber the rows.
url = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/news.csv"
df = (
    pd.read_csv(url, header=0)   # load
    .sample(frac=1)              # shuffle (no explicit random_state is passed)
    .reset_index(drop=True)      # fresh 0..n-1 index after the shuffle
)
df.head()

Unnamed: 0,title,category
0,Sharon Accepts Plan to Reduce Gaza Army Operat...,World
1,Internet Key Battleground in Wildlife Crime Fight,Sci/Tech
2,July Durable Good Orders Rise 1.7 Percent,Business
3,Growing Signs of a Slowing on Wall Street,Business
4,The New Faces of Reality TV,World


### Preprocessing

In [7]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [8]:
# Fetch the NLTK stopword corpus, then load the English word list from it.
nltk.download("stopwords")
STOPWORDS = stopwords.words("english")

# Stemmer instance created alongside the stopword list.
porter = PorterStemmer()

print (STOPWORDS[:5])

['i', 'me', 'my', 'myself', 'we']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
def preprocess(text, stopwords = None):
    """Clean a raw title: lowercase it and strip stopwords and punctuation.

    Args:
        text (str): raw text to clean.
        stopwords (iterable of str, optional): words to drop. Each one is
            matched literally as a whole word against the lowercased text.
            When omitted, the module-level ``STOPWORDS`` list is used
            (looked up at call time).

    Returns:
        str: the lowercased text with stopwords, parenthesised spans and
        non-alphanumeric characters removed, single-spaced and stripped.
    """
    if stopwords is None:
        stopwords = STOPWORDS

    # Lowercase
    text = text.lower()

    # Remove stopwords. Words are escaped so regex metacharacters in them are
    # matched literally, and blank entries are skipped: an empty alternation
    # matches at every word boundary and its trailing `\s*` would then delete
    # the spaces between words.
    escaped = [re.escape(word) for word in stopwords if word]
    if escaped:
        pattern = re.compile(r"\b(" + r"|".join(escaped) + r")\b\s*")
        text = pattern.sub("", text)

    # Remove words in parenthesis
    text = re.sub(r"\([^)]*\)", "", text)

    # Spacing and filters
    text = re.sub(r"([-;.,!?<=>])", r" \1 ", text)  # pad punctuation with spaces
    text = re.sub("[^A-Za-z0-9]+", " ", text) # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()

    return text

In [10]:
# Sample: run preprocess() on a short title to show the cleaned result
text = "Great week for the NYSE!"
preprocess(text=text)

'great week nyse'

In [11]:
# Raw titles, before any preprocessing is applied
df.title

0         Sharon Accepts Plan to Reduce Gaza Army Operat...
1         Internet Key Battleground in Wildlife Crime Fight
2                 July Durable Good Orders Rise 1.7 Percent
3                 Growing Signs of a Slowing on Wall Street
4                               The New Faces of Reality TV
                                ...                        
119995      Bush, Blair See Hope for Palestinian State (AP)
119996      Ex-Soldiers Vow to Bring Order to Haiti Capital
119997    Musharraf says U.S. must address root of terro...
119998           Nuclear materials  #39;vanish #39; in Iraq
119999    In Brief: Bowstreet unveils pre-packaged porta...
Name: title, Length: 120000, dtype: object

In [12]:
# Clean every title on a copy so the raw frame `df` is left untouched.
preprocessed_df = df.copy()
preprocessed_df["title"] = preprocessed_df["title"].apply(preprocess)

# First title before and after cleaning.
print (f"{df.title.values[0]}\n\n{preprocessed_df.title.values[0]}")

Sharon Accepts Plan to Reduce Gaza Army Operation, Haaretz Says

sharon accepts plan reduce gaza army operation haaretz says


### Split data

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Fractions of the dataset intended for each split.
# NOTE(review): in this section only TRAIN_SIZE is passed to the split helper;
# VAL_SIZE and TEST_SIZE are not referenced by it — confirm they are used elsewhere.
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15

In [16]:
def train_val_test_split(X, y, train_size):
    """Split a dataset into stratified train, validation and test sets.

    Args:
        X: samples to split.
        y: labels aligned with ``X``; used to stratify both splits.
        train_size: size of the training split, forwarded unchanged to
            ``train_test_split`` (e.g. 0.7 for 70% of the samples).

    Returns:
        tuple: ``(X_train, X_val, X_test, y_train, y_val, y_test)``. The
        samples left over after the training split are divided evenly
        between validation and test.
    """
    # Honour the caller's `train_size`. The global TRAIN_SIZE was used here
    # before, so the argument was silently ignored.
    X_train, X_, y_train, y_ = train_test_split(X, y, train_size=train_size, stratify=y)
    # Halve the remainder into validation and test, again stratified by label.
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5, stratify=y_)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [17]:
# Data: cleaned titles are the inputs, categories are the targets.
X, y = preprocessed_df["title"].values, preprocessed_df["category"].values

In [18]:
# Create data splits
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, y, train_size=TRAIN_SIZE)

# Report the size of each split and one example pair, one line per item.
print ("\n".join([
    f"X_train: {X_train.shape}, y_train: {y_train.shape}",
    f"X_val: {X_val.shape}, y_val: {y_val.shape}",
    f"X_test: {X_test.shape}, y_test: {y_test.shape}",
    f"Sample point: {X_train[0]} → {y_train[0]}",
]))

X_train: (84000,), y_train: (84000,)
X_val: (18000,), y_val: (18000,)
X_test: (18000,), y_test: (18000,)
Sample point: china battles north korea nuclear talks → World


### Label encoding

In [19]:
from collections import OrderedDict

In [20]:
class LabelEncoder(object):
    """Transform categorical labels into integer indices and back.

    Attributes:
        mapping (dict): label -> index.
        reverse_mapping (dict): index -> label.
        classes (list): unique labels, ordered by index.
    """

    def __init__(self):
        self.mapping = {}
        self.reverse_mapping = {}
        self.classes = []

    def fit(self, data):
        """Learn the label/index mapping from ``data``.

        Indices follow the order in which labels first appear. Any state from
        an earlier ``fit`` is discarded, so re-fitting (for example when a
        notebook cell is re-run) never leaves duplicate or stale entries.

        Args:
            data (iterable): hashable labels.
        """
        # OrderedDict.fromkeys de-duplicates while keeping first-seen order.
        self.classes = list(OrderedDict.fromkeys(data))
        self.mapping = {label: index for index, label in enumerate(self.classes)}
        self.reverse_mapping = {index: label for label, index in self.mapping.items()}

    def encode(self, data):
        """Return the index of each label in ``data`` as a list.

        Raises:
            KeyError: if a label was not seen during ``fit``.
        """
        return [self.mapping[label] for label in data]

    def decode(self, data):
        """Return the label of each index in ``data`` as a list.

        Raises:
            KeyError: if an index is not part of the fitted mapping.
        """
        return [self.reverse_mapping[index] for index in data]

    def __len__(self):
        """Number of distinct labels known to the encoder."""
        return len(self.mapping)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def save(self, fp):
        """Write the label -> index mapping to the JSON file at path ``fp``."""
        # Use a separate name for the handle so the path argument is not shadowed.
        with open(fp, "w") as file:
            contents = {'mapping': self.mapping}
            json.dump(contents, file, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        """Build an encoder from a JSON file written by ``save``.

        Args:
            fp: path of the JSON file.

        Returns:
            LabelEncoder: encoder with ``mapping``, ``reverse_mapping`` and
            ``classes`` restored from the stored mapping.
        """
        with open(fp, "r") as file:
            contents = json.load(fp=file)
        encoder = cls()
        encoder.mapping = contents['mapping']
        encoder.reverse_mapping = {v: k for k, v in encoder.mapping.items()}
        # Order classes by index rather than by key order in the file.
        encoder.classes = sorted(encoder.mapping, key=encoder.mapping.get)
        return encoder

In [21]:
# Encode: fit the encoder on the training labels and record the class count
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
NUM_CLASSES = len(label_encoder)
# Display the fitted label -> index mapping
label_encoder.mapping

{'World': 0, 'Sports': 1, 'Business': 2, 'Sci/Tech': 3}

In [22]:
# Convert labels to tokens
print (f"y_train[0]: {y_train[0]}")

# Encode the three label arrays with the same fitted encoder.
y_train_enc, y_val_enc, y_test_enc = (
    label_encoder.encode(labels) for labels in (y_train, y_val, y_test)
)
print (f"y_train[0]: {y_train_enc[0]}")

y_train[0]: World
y_train[0]: 0


In [23]:
# Calculate class weights: total_samples / (num_classes * samples_in_class)
total_samples = len(y_train_enc)
num_classes = len(np.unique(y_train_enc))
class_samples = np.bincount(y_train_enc)  # per-class counts, indexed by encoded label
class_weights = {
    i: total_samples / (num_classes * class_samples[i])
    for i in range(num_classes)
}

print(f"Class weights: {class_weights}")

Class weights: {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0}
