<a href="https://colab.research.google.com/github/Ibraheem101/mlops/blob/main/foundations/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RNN

In [1]:
import os
import re
import json
import math
import nltk
import torch
import gensim
import random
import urllib
import itertools
import collections
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [2]:
# Global random seed; passed to set_seeds() so runs are repeatable.
SEED = 1234

In [3]:
def set_seeds(seed=1234):
    """Seed every random number generator this notebook relies on.

    Args:
        seed (int): value handed to each generator's seeding function.
    """
    # Order: Python stdlib, NumPy, PyTorch CPU, PyTorch current GPU, all GPUs.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [4]:
# Seed the stdlib, NumPy and PyTorch generators before any random work happens.
set_seeds(SEED)

In [5]:
# Set `cuda` to False to force CPU execution even when a GPU is available.
cuda = True
device = torch.device('cuda' if (torch.cuda.is_available() and cuda) else 'cpu')

# CPU float tensors are the default; switched to CUDA below when a GPU was selected.
# NOTE: torch.set_default_tensor_type is deprecated as of PyTorch 2.1 in favour of
# torch.set_default_dtype / torch.set_default_device.
torch.set_default_tensor_type('torch.FloatTensor')

# Test the *selected* device. `torch.device` is the class itself, so comparing it
# to the string 'cuda' is always False and the CUDA default was never applied.
if device.type == 'cuda':
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
print(device)

cpu


### Load Data

In [6]:
# Read the news CSV from GitHub, shuffle its rows, and renumber the index.
url = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/news.csv"
df = (
    pd.read_csv(url, header=0)   # first row holds the column names
    .sample(frac=1)              # frac=1 returns every row in random order
    .reset_index(drop=True)      # discard the pre-shuffle index
)
df.head()

Unnamed: 0,title,category
0,Sharon Accepts Plan to Reduce Gaza Army Operat...,World
1,Internet Key Battleground in Wildlife Crime Fight,Sci/Tech
2,July Durable Good Orders Rise 1.7 Percent,Business
3,Growing Signs of a Slowing on Wall Street,Business
4,The New Faces of Reality TV,World


### Preprocessing

In [7]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [8]:
# Download the NLTK stopword corpus, then load its English word list.
nltk.download("stopwords")
STOPWORDS = stopwords.words("english")
print(STOPWORDS[:5])

# Porter stemmer instance.
porter = PorterStemmer()

['i', 'me', 'my', 'myself', 'we']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
def preprocess(text, stopwords=STOPWORDS):
    """Clean a piece of text: lowercase it, drop stopwords, strip punctuation.

    Steps, in order: lowercase, remove stopwords, remove parenthesized spans,
    replace every run of non-alphanumeric characters with one space, trim.

    Args:
        text (str): raw text to clean.
        stopwords (iterable of str): words to remove. They are matched as
            whole words against the already-lowercased text, so entries
            containing uppercase letters never match. Defaults to STOPWORDS.
            An empty collection removes nothing.

    Returns:
        str: the cleaned text.
    """
    # Lowercase
    text = text.lower()

    # Remove stopwords. Each word is escaped so regex metacharacters in a
    # stopword are matched literally. The step is skipped entirely when there
    # is nothing to remove: an empty alternation `\b()\b\s*` would match at
    # every word boundary and delete the whitespace between words.
    escaped = [re.escape(word) for word in stopwords if word]
    if escaped:
        pattern = re.compile(r"\b(" + r"|".join(escaped) + r")\b\s*")
        text = pattern.sub("", text)

    # Remove words in parentheses
    text = re.sub(r"\([^)]*\)", "", text)

    # Spacing and filters
    text = re.sub(r"([-;.,!?<=>])", r" \1 ", text)  # pad punctuation with spaces
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()

    return text

In [10]:
# Quick check of preprocess() on a single headline-style sentence.
text = "Great week for the NYSE!"
preprocess(text)

'great week nyse'

In [11]:
# Peek at the raw titles before any cleaning.
df["title"]

0         Sharon Accepts Plan to Reduce Gaza Army Operat...
1         Internet Key Battleground in Wildlife Crime Fight
2                 July Durable Good Orders Rise 1.7 Percent
3                 Growing Signs of a Slowing on Wall Street
4                               The New Faces of Reality TV
                                ...                        
119995      Bush, Blair See Hope for Palestinian State (AP)
119996      Ex-Soldiers Vow to Bring Order to Haiti Capital
119997    Musharraf says U.S. must address root of terro...
119998           Nuclear materials  #39;vanish #39; in Iraq
119999    In Brief: Bowstreet unveils pre-packaged porta...
Name: title, Length: 120000, dtype: object

In [12]:
# Clean the titles on a copy so the raw text in `df` stays available.
preprocessed_df = df.copy()
preprocessed_df["title"] = preprocessed_df["title"].apply(preprocess)

# Show the first title before and after preprocessing.
print(f"{df.title.values[0]}\n\n{preprocessed_df.title.values[0]}")

Sharon Accepts Plan to Reduce Gaza Army Operation, Haaretz Says

sharon accepts plan reduce gaza army operation haaretz says


### Split data