# Practical 3- NLP
> Monish Gosar (J025)

# Part 1

## Train a model using GloVE embeddings with Vanilla RNNs

In [79]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split

In [59]:
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [48]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [49]:
def preprocessing(text):
    text = re.sub(r'<.*?>', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Join tokens back into text
    text = ' '.join(tokens)
    
    return text
    

In [36]:
X = []
sentences = list(df['review'])
for sen in sentences:
    X.append(preprocessing(sen))

In [41]:
X[3]

'basically theres family little boy jake thinks theres zombie closet parents fighting timethis movie slower soap opera suddenly jake decides become rambo kill zombieok first youre going make film must decide thriller drama drama movie watchable parents divorcing arguing like real life jake closet totally ruins film expected see boogeyman similar movie instead watched drama meaningless thriller spots well playing parents descent dialogs shots jake ignore'

In [53]:
y = df['sentiment']

y = np.array(list(map(lambda x: 1 if x=="positive" else 0, y)))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [61]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [63]:
vocab_size = len(tokenizer.word_index) + 1

maxlen = 256

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [65]:
import zipfile
import os

# Unzip GloVe embeddings
glove_zip_path = 'glove.6B.zip'  # Update this path
glove_dir = 'glove_embeddings'

def unzip_glove(zip_path, extract_to):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Extracted GloVe embeddings to {extract_to}")

# Create directory if it doesn't exist
if not os.path.exists(glove_dir):
    os.makedirs(glove_dir)

# Unzip if the text file doesn't exist
glove_file = os.path.join(glove_dir, 'glove.6B.100d.txt')
if not os.path.exists(glove_file):
    unzip_glove(glove_zip_path, glove_dir)

Extracted GloVe embeddings to glove_embeddings


In [67]:
def load_glove_embeddings(filepath, word_index, embed_dim):
    embeddings_index = {}
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            coefs = np.fromstring(coefs, "f", sep=" ")
            embeddings_index[word] = coefs

    embedding_matrix = np.zeros((vocab_size, embed_dim))
    for word, i in word_index.items():
        if i < vocab_size:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    
    return embedding_matrix

# Load GloVe embeddings
embedding_dim = 100
embedding_matrix = load_glove_embeddings(glove_file, tokenizer.word_index, embedding_dim)

In [71]:
max_length=256

In [73]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
    SimpleRNN(64),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=128, epochs=10, validation_split=0.2)



Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 39ms/step - accuracy: 0.5068 - loss: 0.6968 - val_accuracy: 0.5055 - val_loss: 0.6934
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 56ms/step - accuracy: 0.5137 - loss: 0.6921 - val_accuracy: 0.5156 - val_loss: 0.6925
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 77ms/step - accuracy: 0.5069 - loss: 0.6912 - val_accuracy: 0.5063 - val_loss: 0.6932
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 72ms/step - accuracy: 0.5072 - loss: 0.6906 - val_accuracy: 0.4979 - val_loss: 0.6943
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 77ms/step - accuracy: 0.5170 - loss: 0.6904 - val_accuracy: 0.4966 - val_loss: 0.6941
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 87ms/step - accuracy: 0.5063 - loss: 0.6909 - val_accuracy: 0.5045 - val_loss: 0.6927
Epoch 7/10
[1m2

<keras.src.callbacks.history.History at 0x1f6350f8450>

In [75]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_accuracy:.4f}")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.5057 - loss: 0.6941
Test accuracy: 0.5060


## Train a model using GloVE embeddings with LSTMs

In [81]:
# Build the LSTM model
model = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, batch_size=128, epochs=10, validation_split=0.2, verbose=1)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test accuracy: {test_accuracy:.4f}")

Epoch 1/10




[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 423ms/step - accuracy: 0.5230 - loss: 0.6880 - val_accuracy: 0.5000 - val_loss: 0.6939
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 412ms/step - accuracy: 0.5066 - loss: 0.6934 - val_accuracy: 0.5027 - val_loss: 0.6923
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 435ms/step - accuracy: 0.5047 - loss: 0.6918 - val_accuracy: 0.5213 - val_loss: 0.6864
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 425ms/step - accuracy: 0.5106 - loss: 0.6917 - val_accuracy: 0.5027 - val_loss: 0.7114
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 421ms/step - accuracy: 0.5062 - loss: 0.6956 - val_accuracy: 0.5151 - val_loss: 0.6856
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 418ms/step - accuracy: 0.5142 - loss: 0.6902 - val_accuracy: 0.5052 - val_loss: 0.6927
Epoch 7/10
[1m

KeyboardInterrupt: 

## Repeat [1] and [2] with on-the-fly embeddings using torch

# PART 2

## Build a date parser using basic text processing and rules. (No ML models)

- Given a piece of text, extract the day, month and year info and present it in DD/MM/
YYYY format.
- Example: "l went to London on 21st June, 2024" 21/06/2024


In [97]:
import re
from datetime import datetime

def parse_date(text):
    # Dictionary to convert month names to numbers
    month_dict = {
        'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04',
        'may': '05', 'jun': '06', 'jul': '07', 'aug': '08',
        'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12'
    }

    # Regular expression pattern to match various date formats
    pattern = r'\b(\d{1,2})(?:st|nd|rd|th)?\s+(?:of\s+)?([a-zA-Z]+)(?:\s*,?\s*|\s+)(\d{4}|\d{2})\b|\b([a-zA-Z]+)\s+(\d{1,2})(?:st|nd|rd|th)?(?:\s*,?\s*|\s+)(\d{4}|\d{2})\b'

    # Find all matches in the text
    # Find all matches in the text
    matches = re.findall(pattern, text)
    print(f"All matches: {matches}")

    if not matches:
        return None

    for match in matches:
        print(f"Current match: {match}")
        if match[0]:  # Format: day month year
            day, month, year = match[:3]
            print(f"Day-Month-Year format: day={day}, month={month}, year={year}")
        else:  # Format: month day year
            month, day, year = match[3:]
            print(f"Month-Day-Year format: month={month}, day={day}, year={year}")

        # Normalize day and month
        day = day.zfill(2)
        month = month.lower()[:3]
        print(f"After normalization: day={day}, month={month}")

        if month not in month_dict:
            print(f"Invalid month: {month}")
            continue

        month = month_dict[month]
        print(f"Month number: {month}")

        # Handle two-digit years
        if len(year) == 2:
            current_year = datetime.now().year
            century = str(current_year)[:2]
            year = century + year
            print(f"Expanded year: {year}")

        return f"{day}/{month}/{year}"

    return None

In [99]:
text = "The meeting is scheduled for May 5th, 2023"
result = parse_date(text)
print(f"Input: {text}")
print(f"Output: {result}")

All matches: [('', '', '', 'May', '5', '2023')]
Current match: ('', '', '', 'May', '5', '2023')
Month-Day-Year format: month=May, day=5, year=2023
After normalization: day=05, month=may
Month number: 05
Input: The meeting is scheduled for May 5th, 2023
Output: 05/05/2023


In [95]:

# Test the function
test_texts = [
    "I went to London on 21st June, 2024",
    "The meeting is scheduled for May 5th, 2023",
    "Her birthday is on January 15, 95",
    "The event will take place on the 3rd of April, 2022",
    "We met on Dec 7 2021"
]

for text in test_texts:
    result = parse_date(text)
    print(f"Input: {text}")
    print(f"Output: {result}")
    print()

Input: I went to London on 21st June, 2024
Output: 21/06/2024

Input: The meeting is scheduled for May 5th, 2023
Output: 05/05/2023

Input: Her birthday is on January 15, 95
Output: 15/01/2095

Input: The event will take place on the 3rd of April, 2022
Output: 03/04/2022

Input: We met on Dec 7 2021
Output: 07/12/2021

