<a href="https://colab.research.google.com/github/Mayumor/mayuri_yewle_nlptasks/blob/main/rnn_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# rnn_assignment.py

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# --- Classification Task ---
# Trains a small SimpleRNN text classifier on (text, label) rows read from
# classification_data.csv and prints its accuracy on a held-out 20% split.

# Load dataset
df = pd.read_csv("classification_data.csv")

# Check the column names in your DataFrame
print(df.columns)

# When every row of the CSV is quoted as a whole, pandas yields one column
# literally named 'text,label'. Split it from the RIGHT: the label is the
# last field, so commas that occur inside the text stay in 'text' instead of
# leaking into the label. Skip the step if the columns already exist.
if 'text,label' in df.columns:
    parts = df['text,label'].str.rsplit(pat=',', n=1, expand=True)
    # Strip padding so that e.g. " physics" and "physics" become one class.
    df['text'] = parts[0].str.strip()
    df['label'] = parts[1].str.strip()

# Tokenize: map each word to an integer id, then right-pad every sequence
# with zeros up to the length of the longest one.
tokenizer_cls = Tokenizer()
tokenizer_cls.fit_on_texts(df['text'])
sequences = tokenizer_cls.texts_to_sequences(df['text'])
word_index = tokenizer_cls.word_index
X = pad_sequences(sequences, padding='post')

# Encode labels: class names -> integer ids -> one-hot rows
le = LabelEncoder()
y = le.fit_transform(df['label'])
y = to_categorical(y)

# Train/Test Split (seeded so the reported accuracy is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build RNN model for classification.
# `input_length` is intentionally not passed: Keras 3 deprecates it and the
# layer infers the sequence length from the data on the first call.
model_cls = Sequential([
    # +1 because word ids start at 1; id 0 is the padding value
    Embedding(input_dim=len(word_index) + 1, output_dim=32),
    SimpleRNN(64),
    Dense(32, activation='relu'),
    Dense(y.shape[1], activation='softmax')  # one unit per class
])

model_cls.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cls.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

# Evaluate
loss, acc = model_cls.evaluate(X_test, y_test)
print("Classification Accuracy:", acc)

# --- Next Word Generation Task ---
# Trains a SimpleRNN language model that predicts the next word from the
# SEQ_LEN words that precede it in science_corpus.txt.

# Number of context tokens the model sees for each prediction.
SEQ_LEN = 10

# Load text corpus. The encoding is explicit so that decoding does not
# depend on the platform's default locale.
with open("science_corpus.txt", "r", encoding="utf-8") as f:
    text = f.read().lower()

tokenizer_gen = Tokenizer()
tokenizer_gen.fit_on_texts([text])
# +1 because word ids start at 1; id 0 is the padding value
total_words = len(tokenizer_gen.word_index) + 1

# Generate input sequences: every window of SEQ_LEN + 1 consecutive tokens,
# where the first SEQ_LEN are the input and the last one is the target.
tokens = tokenizer_gen.texts_to_sequences([text])[0]
if len(tokens) <= SEQ_LEN:
    # Without this check the array below would be empty and the 2-D slicing
    # would fail with an unhelpful IndexError.
    raise ValueError(
        f"Corpus has only {len(tokens)} tokens; more than {SEQ_LEN} are "
        "needed to build a training window."
    )

input_sequences = np.array(
    [tokens[i - SEQ_LEN:i + 1] for i in range(SEQ_LEN, len(tokens))]
)
X_gen, y_gen = input_sequences[:, :-1], input_sequences[:, -1]
y_gen = to_categorical(y_gen, num_classes=total_words)

# Build generation model.
# `input_length` is intentionally not passed: Keras 3 deprecates it and the
# layer infers the sequence length from the data on the first call.
model_gen = Sequential([
    Embedding(input_dim=total_words, output_dim=32),
    SimpleRNN(128),
    Dense(total_words, activation='softmax')  # distribution over the vocabulary
])

model_gen.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_gen.fit(X_gen, y_gen, epochs=30, verbose=1)

# Text generation function
def generate_text(seed_text, next_words, tokenizer, model, max_len=10):
    """Extend ``seed_text`` by greedily predicting one word at a time.

    On every step the current text is tokenized, left-padded (or cut down)
    to ``max_len`` ids with ``pad_sequences``, and passed to
    ``model.predict``; the word whose id has the highest score is appended.

    Args:
        seed_text: Starting text.
        next_words: Maximum number of words to append.
        tokenizer: Object providing ``texts_to_sequences`` and an
            ``index_word`` mapping from integer id to word.
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            scores per input row.
        max_len: Length the id sequence is padded/truncated to before
            prediction; should match the length the model was trained on.

    Returns:
        ``seed_text`` followed by the predicted words, space-separated.
        Fewer than ``next_words`` words are appended if the model predicts
        an id that has no word.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_len, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        # One input row -> take the arg-max of that row as a plain int.
        next_index = int(np.argmax(predicted, axis=-1)[0])
        next_word = tokenizer.index_word.get(next_index, '')
        if not next_word:
            # The id has no word (e.g. padding id 0). Appending nothing would
            # leave the tokenized input unchanged, so every remaining step
            # would repeat this prediction and only add trailing spaces.
            break
        seed_text += " " + next_word
    return seed_text

# Demo: grow a seed sentence by up to 20 predicted words and show the result
start = "cells divide through mitosis and meiosis"
generated = generate_text(
    seed_text=start,
    next_words=20,
    tokenizer=tokenizer_gen,
    model=model_gen,
)
print("\nGenerated Text:\n", generated)


Index(['text,label'], dtype='object')
Epoch 1/10




1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step - accuracy: 0.3333 - loss: 1.3676 - val_accuracy: 0.0000e+00 - val_loss: 1.4588
Epoch 2/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 147ms/step - accuracy: 0.5000 - loss: 1.3177 - val_accuracy: 0.0000e+00 - val_loss: 1.4693
Epoch 3/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 166ms/step - accuracy: 0.8333 - loss: 1.2732 - val_accuracy: 0.0000e+00 - val_loss: 1.4813
Epoch 4/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 319ms/step - accuracy: 0.8333 - loss: 1.2307 - val_accuracy: 0.0000e+00 - val_loss: 1.4907
Epoch 5/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 269ms/step - accuracy: 1.0000 - loss: 1.1866 - val_accuracy: 0.0000e+00 - val_loss: 1.4985
Epoch 6/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 328ms/step - accuracy: 1.0000 - loss: 1.1442 - val_accuracy: 0.0000e+00 - val_loss: 1.5072
Epoch 7/10
1/1 ━━

In [None]:
# Install the third-party packages imported by the code cell above
# (tensorflow, pandas, and scikit-learn, which provides `sklearn`).
pip install tensorflow pandas scikit-learn
