In [None]:
!pip install -q nltk scikit-learn tensorflow

In [None]:
# Imports and NLTK downloads
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Three Paragraphs
paragraphs = [
    # Paragraph 1: self-introduction
    ("Hello — I’m Gagan Sharma, a computer science student who is curious about how machines "
     "learn from data and how intelligent systems can make everyday tasks easier. I like breaking "
     "big problems into smaller parts and creating practical solutions using Python, machine "
     "learning tools, and cloud technologies. In college, I have studied data structures, "
     "algorithms, and machine learning, and I have completed projects involving image processing "
     "and natural language processing. Outside academics, I enjoy reading tech blogs, exploring "
     "open-source projects, and experimenting with presentation and design ideas. I value teamwork "
     "and clear communication, and I enjoy working with people who bring different perspectives. "
     "I learn best by building things, testing them, and improving them step by step. I am "
     "organized, motivated, and always ready for new challenges, and I hope to use my technical "
     "skills to solve real-world problems in a meaningful and effective way."),

    # Paragraph 2: future goals
    ("In the coming years, I want to build a strong career in machine learning and product "
     "development, where I can turn ideas into useful tools for people. My first goal is to work "
     "in applied ML roles, such as an ML engineer or data scientist, so I can gain practical "
     "experience in model building, deployment, and evaluation. I also want to learn more about "
     "cloud platforms, MLOps practices, and designing systems that scale well. Later, I hope to "
     "move into roles like ML product manager or research engineer, where I can help decide what "
     "problems to solve and how to build effective solutions. I am also interested in natural "
     "language processing and computer vision, and I want to contribute to projects that make AI "
     "more accurate, fair, and reliable. Ultimately, I want to build technologies that are useful, "
     "responsible, and meaningful for society."),

    # Paragraph 3: experience at LNMIIT
    ("My experience at LNMIIT has been a valuable mix of learning, teamwork, and practical "
     "exposure. The courses gave me a strong foundation in algorithms, systems, and programming, "
     "while lab assignments helped me apply these ideas through hands-on tasks. Coding events and "
     "mini-projects taught me time management, communication skills, and how to work well in "
     "teams. Faculty guidance and discussions with peers helped me understand different approaches "
     "to problem-solving and improved my debugging skills. Outside academics, I joined tech clubs "
     "and participated in hackathons, where I learned rapid prototyping, creativity, and how to "
     "present ideas clearly. These activities helped me turn concepts into working projects. "
     "Overall, LNMIIT has provided a supportive environment, helpful mentors, and opportunities "
     "that have helped me grow both technically and personally, preparing me well for internships "
     "and future goals.")
]


# Preprocessing functions
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove non-alphanumeric (keep spaces)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    # Collapse multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess_text(text, do_lemmatize=True):
    text = clean_text(text)
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha()]  # keep alphabetic tokens only
    tokens = [t for t in tokens if t not in stop_words]
    if do_lemmatize:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

# Preprocess paragraphs
preprocessed = [preprocess_text(p) for p in paragraphs]
for i, (orig, proc) in enumerate(zip(paragraphs, preprocessed)):
    print(f"\n--- Paragraph {i+1} (original snippet) ---\n{orig[:200]}...\n")
    print(f"--- Paragraph {i+1} (preprocessed) ---\n{proc}\n")

# Text Representation:
# Bag-of-Words
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(preprocessed)
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=bow_vectorizer.get_feature_names_out())
print("\nBag-of-Words feature matrix (rows=paragraphs):")
display(bow_df)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print("\nTF-IDF feature matrix (rows=paragraphs):")
display(tfidf_df.round(3))

# Prepare data for RNN
# Strategy: split paragraphs into sentences to create more samples for training.
sentences = []
labels = []  # 0=intro, 1=goals, 2=experience
for idx, para in enumerate(paragraphs):
    sents = sent_tokenize(para)
    for s in sents:
        sentences.append(s)
        labels.append(idx)

print(f"\nTotal sentence samples: {len(sentences)}")
for i, (s, l) in enumerate(zip(sentences, labels)):
    print(f"[{i}] label={l} text={s[:80]}...")

# Preprocess sentences
sentences_proc = [preprocess_text(s) for s in sentences]

# Tokenize and pad sequences with Keras Tokenizer
vocab_size = 3000
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences_proc)
sequences = tokenizer.texts_to_sequences(sentences_proc)
maxlen = max(len(seq) for seq in sequences)
padded = pad_sequences(sequences, padding='post', maxlen=maxlen)

# Convert labels to categorical
labels_array = np.array(labels)
num_classes = len(set(labels))
labels_cat = tf.keras.utils.to_categorical(labels_array, num_classes=num_classes)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(padded, labels_cat, test_size=0.25, random_state=42, stratify=labels_array)

# Simple RNN (Embedding + Bidirectional LSTM)
embedding_dim = 50

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#model.summary()

# Early stopping callback
early_stop = EarlyStopping(
    monitor='val_loss',   # stop when validation loss stops improving
    patience=3,           # wait 3 epochs before stopping
    restore_best_weights=True  # revert to best model weights
)

# Train the model
history = model.fit(
    X_train,
    y_train,
    epochs=25,
    batch_size=4,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate
pred_probs = model.predict(X_test)
preds = np.argmax(pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

print("\nTest accuracy:", accuracy_score(y_true, preds))
print("\nClassification report:")
print(classification_report(y_true, preds, target_names=['intro', 'goals', 'experience']))

# Example: Predict class for a new sentence
examples = [
    "I enjoy building machine learning models and deploying them to production.",
    "At LNMIIT I participated in hackathons and learned teamwork.",
    "My long term aim is to work on scalable ML systems."
]
examples_proc = [preprocess_text(e) for e in examples]
ex_seq = tokenizer.texts_to_sequences(examples_proc)
ex_pad = pad_sequences(ex_seq, maxlen=maxlen, padding='post')
ex_preds = model.predict(ex_pad)
for ex, p in zip(examples, ex_preds):
    print(f"\nText: {ex}\nPredicted_probs: {p}\nPredicted_label: {np.argmax(p)}")


--- Paragraph 1 (original snippet) ---
Hello — I’m Gagan Sharma, a computer science student who is curious about how machines learn from data and how intelligent systems can make everyday tasks easier. I like breaking big problems into sma...

--- Paragraph 1 (preprocessed) ---
hello gagan sharma computer science student curious machine learn data intelligent system make everyday task easier like breaking big problem smaller part creating practical solution using python machine learning tool cloud technology college studied data structure algorithm machine learning completed project involving image processing natural language processing outside academic enjoy reading tech blog exploring open source project experimenting presentation design idea value teamwork clear communication enjoy working people bring different perspective learn best building thing testing improving step step organized motivated always ready new challenge hope use technical skill solve real world problem meaningfu

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,academic,accurate,activity,ai,algorithm,also,always,applied,apply,approach,...,valuable,value,vision,want,way,well,work,working,world,year
0,1,0,0,0,1,0,1,0,0,0,...,0,1,0,0,1,0,0,1,1,0
1,0,1,0,1,0,2,0,1,0,0,...,0,0,1,4,0,1,1,0,0,1
2,1,0,1,0,1,0,0,0,1,1,...,1,0,0,0,0,2,1,1,0,0



TF-IDF feature matrix (rows=paragraphs):


Unnamed: 0,academic,accurate,activity,ai,algorithm,also,always,applied,apply,approach,...,valuable,value,vision,want,way,well,work,working,world,year
0,0.081,0.0,0.0,0.0,0.081,0.0,0.106,0.0,0.0,0.0,...,0.0,0.106,0.0,0.0,0.106,0.0,0.0,0.081,0.106,0.0
1,0.0,0.099,0.0,0.099,0.0,0.197,0.0,0.099,0.0,0.0,...,0.0,0.0,0.099,0.395,0.0,0.075,0.075,0.0,0.0,0.099
2,0.081,0.0,0.106,0.0,0.081,0.0,0.0,0.0,0.106,0.106,...,0.106,0.0,0.0,0.0,0.0,0.161,0.081,0.081,0.0,0.0





Total sentence samples: 20
[0] label=0 text=Hello — I’m Gagan Sharma, a computer science student who is curious about how ma...
[1] label=0 text=I like breaking big problems into smaller parts and creating practical solutions...
[2] label=0 text=In college, I have studied data structures, algorithms, and machine learning, an...
[3] label=0 text=Outside academics, I enjoy reading tech blogs, exploring open-source projects, a...
[4] label=0 text=I value teamwork and clear communication, and I enjoy working with people who br...
[5] label=0 text=I learn best by building things, testing them, and improving them step by step....
[6] label=0 text=I am organized, motivated, and always ready for new challenges, and I hope to us...
[7] label=1 text=In the coming years, I want to build a strong career in machine learning and pro...
[8] label=1 text=My first goal is to work in applied ML roles, such as an ML engineer or data sci...
[9] label=1 text=I also want to learn more about cloud platforms

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 354ms/step

Text: I enjoy building machine learning models and deploying them to production.
Predicted_probs: [0.3341245  0.33253875 0.33333674]
Predicted_label: 0

Text: At LNMIIT I participated in hackathons and learned teamwork.
Predicted_probs: [0.3279177 0.3320767 0.3400057]
Predicted_label: 2

Text: My long term aim is to work on scalable ML systems.
Predicted_probs: [0.33337608 0.3346251  0.33199885]
Predicted_label: 1


#### This project showed how NLP and Deep Learning can classify sentences.
#### Few things that I observed were :
1. Lemmatization is more consistent and accurate than Stemming.
2. TV-IDF is a better metric for classification than Bag-of-Words.
3. Splitting paragraphs into sentences increased accuracy.
4. LSTM is good for capturing contextual meaning.

Even though dataset was very small RNN is still very effective in classfiying the example sentences.