### Week 11 – Natural Language Processing (NLP Basics)

#### Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer


#### Download Required NLTK Resources

In [11]:
import nltk  # already imported in the Imports cell; repeated so this cell runs on its own

# NLTK data packages fetched by this cell. The tokenization and stop-word
# cells further down need NLTK data to be available locally.
NLTK_RESOURCES = ('punkt', 'punkt_tab', 'stopwords')

# nltk.download() signals failure through its boolean return value rather than
# by raising, and quiet=True keeps it from printing progress, so a failed
# fetch would otherwise go unnoticed until a later cell breaks.
failed_downloads = [
    resource for resource in NLTK_RESOURCES
    if not nltk.download(resource, quiet=True)
]
if failed_downloads:
    # Warn instead of raising: the data may already be installed locally
    # (e.g. when working offline), in which case later cells still work.
    print("Warning: could not download NLTK resources:", failed_downloads)

# Displayed as the cell result: True when every download succeeded.
not failed_downloads


True

#### Create Sample Text Dataset

(For Assignment 11 – text preprocessing)

In [3]:
# Four short example sentences used as the toy corpus for the preprocessing steps.
sample_sentences = [
    "This project uses AI for house price prediction",
    "Machine learning and deep learning are powerful tools",
    "Natural Language Processing is used for text analysis",
    "Tokenization and TF IDF are basic NLP steps",
]

# Single-column layout: every sentence becomes one row under 'text'.
data = {'text': sample_sentences}
df = pd.DataFrame(data)

df


Unnamed: 0,text
0,This project uses AI for house price prediction
1,Machine learning and deep learning are powerfu...
2,Natural Language Processing is used for text a...
3,Tokenization and TF IDF are basic NLP steps


#### Text Cleaning

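(Lowercase the text and strip out every character that is not a letter a–z or whitespace, so digits and punctuation are dropped)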

In [4]:
def clean_text(text):
    """Normalize a raw sentence so it can be tokenized into plain words.

    The text is lowercased, every character that is not an ASCII letter
    (a-z) or whitespace is replaced by a space, and runs of whitespace are
    collapsed to a single space with leading/trailing whitespace trimmed.
    Digits and punctuation are therefore dropped.

    Parameters
    ----------
    text : str
        The raw sentence to clean.

    Returns
    -------
    str
        The cleaned, lowercase, single-spaced text.
    """
    text = text.lower()
    # Substitute a space rather than '' so that words separated only by
    # punctuation ("price,prediction", "tf-idf") are not fused together.
    text = re.sub(r'[^a-z\s]', ' ', text)
    # The substitution can leave double spaces or edge spaces; tidy them up.
    return re.sub(r'\s+', ' ', text).strip()

# Run the cleaner over every raw sentence and store the result next to the
# original text in a new 'clean_text' column.
df['clean_text'] = df['text'].map(clean_text)

df


Unnamed: 0,text,clean_text
0,This project uses AI for house price prediction,this project uses ai for house price prediction
1,Machine learning and deep learning are powerfu...,machine learning and deep learning are powerfu...
2,Natural Language Processing is used for text a...,natural language processing is used for text a...
3,Tokenization and TF IDF are basic NLP steps,tokenization and tf idf are basic nlp steps


#### Tokenization

(Splitting sentences into words)

In [5]:
# Tokenize each cleaned sentence with NLTK's word_tokenize; every row of the
# new 'tokens' column holds the tokens of one sentence.
df['tokens'] = df['clean_text'].map(word_tokenize)

df


Unnamed: 0,text,clean_text,tokens
0,This project uses AI for house price prediction,this project uses ai for house price prediction,"[this, project, uses, ai, for, house, price, p..."
1,Machine learning and deep learning are powerfu...,machine learning and deep learning are powerfu...,"[machine, learning, and, deep, learning, are, ..."
2,Natural Language Processing is used for text a...,natural language processing is used for text a...,"[natural, language, processing, is, used, for,..."
3,Tokenization and TF IDF are basic NLP steps,tokenization and tf idf are basic nlp steps,"[tokenization, and, tf, idf, are, basic, nlp, ..."


#### Stopword Removal

(Removing common words that carry little meaning, such as "is", "the", "and", and "for")

In [6]:
# Use a set so each membership test during filtering is a constant-time lookup.
stop_words = set(stopwords.words('english'))


def _drop_stopwords(words):
    """Return the tokens from ``words`` that are not in ``stop_words``, in order."""
    kept = []
    for token in words:
        if token not in stop_words:
            kept.append(token)
    return kept


# Filter every token list and keep the result in its own column.
df['tokens_no_stopwords'] = df['tokens'].map(_drop_stopwords)

df


Unnamed: 0,text,clean_text,tokens,tokens_no_stopwords
0,This project uses AI for house price prediction,this project uses ai for house price prediction,"[this, project, uses, ai, for, house, price, p...","[project, uses, ai, house, price, prediction]"
1,Machine learning and deep learning are powerfu...,machine learning and deep learning are powerfu...,"[machine, learning, and, deep, learning, are, ...","[machine, learning, deep, learning, powerful, ..."
2,Natural Language Processing is used for text a...,natural language processing is used for text a...,"[natural, language, processing, is, used, for,...","[natural, language, processing, used, text, an..."
3,Tokenization and TF IDF are basic NLP steps,tokenization and tf idf are basic nlp steps,"[tokenization, and, tf, idf, are, basic, nlp, ...","[tokenization, tf, idf, basic, nlp, steps]"


#### Join Tokens Back to Text

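(Required for TF-IDF: the vectorizer is given one string per document, so each token list is joined back into a space-separated sentence)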

In [7]:
# Glue each filtered token list back into one space-separated string;
# the bound method ' '.join is applied to every list in the column.
df['final_text'] = df['tokens_no_stopwords'].map(' '.join)

df


Unnamed: 0,text,clean_text,tokens,tokens_no_stopwords,final_text
0,This project uses AI for house price prediction,this project uses ai for house price prediction,"[this, project, uses, ai, for, house, price, p...","[project, uses, ai, house, price, prediction]",project uses ai house price prediction
1,Machine learning and deep learning are powerfu...,machine learning and deep learning are powerfu...,"[machine, learning, and, deep, learning, are, ...","[machine, learning, deep, learning, powerful, ...",machine learning deep learning powerful tools
2,Natural Language Processing is used for text a...,natural language processing is used for text a...,"[natural, language, processing, is, used, for,...","[natural, language, processing, used, text, an...",natural language processing used text analysis
3,Tokenization and TF IDF are basic NLP steps,tokenization and tf idf are basic nlp steps,"[tokenization, and, tf, idf, are, basic, nlp, ...","[tokenization, tf, idf, basic, nlp, steps]",tokenization tf idf basic nlp steps


#### TF-IDF Vectorization

(Convert text into numerical features)

In [8]:
# The documents to vectorize: the stop-word-free sentences built above.
corpus = df['final_text']

# Learn the vocabulary and IDF weights from the corpus and produce the
# TF-IDF matrix in a single fit_transform call (default vectorizer settings).
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(corpus)

# The matrix has one row per document and one column per vocabulary term.
n_documents, n_terms = X_tfidf.shape
print("TF-IDF Shape:", (n_documents, n_terms))


TF-IDF Shape: (4, 23)


#### View TF-IDF Feature Matrix

In [9]:
# Column labels: the vocabulary terms learned by the fitted vectorizer.
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a dense array so it can be shown as a table
# (fine for this tiny corpus; a large corpus would make this array very big).
dense_scores = X_tfidf.toarray()

tfidf_df = pd.DataFrame(dense_scores, columns=feature_names)

tfidf_df


Unnamed: 0,ai,analysis,basic,deep,house,idf,language,learning,machine,natural,...,price,processing,project,steps,text,tf,tokenization,tools,used,uses
0,0.408248,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,...,0.408248,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.408248
1,0.0,0.0,0.0,0.353553,0.0,0.0,0.0,0.707107,0.353553,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,0.0,0.0
2,0.0,0.408248,0.0,0.0,0.0,0.0,0.408248,0.0,0.0,0.408248,...,0.0,0.408248,0.0,0.0,0.408248,0.0,0.0,0.0,0.408248,0.0
3,0.0,0.0,0.408248,0.0,0.0,0.408248,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.408248,0.0,0.408248,0.408248,0.0,0.0,0.0


#### Simple Verification Test

In [10]:
# Smoke test: tokenize a fixed sentence and check the result, so a broken
# tokenizer setup fails loudly instead of relying on reading the output by eye.
sample_tokens = word_tokenize("NLP is working fine")
assert sample_tokens == ['NLP', 'is', 'working', 'fine'], (
    f"Unexpected tokenization result: {sample_tokens}"
)

sample_tokens

['NLP', 'is', 'working', 'fine']