<a href="https://colab.research.google.com/github/Mohamedh0/Amit/blob/main/NLP_task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/uciml/sms-spam-collection-dataset/versions/1


In [23]:
import string
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [24]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
# Load the dataset
df = pd.read_csv(r'/root/.cache/kagglehub/datasets/uciml/sms-spam-collection-dataset/versions/1/spam.csv', encoding='latin1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [26]:
# Clean up unnecessary columns
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

# Rename columns for better clarity
df.columns = ['label', 'message']

# Separate features and target
X = df['message']
y = df['label']

In [27]:
# Define normalization function
def normalize_text(text):
    """
    Normalization of a single text string using NLTK:
    - Converts text to lowercase.
    - Removes URLs and email addresses.
    - Removes non-alphanumeric characters, punctuation, and extra spaces.
    - Tokenizes text.
    - Removes stopwords and numeric tokens.
    - Lemmatizes tokens.
    """
    # 1. Convert text to lowercase
    text = text.lower()

    # 2. Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # 3. Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # 4. Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 5. Tokenize the text
    tokens = word_tokenize(text)

    # 6. Remove stopwords and numeric tokens
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [
        token for token in tokens
        if token not in stop_words and token.isalpha()
    ]

    # 7. Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # 8. Join tokens back into a single string
    return ' '.join(lemmatized_tokens)

In [28]:
X=X.apply(normalize_text)
X.head()

Unnamed: 0,message
0,go jurong point crazy available bugis n great ...
1,ok lar joking wif u oni
2,free entry wkly comp win fa cup final tkts st ...
3,u dun say early hor u c already say
4,nah dont think go usf life around though


In [29]:
# Create the Bag-of-Words matrix
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(X)

# Display feature names and the BoW matrix
print(vectorizer.get_feature_names_out())
print(bow_matrix.toarray())

['aa' 'aah' 'aaniye' ... 'zoom' 'zouk' 'zyada']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [30]:
# Create the TF-IDF matrix
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(X)

# Display feature names and the TF-IDF matrix
print(vectorizer.get_feature_names_out())
print(tf_idf_matrix.toarray())

['aa' 'aah' 'aaniye' ... 'zoom' 'zouk' 'zyada']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
