<a href="https://colab.research.google.com/github/Mohamedh0/Amit/blob/main/NLP_task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Kaggle handle (owner/dataset) of the movie quotes dataset
DATASET_HANDLE = "mexwell/movie-quotes"

# Download the latest version and keep the local directory it was stored in
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/mexwell/movie-quotes/versions/1


In [2]:
import re
import string
from pathlib import Path

import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS

# Name of the spaCy English model to load
SPACY_MODEL_NAME = "en_core_web_sm"

# Build the pipeline once; normalize_text_spacy reuses this module-level object
nlp = spacy.load(SPACY_MODEL_NAME)

In [3]:
# Load the dataset from the directory kagglehub returned in `path`, rather than
# a hardcoded absolute cache location that is tied to one machine and one
# dataset version.
df = pd.read_csv(Path(path) / 'movie_quotes.csv')
df.head()

Unnamed: 0,quote,movie,type,year
0,"Do, or do not. There is no try.",Star Wars: Episode V - The Empire Strikes Back,movie,1890
1,Listen to them. Children of the night. What mu...,Dracula,movie,1931
2,It's alive! It's alive!,Frankenstein,movie,1931
3,"Oh, no, it wasn't the airplanes. It was Beauty...",King Kong,movie,1933
4,"Magic Mirror on the wall, who is the fairest o...",Snow White and the Seven Dwarves,movie,1937


In [4]:
# Clean up unnecessary columns.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second execution no longer raises KeyError because 'year'
# has already been removed.
df = df.drop(columns=['year'], errors='ignore')

# Separate features (every remaining column except 'type') and target
X = df.drop(columns=['type'])
y = df['type']

In [5]:
def normalize_text_spacy(text):
    """
    Normalize a single text value into a space-separated string of lemmas.

    Steps:
    - Treats missing values (None / float NaN) as empty text and coerces any
      other non-string value with ``str()``.
    - Converts text to lowercase.
    - Removes URLs and email addresses.
    - Removes every character that is not an ASCII letter or whitespace
      (digits, punctuation and apostrophes are all dropped).
    - Tokenizes the text with the module-level spaCy pipeline ``nlp``.
    - Removes stopwords, punctuation, number-like and non-alphabetic tokens.
    - Lemmatizes the remaining tokens.

    Args:
        text: Raw text to normalize. Usually a ``str``; missing values such as
            ``None`` or ``NaN`` (as found in pandas columns) are accepted.

    Returns:
        str: The kept lemmas joined by single spaces, or ``''`` when the input
        is missing or nothing survives the cleaning.
    """
    # 0. Missing values would otherwise crash on .lower(); NaN is the only
    #    float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # 1. Convert text to lowercase (coercing non-string scalars first)
    text = str(text).lower()

    # 2. Remove URLs ('http\S+' already covers 'https...')
    text = re.sub(r'http\S+|www\S+', '', text)

    # 3. Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # 4. Keep only ASCII letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Nothing left to tokenize: skip the spaCy pipeline entirely
    if not text.strip():
        return ''

    # 5. Process the text using spaCy
    doc = nlp(text)

    # 6. Remove stopwords, numeric tokens, punctuation, and lemmatize tokens
    normalized_tokens = [
        token.lemma_ for token in doc
        if token.text not in STOP_WORDS
        and not token.is_punct
        and not token.like_num
        and token.is_alpha
    ]

    # 7. Join tokens back into a single string
    return ' '.join(normalized_tokens)

In [6]:
# Run the normalizer over every value of each feature column in X
for column_name in X.columns:
    X[column_name] = X[column_name].map(normalize_text_spacy)
X.head()

Unnamed: 0,quote,movie
0,try,star war episode v empire strike
1,listen child night music,dracula
2,alive alive,frankenstein
3,oh not airplane beauty kill beast,king kong
4,magic mirror wall fair,snow white dwarf


In [7]:
# Build one document per row by joining that row's columns with spaces
combined_text = X.apply(lambda record: ' '.join(record.astype(str)), axis=1)

# Fit a Bag-of-Words model on the documents and encode them as term counts
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(combined_text)

# Show the learned vocabulary, then the dense count matrix on the next line
print(vectorizer.get_feature_names_out(), bow_matrix.toarray(), sep='\n')

['abide' 'ability' 'able' ... 'zombieland' 'zoolander' 'zootopia']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [8]:
# Fit a TF-IDF model on the combined documents and encode them as weights
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(combined_text)

# Show the learned vocabulary, then the dense TF-IDF matrix on the next line
print(vectorizer.get_feature_names_out(), tf_idf_matrix.toarray(), sep='\n')

['abide' 'ability' 'able' ... 'zombieland' 'zoolander' 'zootopia']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
