In [13]:
import nltk
from nltk.tokenize import word_tokenize

# Make sure the Punkt tokenizer data used by word_tokenize() is present.
# Elsewhere in this notebook it is only downloaded in later cells, so on a
# fresh environment this cell would otherwise depend on running those first.
# quiet=True keeps the downloader's log lines out of this cell's output.
nltk.download('punkt', quiet=True)

# Sample review
review = "The movie was fantastic! The acting was superb and the plot was engaging."

# Split the review into word and punctuation tokens
tokens = word_tokenize(review)

# Show the input next to its tokenization
print("Original Review:", review)
print("Tokenized Review:", tokens)

Original Review: The movie was fantastic! The acting was superb and the plot was engaging.
Tokenized Review: ['The', 'movie', 'was', 'fantastic', '!', 'The', 'acting', 'was', 'superb', 'and', 'the', 'plot', 'was', 'engaging', '.']


In [14]:
# Sample review
review = "The movie was fantastic! The acting was superb and the plot was engaging."

# Lowercased copy of the review; `review` itself is left untouched
lowercase_review = str.lower(review)

# Show the text before and after lowercasing
for label, text in (("Original Review:", review), ("Lowercase Review:", lowercase_review)):
    print(label, text)

Original Review: The movie was fantastic! The acting was superb and the plot was engaging.
Lowercase Review: the movie was fantastic! the acting was superb and the plot was engaging.


In [15]:
import string

# Sample review
review = "The movie was fantastic! The acting was superb and the plot was engaging."

# Characters to strip: the punctuation set provided by the string module
punctuation_chars = string.punctuation

# Map every punctuation code point to None so that str.translate() drops it
deletion_table = dict.fromkeys(map(ord, punctuation_chars))
cleaned_review = review.translate(deletion_table)

# Show the text before and after punctuation removal
for label, text in (("Original Review:", review), ("Cleaned Review:", cleaned_review)):
    print(label, text)

Original Review: The movie was fantastic! The acting was superb and the plot was engaging.
Cleaned Review: The movie was fantastic The acting was superb and the plot was engaging


In [16]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the stopword corpus and the Punkt tokenizer data used below
nltk.download('stopwords')
nltk.download('punkt')

# Sample review
review = "The movie was fantastic! The acting was superb and the plot was engaging."

# Break the review into tokens
tokens = word_tokenize(review)

# English stopwords, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Keep only the tokens whose lowercase form is not a stopword.
# Punctuation tokens are not stopwords, so they pass through unchanged.
filtered_tokens = list(filter(lambda word: word.lower() not in stop_words, tokens))

# Reassemble the surviving tokens into one space-separated string
filtered_review = ' '.join(filtered_tokens)

# Show the text before and after stopword removal
for label, text in (
    ("Original Review:", review),
    ("Review after Removing Stopwords:", filtered_review),
):
    print(label, text)

Original Review: The movie was fantastic! The acting was superb and the plot was engaging.
Review after Removing Stopwords: movie fantastic ! acting superb plot engaging .


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the Punkt tokenizer data needed by word_tokenize()
nltk.download('punkt')

# Sample review
review = "The movie was fantastic! The acting was superb and the plot was engaging."

# Break the review into tokens
tokens = word_tokenize(review)

# Porter stemmer instance shared by all tokens
porter = PorterStemmer()

# Stem each token in order
stemmed_tokens = list(map(porter.stem, tokens))

# Reassemble the stems into one space-separated string
stemmed_review = ' '.join(stemmed_tokens)

# Show the text before and after stemming
for label, text in (
    ("Original Review:", review),
    ("Review after Stemming:", stemmed_review),
):
    print(label, text)

Original Review: The movie was fantastic! The acting was superb and the plot was engaging.
Review after Stemming: the movi wa fantast ! the act wa superb and the plot wa engag .


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
!pip install spacy




[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: C:\Users\Acer\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [19]:
import spacy

# Load the small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Sample review
review = "The movie was fantastic! The acting was superb and the plot was engaging."

# Run the review through the pipeline
doc = nlp(review)

# Collect the lemma of every token and join them with single spaces
lemmatized_review = ' '.join(map(lambda token: token.lemma_, doc))

# Show the text before and after lemmatization
for label, text in (
    ("Original Review:", review),
    ("Review after Lemmatization:", lemmatized_review),
):
    print(label, text)


Original Review: The movie was fantastic! The acting was superb and the plot was engaging.
Review after Lemmatization: the movie be fantastic ! the acting be superb and the plot be engaging .


In [20]:
# Sample review
review = "The movie was fantastic! The acting was superb and the plot was engaging. 10/10 would recommend."

# Keep every character that is not a digit; separators that sat between
# digits (the "/" in "10/10") are left in place
non_digit_chars = filter(lambda char: not char.isdigit(), review)
cleaned_review = ''.join(non_digit_chars)

# Show the text before and after digit removal
for label, text in (
    ("Original Review:", review),
    ("Review after Removing Numeric Characters:", cleaned_review),
):
    print(label, text)

Original Review: The movie was fantastic! The acting was superb and the plot was engaging. 10/10 would recommend.
Review after Removing Numeric Characters: The movie was fantastic! The acting was superb and the plot was engaging. / would recommend.


In [21]:
# Sample review containing two contractions ("can't" and "won't")
review = "I can't believe they won't show up tonight."

# Lookup table: contraction (key) -> expanded form (value).
# Only the two contractions that appear in the sample review are listed.
contractions = {
    "can't": "cannot",
    "won't": "will not",
    # Add more contractions and their expanded forms as needed
}

# Function to handle contractions
def expand_contractions(review, contractions):
    """Replace every contraction found in ``review`` with its expanded form.

    All contractions are matched in a single left-to-right pass, so text
    produced by one expansion is never expanded a second time. Matching is
    case-insensitive, restricted to whole words, and accepts both the straight
    (') and the typographic (’) apostrophe. When the matched text starts with
    an uppercase letter the expansion is capitalised too ("Can't" -> "Cannot").

    Args:
        review: Text to process.
        contractions: Mapping of contraction -> expanded form. Keys are
            compared case-insensitively; empty keys are ignored.

    Returns:
        A new string with the contractions expanded. ``review`` is returned
        unchanged when there is nothing to look for.
    """
    import re  # local import: the enclosing cell does not import re

    def _normalise(text):
        # One canonical spelling for lookups: straight apostrophe, lowercase.
        return text.replace("’", "'").lower()

    lookup = {_normalise(key): value for key, value in contractions.items() if key}
    if not lookup:
        # An empty alternation would match the empty string at every position.
        return review

    # Longest keys first so that a longer contraction wins over its own prefix.
    alternatives = sorted(lookup, key=len, reverse=True)
    pattern = re.compile(
        r"(?<!\w)(?:"
        + "|".join(
            # Escape the literal parts and let either apostrophe style match.
            "['’]".join(re.escape(part) for part in key.split("'"))
            for key in alternatives
        )
        + r")(?!\w)",
        re.IGNORECASE,
    )

    def _expand(match):
        found = match.group(0)
        expansion = lookup.get(_normalise(found))
        if expansion is None:
            # Case-insensitive match whose lowercase form is not a key:
            # leave the text as it was rather than guessing.
            return found
        if found[0].isupper() and expansion[:1].islower():
            expansion = expansion[0].upper() + expansion[1:]
        return expansion

    # A callable replacement also keeps backslashes in expansions literal.
    return pattern.sub(_expand, review)

# Expand the contractions in the sample review
expanded_review = expand_contractions(review, contractions)

# Show the text before and after expansion
for label, text in (
    ("Original Review:", review),
    ("Review after Handling Contractions:", expanded_review),
):
    print(label, text)


Original Review: I can't believe they won't show up tonight.
Review after Handling Contractions: I cannot believe they will not show up tonight.


In [22]:
import re

# Sample review wrapped in <p>, <b> and <i> tags, used to exercise tag removal
review = "<p>The movie <b>was</b> fantastic! <i>The</i> acting was superb and the plot was engaging.</p>"

# Function to remove HTML tags
def remove_html_tags(review):
    """Strip HTML/XML markup from ``review`` and return the remaining text.

    Only spans that look like markup are removed:

    * comments (``<!-- ... -->``), including any ``>`` inside them;
    * tags, declarations and processing instructions, i.e. ``<`` optionally
      followed by ``/``, ``!`` or ``?`` and then a letter, up to the next ``>``.

    A bare ``<`` or ``>`` in ordinary text (e.g. "1 < 2 and 3 > 2") is kept;
    a plain ``<[^>]+>`` pattern would delete everything between the two signs.
    Character entities such as ``&amp;`` are not decoded.

    Args:
        review: Text that may contain markup.

    Returns:
        The text with markup spans removed.
    """
    clean_review = re.sub(
        r'<!--.*?-->|<[!?/]?[A-Za-z][^>]*>',
        '',
        review,
        flags=re.DOTALL,  # let comments span several lines
    )
    return clean_review

# Strip the markup from the sample review
cleaned_review = remove_html_tags(review)

# Show the text before and after tag removal
for label, text in (
    ("Original Review:", review),
    ("Review after Removing HTML Tags:", cleaned_review),
):
    print(label, text)


Original Review: <p>The movie <b>was</b> fantastic! <i>The</i> acting was superb and the plot was engaging.</p>
Review after Removing HTML Tags: The movie was fantastic! The acting was superb and the plot was engaging.


In [23]:
import re

# Sample review containing punctuation and two trailing emoji
review = "The movie was fantastic! The acting was superb, and the plot was engaging. 😊🎬"

# Function to handle special characters and symbols
def handle_special_characters(review):
    """Return ``review`` reduced to ASCII letters, digits and whitespace.

    Every other character (punctuation, symbols, emoji, accented letters,
    underscores) is deleted. Whitespace is kept as-is, so deleting a symbol
    that follows a space leaves that space behind.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', review)

# Strip special characters and symbols from the sample review
cleaned_review = handle_special_characters(review)

# Show the text before and after cleaning
for label, text in (
    ("Original Review:", review),
    ("Review after Handling Special Characters and Symbols:", cleaned_review),
):
    print(label, text)


Original Review: The movie was fantastic! The acting was superb, and the plot was engaging. 😊🎬
Review after Handling Special Characters and Symbols: The movie was fantastic The acting was superb and the plot was engaging 


In [24]:
import unicodedata

# Sample review in which several vowels carry accents (é, õ, à, á)
review = "Thé mõvie wàs fántástic! The àcting wás superb, and the plot was engaging."

# Function to normalize text
def normalize_text(review):
    """Return an ASCII-only version of ``review``.

    The text is first decomposed with Unicode NFKD, which separates base
    letters from their combining accents. It is then encoded to ASCII with
    errors ignored, which drops the accents along with any other character
    that has no ASCII form (emoji, for example).
    """
    decomposed = unicodedata.normalize('NFKD', review)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')

# Normalize the accented sample review
normalized_review = normalize_text(review)

# Show the text before and after normalization
for label, text in (
    ("Original Review:", review),
    ("Review after Normalization:", normalized_review),
):
    print(label, text)


Original Review: Thé mõvie wàs fántástic! The àcting wás superb, and the plot was engaging.
Review after Normalization: The movie was fantastic! The acting was superb, and the plot was engaging.


In [25]:
from collections import Counter

# Sample review with two misspelled words ("movi", "actng")
review = "The movi was fantastic! The actng was superb and the plot was engaging."

# Function to handle rare words or typos
def handle_rare_words(review, threshold=1):
    """Replace infrequent tokens in ``review`` with the ``<UNK>`` placeholder.

    The review is split on whitespace only, so tokens are case-sensitive and
    keep any attached punctuation ("The" and "the" are different tokens, as
    are "fantastic!" and "fantastic"). Frequencies are counted within this
    single review.

    Args:
        review: Text to process.
        threshold: A token is treated as rare when it occurs ``threshold``
            times or fewer. With the default of 1, every token that appears
            exactly once is replaced.

    Returns:
        The tokens joined by single spaces, with rare tokens replaced by
        ``<UNK>``. Runs of whitespace in the input are therefore collapsed.
    """
    tokens = review.split()
    word_counts = Counter(tokens)

    # Look the count up directly instead of scanning a list of rare words for
    # every token; the result is the same and the work stays linear.
    return ' '.join(
        '<UNK>' if word_counts[word] <= threshold else word
        for word in tokens
    )

# Replace the rare tokens of the sample review (default threshold)
cleaned_review = handle_rare_words(review)

# Show the text before and after replacement
for label, text in (
    ("Original Review:", review),
    ("Review after Handling Rare Words or Typos:", cleaned_review),
):
    print(label, text)


Original Review: The movi was fantastic! The actng was superb and the plot was engaging.
Review after Handling Rare Words or Typos: The <UNK> was <UNK> The <UNK> was <UNK> <UNK> <UNK> <UNK> was <UNK>


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Sample review
review = "The movie was fantastic! The acting was superb and the plot was engaging."

# TF-IDF vectorizer created with its default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the single review and vectorize it in one step
tfidf_matrix = vectorizer.fit_transform([review])

# Dense array form of the result (one row, because one document was passed)
dense_tfidf_matrix = tfidf_matrix.toarray()

# Vocabulary terms, in the same order as the matrix columns
features = vectorizer.get_feature_names_out()

# Pair each term with its weight from the flattened row
feature_tfidf_dict = {
    term: weight for term, weight in zip(features, dense_tfidf_matrix.flatten())
}

# Print one "term: weight" line per feature.
# NOTE(review): only one document is fitted, so the IDF part cannot tell terms
# apart; the printed weights look like length-normalised term counts
# (terms seen once -> 0.2, terms seen three times -> 0.6).
print("Feature Engineering Result:")
for feature, tfidf_value in feature_tfidf_dict.items():
    print(f"{feature}: {tfidf_value}")


Feature Engineering Result:
acting: 0.2
and: 0.2
engaging: 0.2
fantastic: 0.2
movie: 0.2
plot: 0.2
superb: 0.2
the: 0.6
was: 0.6
