In [1]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import pos_tag

# Ensure necessary NLTK datasets are downloaded
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('omw-1.4')

# Load reviews from the second column of the CSV file (no header)
df = pd.read_csv('IA2.csv', header=None, usecols=[1])
reviews = df[1].tolist()

# Step 1: Tokenization
tokenized_reviews = [word_tokenize(review.lower()) for review in reviews]

# Step 2: Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_reviews = [[lemmatizer.lemmatize(token) for token in tokens] for tokens in tokenized_reviews]

# Step 3: Remove Stop-words and Punctuation
stop_words = set(stopwords.words('english') + list(string.punctuation))
filtered_reviews = [[token for token in tokens if token not in stop_words] for tokens in lemmatized_reviews]

# Step 4: TF-IDF Vectorization (Including 2-grams)
preprocessed_reviews = [' '.join(tokens) for tokens in filtered_reviews]
tfidf_vectorizer = TfidfVectorizer(min_df=3, ngram_range=(1, 2))
tfidf_vectors = tfidf_vectorizer.fit_transform(preprocessed_reviews)

# Save the TF-IDF vectors to a CSV file
tfidf_df = pd.DataFrame(tfidf_vectors.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df.to_csv('tfidf_vectors.csv', index=False)

# Step 5: POS-Tagging and TF-IDF Vectorization
pos_tagged_reviews = [pos_tag(tokens) for tokens in tokenized_reviews]
pos_tagged_reviews_flat = [' '.join([f'{word}_{tag}' for word, tag in review]) for review in pos_tagged_reviews]

tfidf_vectorizer_pos = TfidfVectorizer(min_df=4, norm=None, use_idf=True, smooth_idf=True)
tfidf_vectors_pos = tfidf_vectorizer_pos.fit_transform(pos_tagged_reviews_flat)

# Saving the POS-tagged TF-IDF vectors to a CSV file
tfidf_df_pos = pd.DataFrame(tfidf_vectors_pos.toarray(), columns=tfidf_vectorizer_pos.get_feature_names_out())
tfidf_df_pos.to_csv('tfidf_vectors_pos.csv', index=False)

# Calculate dimensions
tfidf_vectors_shape = tfidf_vectors.shape
tfidf_vectors_pos_shape = tfidf_vectors_pos.shape

print("TF-IDF vectors and POS-tagged TF-IDF vectors are saved.")
print(f"Step 4 TF-IDF Vector Dimensions: {tfidf_vectors_shape}")
print(f"Step 5 POS-Tagged TF-IDF Vector Dimensions: {tfidf_vectors_pos_shape}")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ken\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ken\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Ken\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ken\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Ken\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


TF-IDF vectors and POS-tagged TF-IDF vectors are saved.
Step 4 TF-IDF Vector Dimensions: (100, 1383)
Step 5 POS-Tagged TF-IDF Vector Dimensions: (100, 936)
