# Lab 10: NLP - Classification

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Read the tab-separated review file into a DataFrame
data = pd.read_csv('musical1.tsv', sep='\t')
# Split out the review text column and the score (label) column
reviews, scores = data['Review'], data['Score']

# Function to tokenize a review
def tokenizeReviews(review):
    """Lowercase a review and split it into word tokens.

    Args:
        review: Raw review text. Any non-string value (for example the
            float NaN that pandas produces for an empty cell) is treated
            as a review with no text.

    Returns:
        list: The tokens returned by ``word_tokenize`` for the lowercased
        text, or an empty list when ``review`` is not a string.
    """
    # A missing cell has no .lower(); return no tokens rather than raising
    # AttributeError part-way through a Series-wide apply.
    if not isinstance(review, str):
        return []
    return word_tokenize(review.lower())

# Run the tokenizer over every review, giving one token list per row
tokenized_reviews = reviews.map(tokenizeReviews)

# Shared Porter stemmer instance; stemReviews calls its stem() on each token
porterStemmer = PorterStemmer()

# Function to stem tokens for all reviews
def stemReviews(tokenizedReviews):
    """Stem every token of one tokenized review.

    Args:
        tokenizedReviews: Iterable of tokens belonging to a single review.

    Returns:
        list: The result of ``porterStemmer.stem`` for each token, in the
        same order as the input.
    """
    stems = []
    for word in tokenizedReviews:
        stems.append(porterStemmer.stem(word))
    return stems

# Stem each review's token list, row by row
stemmed_reviews = tokenized_reviews.map(stemReviews)

# Shared WordNet lemmatizer instance; lemmatizeReviews calls its lemmatize() on each token
lemmatizer = WordNetLemmatizer()

# Function to lemmatize reviews
def lemmatizeReviews(stemmedReviews):
    """Lemmatize every token of one (already stemmed) review.

    Args:
        stemmedReviews: Iterable of tokens belonging to a single review.

    Returns:
        list: The result of ``lemmatizer.lemmatize`` for each token, in the
        same order as the input.
    """
    lemmas = []
    for word in stemmedReviews:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

# Pass every stemmed token list through the lemmatizer
lemmatized_reviews = stemmed_reviews.map(lemmatizeReviews)

# Vectorization: the vectorizer is fed strings, so glue each token list
# back together with single spaces before counting terms
vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(
    [' '.join(tokens) for tokens in lemmatized_reviews]
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    vectorized,
    scores,
    test_size=0.2,
    random_state=42,
)

# Building Random Forest model.
# The forest is seeded with the same value as the train/test split so that
# repeated runs train the same model and report the same metrics.
forest_model = RandomForestClassifier(random_state=42)
forest_model.fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred = forest_model.predict(X_test)

# Evaluation: score the held-out predictions with each metric; the metrics
# are evaluated in the same order as the names they are unpacked into
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the four scores
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")



Accuracy: 0.775
Precision: 0.7851239669421488
Recall: 0.8333333333333334
F1-Score: 0.8085106382978725
