# Dissertation Project: PCA and Linear Regression on Anime Ratings
This project tests the hypothesis that applying Principal Component Analysis (PCA) as a dimensionality reduction step to the extracted TF-IDF features and user review category scores improves the performance of a Linear Regression model that predicts overall anime ratings. Improvement is measured as lower Mean Squared Error (MSE) and Mean Absolute Error (MAE) on the validation and test datasets, compared with a Linear Regression model trained without PCA.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Data Loading and Preprocessing
We will load the datasets, preprocess the data and extract the necessary features for our analysis.


In [2]:
# Read the three source tables from the working directory.
# NOTE(review): `profiles` is loaded but not referenced in the cells shown here.
animes = pd.read_csv("animes.csv")
profiles = pd.read_csv("profiles.csv")
reviews = pd.read_csv("reviews.csv")

# Attach anime metadata to every review (inner join on the anime id).
# Columns present in both tables receive pandas' default suffixes, so the
# review's own score becomes `score_x` and the anime's score becomes `score_y`.
merged_data = pd.merge(reviews, animes, left_on='anime_uid', right_on='uid')

# Derive the modelling columns in one pass:
#   text             -> lower-cased review text
#   avg_review_score -> row-wise mean of the review score and the anime score
merged_data = merged_data.assign(
    text=lambda frame: frame['text'].str.lower(),
    avg_review_score=lambda frame: frame[['score_x', 'score_y']].mean(axis=1),
)


# Feature Extraction using TF-IDF
We will use the TF-IDF method to extract features from the review text.


In [3]:
# Target: the averaged score column computed during preprocessing.
y = merged_data['avg_review_score']

# Features: TF-IDF weights of the review text, limited to the 5000
# highest-frequency terms after removing English stop words.
review_texts = merged_data['text']
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(review_texts)


# Train-Validation-Test Split
We will split the data into train, validation, and test sets. Holding out 20% for testing and then 25% of the remainder for validation gives a 60/20/20 split.

In [4]:
# Step 1: hold out 20% of all rows as the final test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 2: carve the validation set out of the remaining 80%.
# 25% of that remainder is 20% of the full data, giving 60/20/20 overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=42,
)

# PCA for Dimensionality Reduction
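We will apply PCA to reduce the dimensionality of the extracted features to 100 principal components. The projection is fitted on the training set only and then applied unchanged to the validation and test sets.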

In [5]:
# Apply PCA for dimensionality reduction
# The projection is learned from the training split only and then reused for
# the validation and test splits, so no information from held-out data leaks
# into the components.
# random_state is fixed (matching the seed used for the data splits) so that
# a randomized SVD solver, if scikit-learn selects one, gives repeatable output.
pca = PCA(n_components=100, random_state=42)

# Each split is converted with .toarray() before being handed to PCA.
# NOTE(review): this materialises a dense (n_samples x n_features) array per
# split, which can be memory-hungry for a large review corpus.
X_train_pca = pca.fit_transform(X_train.toarray())
X_val_pca = pca.transform(X_val.toarray())  # was `Xi_val`, an undefined name
X_test_pca = pca.transform(X_test.toarray())

# Linear Regression Model
We will train Linear Regression models with and without PCA, and compare their performance on validation and test datasets.


In [None]:
# Train Linear Regression models with and without PCA
# Baseline model: trained directly on the un-reduced training features.
# This fit is required; predicting with an unfitted estimator raises
# sklearn.exceptions.NotFittedError.
lr_no_pca = LinearRegression()
lr_no_pca.fit(X_train, y_train)

# Comparison model: trained on the PCA-projected training features.
lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train)

# Predictions on validation and test datasets
# Each model is scored on the feature representation it was trained on.
y_val_pred_no_pca = lr_no_pca.predict(X_val)
y_val_pred_pca = lr_pca.predict(X_val_pca)
y_test_pred_no_pca = lr_no_pca.predict(X_test)
y_test_pred_pca = lr_pca.predict(X_test_pca)

# Calculate MSE and MAE for models with and without PCA
# Validation-set errors.
mse_val_no_pca = mean_squared_error(y_val, y_val_pred_no_pca)
mse_val_pca = mean_squared_error(y_val, y_val_pred_pca)
mae_val_no_pca = mean_absolute_error(y_val, y_val_pred_no_pca)
mae_val_pca = mean_absolute_error(y_val, y_val_pred_pca)

# Test-set errors.
mse_test_no_pca = mean_squared_error(y_test, y_test_pred_no_pca)
mse_test_pca = mean_squared_error(y_test, y_test_pred_pca)
mae_test_no_pca = mean_absolute_error(y_test, y_test_pred_no_pca)
mae_test_pca = mean_absolute_error(y_test, y_test_pred_pca)

# Print results
# Lower values are better for both metrics; the hypothesis is supported when
# the "with PCA" figures are below the "without PCA" figures.
print("Validation MSE without PCA:", mse_val_no_pca)
print("Validation MSE with PCA:", mse_val_pca)
print("Validation MAE without PCA:", mae_val_no_pca)
print("Validation MAE with PCA:", mae_val_pca)
print("\nTest MSE without PCA:", mse_test_no_pca)
print("Test MSE with PCA:", mse_test_pca)
print("Test MAE without PCA:", mae_test_no_pca)
print("Test MAE with PCA:", mae_test_pca)
