# **Read Training and Testing Dataset**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import scipy
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

In [None]:
# Load the raw training and test tables; the paths are relative to the
# notebook's working directory.
train = pd.read_csv("project_data_files/book_rating_train.csv")
test = pd.read_csv("project_data_files/book_rating_test.csv")

# **Feature Selection**

In [None]:
import re
def preprocess_text(text):
    """Normalise a free-text cell to lowercase, space-separated word tokens.

    Every run of non-word characters (anything outside ``[A-Za-z0-9_]`` and
    Unicode word characters) is treated as a separator, so punctuation is
    dropped and repeated whitespace is collapsed.

    Args:
        text: The raw cell value. Missing values (``None`` / NaN) yield an
            empty string; other non-string values are converted with ``str``.

    Returns:
        The cleaned text as a single string.
    """
    if not isinstance(text, str):
        # pandas represents a missing cell as None or float NaN (NaN != NaN);
        # calling .lower() on either would raise AttributeError.
        if text is None or text != text:
            return ''
        text = str(text)
    words = re.sub(r'\W+', ' ', text.lower()).split()
    return ' '.join(words)

# Add a cleaned copy of each free-text column next to the raw one.
train['preprocessed_name'] = train['Name'].map(preprocess_text)
train['preprocessed_authors'] = train['Authors'].map(preprocess_text)
train['preprocessed_description'] = train['Description'].map(preprocess_text)


In [None]:
# Row labels of the training books carrying each rating label; used later to
# colour the embedding scatter plots.
low_rating = train.index[train['rating_label'] == 3.0]
med_rating = train.index[train['rating_label'] == 4.0]
high_rating = train.index[train['rating_label'] == 5.0]

In [None]:
# Load pretrained GloVe embeddings
def load_glove_embeddings(file_path, embedding_dim=50):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each usable line holds a token followed by exactly ``embedding_dim``
    numbers. Blank lines, lines with the wrong number of components and lines
    whose components are not numeric are skipped instead of aborting the load.

    Args:
        file_path: Path to the UTF-8 encoded GloVe text file.
        embedding_dim: Number of vector components expected on every line.

    Returns:
        A dict mapping each token to a float32 NumPy array of shape
        ``(embedding_dim,)``.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Split from the right so that a token which itself contains
            # whitespace keeps its full text and the vector stays aligned.
            fields = line.strip().rsplit(None, embedding_dim)
            if len(fields) != embedding_dim + 1:
                continue
            word, *components = fields
            try:
                embeddings[word] = np.asarray(components, dtype='float32')
            except ValueError:
                # Non-numeric component: treat the line as malformed.
                continue
    return embeddings

# Parse the 50-dimensional GloVe file (looked up in the working directory)
# into a token -> vector dict.
glove = load_glove_embeddings("glove.6B.50d.txt", embedding_dim=50)

def average_glove_embedding(text, embedding_model, embedding_size=50):
    """Embed a text as the mean of the vectors of its known words.

    Args:
        text: Whitespace-separated words.
        embedding_model: Mapping from word to NumPy vector.
        embedding_size: Expected vector length; vectors of any other shape
            are ignored.

    Returns:
        The element-wise mean of the usable word vectors, or a zero vector of
        length ``embedding_size`` when no word contributes one.
    """
    expected_shape = (embedding_size,)
    known = (embedding_model[token] for token in text.split() if token in embedding_model)
    usable = [vector for vector in known if vector.shape == expected_shape]

    if usable:
        return np.mean(usable, axis=0)
    return np.zeros(embedding_size)

# One averaged GloVe vector per training row, stacked into a 2-D array for
# each of the three cleaned text columns.
name_vec_glove = np.array(
    [average_glove_embedding(title, glove) for title in train['preprocessed_name']]
)
authors_vec_glove = np.array(
    [average_glove_embedding(authors, glove) for authors in train['preprocessed_authors']]
)
desc_vec_glove = np.array(
    [average_glove_embedding(blurb, glove) for blurb in train['preprocessed_description']]
)


In [None]:
def reduce_dimensionality_PCA(embeddings, n_components=2, random_state=None):
    """Project dense embeddings onto their leading principal components.

    Args:
        embeddings: Array-like feature matrix accepted by ``PCA.fit_transform``.
        n_components: Number of components to keep.
        random_state: Forwarded to ``PCA``; pass an int to make the projection
            repeatable when a randomized solver is selected. The default
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        The transformed data with ``n_components`` columns.
    """
    pca = PCA(n_components=n_components, random_state=random_state)
    return pca.fit_transform(embeddings)


def reduce_dimensionality_SVD(embeddings, n_components=2, random_state=None):
    """Project (possibly sparse) features with truncated SVD.

    Args:
        embeddings: Feature matrix accepted by ``TruncatedSVD.fit_transform``.
        n_components: Number of components to keep.
        random_state: Forwarded to ``TruncatedSVD``, whose default solver is
            randomized; pass an int to make the projection repeatable. The
            default ``None`` keeps the previous unseeded behaviour.

    Returns:
        The transformed data with ``n_components`` columns.
    """
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    return svd.fit_transform(embeddings)

# Bag-of-Words count vectors, stored as SciPy sparse matrices
name_vec_bow = scipy.sparse.load_npz(
    'project_data_files/book_text_features_countvec/train_name_vec.npz'
)
authors_vec_bow = scipy.sparse.load_npz(
    'project_data_files/book_text_features_countvec/train_authors_vec.npz'
)
desc_vec_bow = scipy.sparse.load_npz(
    'project_data_files/book_text_features_countvec/train_desc_vec.npz'
)

# Doc2Vec vectors, stored as header-less CSV tables
name_doc2vec = pd.read_csv(
    "project_data_files/book_text_features_doc2vec/train_name_doc2vec100.csv",
    header=None,
    delimiter=',',
    index_col=False,
)
authors_doc2vec = pd.read_csv(
    "project_data_files/book_text_features_doc2vec/train_authors_doc2vec20.csv",
    header=None,
    delimiter=',',
    index_col=False,
)
desc_doc2vec = pd.read_csv(
    "project_data_files/book_text_features_doc2vec/train_desc_doc2vec100.csv",
    header=None,
    delimiter=',',
    index_col=False,
)

# 2-D projections for plotting: truncated SVD for the sparse count vectors,
# PCA for the dense Doc2Vec and GloVe representations.
name_bow_2d = reduce_dimensionality_SVD(name_vec_bow)
authors_bow_2d = reduce_dimensionality_SVD(authors_vec_bow)
desc_bow_2d = reduce_dimensionality_SVD(desc_vec_bow)

name_d2v_2d = reduce_dimensionality_PCA(name_doc2vec)
authors_d2v_2d = reduce_dimensionality_PCA(authors_doc2vec)
desc_d2v_2d = reduce_dimensionality_PCA(desc_doc2vec)

name_glove_2d = reduce_dimensionality_PCA(name_vec_glove)
authors_glove_2d = reduce_dimensionality_PCA(authors_vec_glove)
desc_glove_2d = reduce_dimensionality_PCA(desc_vec_glove)


In [None]:
def plot_embeddings(dataset_reduced, title):
    """Scatter-plot a 2-D projection of the training rows, coloured by rating.

    Reads the module-level ``high_rating``, ``med_rating`` and ``low_rating``
    row indexers to pick the rows belonging to each rating label.

    Args:
        dataset_reduced: 2-D array whose first two columns are the x and y
            coordinates of each training row.
        title: Title shown above the figure.
    """
    # Same drawing order as before (5, then 4, then 3), so overlapping points
    # are layered identically.
    for rows, colour, label in (
        (high_rating, 'green', '5'),
        (med_rating, 'blue', '4'),
        (low_rating, 'red', '3'),
    ):
        plt.scatter(dataset_reduced[rows, 0], dataset_reduced[rows, 1],
                    c=colour, s=1, label=label)

    plt.title(title)
    # The labels passed to scatter() are only displayed once a legend is
    # drawn; markerscale enlarges the s=1 dots so the legend keys are visible.
    plt.legend(markerscale=8)
    plt.show()

# Draw the nine projections: one figure per (text field, embedding method)
# pair, in the order name -> authors -> description.
for _title, _projection in (
    ('Name - Bag of Words', name_bow_2d),
    ('Name - Doc2Vec', name_d2v_2d),
    ('Name - GloVe', name_glove_2d),
    ('Authors - Bag of Words', authors_bow_2d),
    ('Authors - Doc2Vec', authors_d2v_2d),
    ('Authors - GloVe', authors_glove_2d),
    ('Description - Bag of Words', desc_bow_2d),
    ('Description - Doc2Vec', desc_d2v_2d),
    ('Description - GloVe', desc_glove_2d),
):
    plot_embeddings(_projection, _title)


# **Preprocessing**

In [None]:
# Doc2Vec vectors for the training rows (header-less CSV tables)
name_doc2vec = pd.read_csv(
    "project_data_files/book_text_features_doc2vec/train_name_doc2vec100.csv",
    header=None,
    delimiter=',',
    index_col=False,
)
authors_doc2vec = pd.read_csv(
    "project_data_files/book_text_features_doc2vec/train_authors_doc2vec20.csv",
    header=None,
    delimiter=',',
    index_col=False,
)
desc_doc2vec = pd.read_csv(
    "project_data_files/book_text_features_doc2vec/train_desc_doc2vec100.csv",
    header=None,
    delimiter=',',
    index_col=False,
)

# Plain NumPy arrays so they can be stacked with the other feature blocks
name_doc2vec_matrix = name_doc2vec.to_numpy()
authors_doc2vec_matrix = authors_doc2vec.to_numpy()
desc_doc2vec_matrix = desc_doc2vec.to_numpy()

In [None]:
# Replace every missing cell with an empty string, so a missing Publisher
# becomes its own '' category.
# NOTE(review): this would also turn NaNs in the numeric columns into '' and
# break scaling - presumably those columns have no missing values; confirm.
train = train.fillna('')

# Numeric features, standardised with statistics learned on the training rows
numeric_features = train[['PublishYear', 'PublishMonth', 'PublishDay', 'pagesNumber']]
scaler = StandardScaler().fit(numeric_features)
scaled_numeric_features = scaler.transform(numeric_features)

# One-hot encode Publisher (the only categorical column used); categories not
# seen during fitting are ignored at transform time.
cat_features = train[['Publisher']]
encoder = OneHotEncoder(handle_unknown='ignore').fit(cat_features)
encoded_cat_features = encoder.transform(cat_features)


In [None]:
# Labels the models are trained to predict
target = train['rating_label']

# Column-wise concatenation of every feature block into one sparse matrix:
# scaled numerics, one-hot Publisher, then the three Doc2Vec blocks.
preprocessed_train_data_combined = hstack([
    scaled_numeric_features,
    encoded_cat_features,
    name_doc2vec_matrix,
    authors_doc2vec_matrix,
    desc_doc2vec_matrix,
])

In [None]:
# Convert the combined sparse matrix to a dense NumPy array; the one-hot
# Publisher columns are materialised in full here.
train_data = preprocessed_train_data_combined.toarray()


# **Splitting the dataset into training and validation sets**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    train_data,
    target,
    test_size=0.2,
    random_state=42,
)


# **Preprocess for test data**

In [None]:
# Doc2Vec vectors for the test rows (header-less CSV tables)
name_doc2vec_test = pd.read_csv(
    "project_data_files/book_text_features_doc2vec/test_name_doc2vec100.csv",
    header=None,
    delimiter=',',
    index_col=False,
)
authors_doc2vec_test = pd.read_csv(
    "project_data_files/book_text_features_doc2vec/test_authors_doc2vec20.csv",
    header=None,
    delimiter=',',
    index_col=False,
)
desc_doc2vec_test = pd.read_csv(
    "project_data_files/book_text_features_doc2vec/test_desc_doc2vec100.csv",
    header=None,
    delimiter=',',
    index_col=False,
)

# Same missing-value handling as the training table
test = test.fillna('')

# Dense NumPy arrays of the Doc2Vec tables
name_doc2vec_matrix_test = name_doc2vec_test.to_numpy()
authors_doc2vec_matrix_test = authors_doc2vec_test.to_numpy()
desc_doc2vec_matrix_test = desc_doc2vec_test.to_numpy()

# Reuse the scaler and encoder fitted on the training rows (transform only)
numeric_features_test = test[['PublishYear', 'PublishMonth', 'PublishDay', 'pagesNumber']]
scaled_numeric_features_test = scaler.transform(numeric_features_test)

cat_features_test = test[['Publisher']]
encoded_cat_features_test = encoder.transform(cat_features_test)

# Stack the blocks in the same column order as the training matrix
preprocessed_test_data_combined = hstack([
    scaled_numeric_features_test,
    encoded_cat_features_test,
    name_doc2vec_matrix_test,
    authors_doc2vec_matrix_test,
    desc_doc2vec_matrix_test,
])

test_data = preprocessed_test_data_combined.toarray()


# **Zero R baseline**

In [None]:
# Zero-R baseline: predict 4.0 (presumably the most frequent training label -
# confirm) for every test row. The ids are derived from the test table rather
# than a hard-coded row count, so the file always has one row per test book.
# NOTE(review): this file uses the header 'Id' while the model submissions use
# 'ID' - confirm which one the grader expects.
ids = list(range(1, len(test) + 1))

# predictions are 4.0 for all rows
predictions = [4.0] * len(ids)

# Create a DataFrame
submission = pd.DataFrame({
    'Id': ids,
    'rating_label': predictions
})

# Write the DataFrame to a CSV file
submission.to_csv('results/zero-r baseline.csv', index=False)


# **LogisticRegression base**

In [None]:
# Baseline logistic regression (defaults apart from a higher iteration cap)
logistic_model = LogisticRegression(max_iter=1000)

# Mean accuracy over 5-fold cross-validation on the training split
scores = cross_val_score(estimator=logistic_model, X=x_train, y=y_train, cv=5)
print("Cross-validated accuracy:", scores.mean())

# Refit on the whole training split, then label the held-out validation rows
y_pred = logistic_model.fit(x_train, y_train).predict(x_val)

# Validation accuracy
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Validation set accuracy:", accuracy)

# Per-class precision / recall / F1 on the validation rows
report = classification_report(y_true=y_val, y_pred=y_pred)
print("Classification report:\n", report)

## **Tune hyperparameters**

In [None]:
# Search space: 20 log-spaced values of C (inverse regularisation strength)
# crossed with L1 / L2 penalties; liblinear supports both penalties.
param_grid = dict(
    C=np.logspace(-4, 4, 20),
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

# Estimator to tune
logreg = LogisticRegression(max_iter=1000)

# Exhaustive 5-fold search scored by accuracy, using all available cores
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

# Winning setting and its mean cross-validated accuracy
print("Best hyperparameters:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)

# Model refitted on the training split with the winning setting
best_model = grid_search.best_estimator_

# Score it on the held-out validation rows
y_pred = best_model.predict(x_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Accuracy on validation set:", accuracy)
report = classification_report(y_true=y_val, y_pred=y_pred)
print("Classification report:\n", report)


In [None]:
# Raw counts, with the label order pinned so the matrix is always 3x3 and
# lines up with the display labels below even if a class is missing from the
# validation rows or the predictions.
cm = confusion_matrix(y_val, y_pred, labels=[3.0, 4.0, 5.0])
# Normalise by row (actual classes); normalize='true' leaves an all-zero row
# at 0 instead of producing NaNs from a manual division.
cm_norm = confusion_matrix(y_val, y_pred, labels=[3.0, 4.0, 5.0], normalize='true')

disp = ConfusionMatrixDisplay(confusion_matrix=cm_norm,
                              display_labels=['3.0', '4.0', '5.0'])
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
# Final logistic regression with the tuned setting. max_iter matches the value
# used during the grid search so the submitted model is configured the same
# way as the one that was tuned (the liblinear default is only 100 iterations).
best_MLR = LogisticRegression(
    C=0.615848211066026, penalty='l2', solver='liblinear', max_iter=1000
)
# Train on training + validation rows before labelling the test set
best_MLR.fit(np.concatenate([x_train, x_val]), np.concatenate([y_train, y_val]))
final_p = best_MLR.predict(test_data)
# Round to whole ratings but keep the float dtype (3.0 / 4.0 / 5.0)
predictions_int = np.round(final_p).astype(float)

# Create a DataFrame from the prediction array
predictions_df = pd.DataFrame(predictions_int, columns=['rating_label'])

# Add a 1-based ID column as the first column
predictions_df.insert(0, 'ID', range(1, len(predictions_df) + 1))
predictions_df.to_csv("results/MLR.csv", index = False)

# **RandomForest**

## **randomForest base**

In [None]:
# Baseline forest: 100 trees, seeded for repeatability
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on the training split and label the validation rows
y_pred = rf.fit(x_train, y_train).predict(x_val)

# Validation accuracy
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1
report = classification_report(y_true=y_val, y_pred=y_pred)
print("Classification report:\n", report)


## **randomForest + grid search**

In [None]:
# Search space: number of trees crossed with the per-split feature rule
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_features=['sqrt', 'log2'],
)

# Estimator to tune
rf = RandomForestClassifier()

# Exhaustive 5-fold search over the grid, fitted on the training split
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
grid_search.fit(x_train, y_train)

# Refitted winner and the setting that produced it
best_rf = grid_search.best_estimator_
best_params_rf = grid_search.best_params_
print("Best parameters: ", best_params_rf)

# Score the winner on the held-out validation rows
y_pred = best_rf.predict(x_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Accuracy on validation set:", accuracy)

report = classification_report(y_true=y_val, y_pred=y_pred)
print("Classification report:\n", report)


In [None]:
# Tabulate fit time, parameter setting and mean CV score for every grid point
results = pd.DataFrame(grid_search.cv_results_)
subset = results.loc[:, ['mean_fit_time', 'params', 'mean_test_score']]
subset

In [None]:
# Raw counts, with the label order pinned so the matrix is always 3x3 and
# lines up with the display labels below even if a class is missing from the
# validation rows or the predictions.
cm = confusion_matrix(y_val, y_pred, labels=[3.0, 4.0, 5.0])
# Normalise by row (actual classes); normalize='true' leaves an all-zero row
# at 0 instead of producing NaNs from a manual division.
cm_norm = confusion_matrix(y_val, y_pred, labels=[3.0, 4.0, 5.0], normalize='true')

disp = ConfusionMatrixDisplay(confusion_matrix=cm_norm,
                              display_labels=['3.0', '4.0', '5.0'])
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
# Final forest with the tuned setting. The seed (same value as the baseline
# forest) makes the submission file reproducible from run to run.
best_RF = RandomForestClassifier(max_features='sqrt', n_estimators=200, random_state=42)
# Train on training + validation rows before labelling the test set
best_RF.fit(np.concatenate([x_train, x_val]), np.concatenate([y_train, y_val]))
final_p = best_RF.predict(test_data)
# Round to whole ratings but keep the float dtype (3.0 / 4.0 / 5.0)
predictions_int = np.round(final_p).astype(float)

# Create a DataFrame from the prediction array
predictions_df = pd.DataFrame(predictions_int, columns=['rating_label'])

# Add a 1-based ID column as the first column
predictions_df.insert(0, 'ID', range(1, len(predictions_df) + 1))
predictions_df.to_csv("results/RF.csv", index = False)

# **SVM**

In [None]:
# Support vector classifier with a linear kernel
svm = SVC(kernel='linear')

# Fit on the training split and label the validation rows
y_pred = svm.fit(x_train, y_train).predict(x_val)

# Validation accuracy
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print('Accuracy:', accuracy)

# Per-class precision / recall / F1
report = classification_report(y_true=y_val, y_pred=y_pred)
print("Classification report:\n", report)

In [None]:
# Raw counts, with the label order pinned so the matrix is always 3x3 and
# lines up with the display labels below even if a class is missing from the
# validation rows or the predictions.
cm = confusion_matrix(y_val, y_pred, labels=[3.0, 4.0, 5.0])
# Normalise by row (actual classes); normalize='true' leaves an all-zero row
# at 0 instead of producing NaNs from a manual division.
cm_norm = confusion_matrix(y_val, y_pred, labels=[3.0, 4.0, 5.0], normalize='true')

disp = ConfusionMatrixDisplay(confusion_matrix=cm_norm,
                              display_labels=['3.0', '4.0', '5.0'])
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

# Support vector classifier with a polynomial kernel
svm_polynomial = SVC(kernel='poly')

# Fit on the training split and label the validation rows
y_pred = svm_polynomial.fit(x_train, y_train).predict(x_val)

# Validation accuracy
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print('Accuracy:', accuracy)

# Per-class precision / recall / F1
report = classification_report(y_true=y_val, y_pred=y_pred)
print("Classification report:\n", report)

In [None]:
# Raw counts, with the label order pinned so the matrix is always 3x3 and
# lines up with the display labels below even if a class is missing from the
# validation rows or the predictions.
cm = confusion_matrix(y_val, y_pred, labels=[3.0, 4.0, 5.0])
# Normalise by row (actual classes); normalize='true' leaves an all-zero row
# at 0 instead of producing NaNs from a manual division.
cm_norm = confusion_matrix(y_val, y_pred, labels=[3.0, 4.0, 5.0], normalize='true')

disp = ConfusionMatrixDisplay(confusion_matrix=cm_norm,
                              display_labels=['3.0', '4.0', '5.0'])
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
# Support vector classifier with an RBF kernel
svm_rbf = SVC(kernel='rbf')

# Fit on the training split and label the validation rows
y_pred = svm_rbf.fit(x_train, y_train).predict(x_val)

# Validation accuracy
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print('Accuracy:', accuracy)

# Per-class precision / recall / F1
report = classification_report(y_true=y_val, y_pred=y_pred)
print("Classification report:\n", report)

In [None]:
# Raw counts, with the label order pinned so the matrix is always 3x3 and
# lines up with the display labels below even if a class is missing from the
# validation rows or the predictions.
cm = confusion_matrix(y_val, y_pred, labels=[3.0, 4.0, 5.0])
# Normalise by row (actual classes); normalize='true' leaves an all-zero row
# at 0 instead of producing NaNs from a manual division.
cm_norm = confusion_matrix(y_val, y_pred, labels=[3.0, 4.0, 5.0], normalize='true')

disp = ConfusionMatrixDisplay(confusion_matrix=cm_norm,
                              display_labels=['3.0', '4.0', '5.0'])
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
# Candidate values for the regularisation parameter C
param_grid = dict(C=[0.1, 1, 10, 100, 1000])

# RBF-kernel classifier to tune
rbfsvm = SVC(kernel='rbf')

# 5-fold exhaustive search; verbose=2 logs each fit and refit=True retrains
# the winner on the whole training split.
grid = GridSearchCV(
    estimator=rbfsvm,
    param_grid=param_grid,
    refit=True,
    verbose=2,
    cv=5,
)
grid.fit(x_train, y_train)

# Winning setting and the corresponding refitted estimator
print(grid.best_params_)
print(grid.best_estimator_)

# Label the validation rows with the refitted winner
grid_predictions = grid.predict(x_val)

# Validation accuracy
accuracy = accuracy_score(y_true=y_val, y_pred=grid_predictions)
print('Accuracy:', accuracy)

# Per-class precision / recall / F1
report = classification_report(y_true=y_val, y_pred=grid_predictions)
print("Classification report:\n", report)

In [None]:
# Final RBF SVM, trained on training + validation rows
best_svm = SVC(C=1, kernel='rbf')
best_svm.fit(
    np.concatenate((x_train, x_val)),
    np.concatenate((y_train, y_val)),
)
final_p = best_svm.predict(test_data)

# Round to whole ratings while keeping the float dtype
predictions_int = np.round(final_p).astype(float)

# Submission table: 1-based ID column followed by the predicted label
predictions_df = pd.DataFrame({
    'ID': range(1, len(predictions_int) + 1),
    'rating_label': predictions_int,
})
predictions_df.to_csv("results/svm.csv", index=False)

# **Stacking**

In [None]:
# Base learners: the RBF SVM and random forest settings used for the
# stand-alone submissions
level0 = [
    ('svm', SVC(C=1, kernel='rbf')),
    ('rf', RandomForestClassifier(max_features= 'sqrt', n_estimators= 200)),
]

# Meta-learner that combines the base learners' outputs
level1 = LogisticRegression(C=0.615848211066026, penalty='l2', solver='liblinear')
stackmodel = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

# Fit on the training split, then label the validation rows
y_pred = stackmodel.fit(x_train, y_train).predict(x_val)

# Validation accuracy
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print('Accuracy:', accuracy)

# Per-class precision / recall / F1
report = classification_report(y_true=y_val, y_pred=y_pred)
print("Classification report:\n", report)

In [None]:
# Refit the stack on training + validation rows, then label the test set
stackmodel.fit(
    np.concatenate((x_train, x_val)),
    np.concatenate((y_train, y_val)),
)
final_p = stackmodel.predict(test_data)

# Round to whole ratings while keeping the float dtype
predictions_int = np.round(final_p).astype(float)

# Submission table: 1-based ID column followed by the predicted label
predictions_df = pd.DataFrame({
    'ID': range(1, len(predictions_int) + 1),
    'rating_label': predictions_int,
})
predictions_df.to_csv("results/stackmodel.csv", index=False)

# **AutoML (for comparison only; not counted as one of our models)**

In [None]:
import autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
# This cell trains the AutoML model; the authors note that a pre-trained copy
# has been saved in /AutogluonModels.
# NOTE(review): the two assignments below rebind `train_data` / `test_data`,
# which earlier cells use for the dense feature arrays. Re-running an earlier
# model cell after this one would see the raw tables instead.
train_data = TabularDataset('project_data_files/book_rating_train.csv')
test_data = TabularDataset('project_data_files/book_rating_test.csv')

# Fit AutoGluon on the raw training table with 'rating_label' as the target,
# using the 'best_quality' preset.
predictor = TabularPredictor(label='rating_label').fit(train_data=train_data, presets='best_quality')

In [None]:
# Label the test table with the fitted AutoGluon predictor. Without this call
# `predictions` still holds the constant list built for the Zero-R baseline,
# so automl.csv would simply repeat that baseline.
predictions = predictor.predict(test_data)

# Round to whole ratings while keeping the float dtype; np.asarray drops the
# pandas index/name so the DataFrame below is built purely by position.
predictions_int = np.round(np.asarray(predictions, dtype=float)).astype(float)

# Create a DataFrame from the prediction array
predictions_df = pd.DataFrame(predictions_int, columns=['rating_label'])

# Add a 1-based ID column as the first column
predictions_df.insert(0, 'ID', range(1, len(predictions_df) + 1))
predictions_df.to_csv("results/automl.csv", index = False)