In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

In [6]:
# Data loading
# Training portion of the Goodreads data, read from the local input directory.
train = pd.read_csv(
    filepath_or_buffer="input/goodreads_train.csv",
)

# Test portion of the Goodreads data, read from the same directory.
test = pd.read_csv(
    filepath_or_buffer="input/goodreads_test.csv",
)

# NOTE: a validation/test split was drafted at this point but is disabled;
# it referenced X_temp / y_temp, which this cell never defines.

In [None]:


# Train a bag-of-words Naive Bayes classifier on review text, tune it with a
# cross-validated grid search, and report accuracy on held-out data.
#
# Relies on `train`, the DataFrame read from "input/goodreads_train.csv" in the
# data-handling cell. The separately loaded `test` frame is not used here: the
# X_test / y_test below are carved out of `train`.

# Column names follow the split that was previously commented out in this cell.
# NOTE(review): these names are assumed, not verified — confirm against
# train.columns and adjust if the file uses different headers.
TEXT_COLUMN = "review"
LABEL_COLUMN = "rating"
RANDOM_STATE = 42  # shared seed so both splits are reproducible

# Fail fast with a readable message instead of a bare KeyError deep in pandas.
missing_columns = [col for col in (TEXT_COLUMN, LABEL_COLUMN) if col not in train.columns]
if missing_columns:
    raise KeyError(
        f"Columns {missing_columns} not found in train; available: {list(train.columns)}"
    )

# CountVectorizer cannot tokenize missing values, so drop incomplete rows first.
labelled = train.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])

# Splitting the dataset into training (60%), validation (20%), and testing (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(
    labelled[TEXT_COLUMN], labelled[LABEL_COLUMN], test_size=0.4, random_state=RANDOM_STATE
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE
)

# Setting up the pipeline: token counts feeding a multinomial Naive Bayes model
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())
])

# Parameters for Grid Search (keys are "<pipeline step>__<parameter>")
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only, or unigrams + bigrams
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'clf__alpha': (1e-2, 1e-3)
}

# Performing grid search on the TRAINING split. GridSearchCV cross-validates
# internally and then refits the best configuration on everything it was given,
# so the validation and test splits stay unseen by the model.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters set
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print(f"\t{param_name}: {best_parameters[param_name]}")

# The refit pipeline with the winning hyper-parameters
best_model = grid_search.best_estimator_

# Sanity check on the validation split before touching the test split
val_accuracy = accuracy_score(y_val, best_model.predict(X_val))
print("Accuracy on Validation Data:", val_accuracy)

# Applying the best model to the test data
y_pred = best_model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy on Test Data:", accuracy)
print("\nClassification Report on Test Data:\n", report)