In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Load the dataset
data = pd.read_csv('/PJ2.csv')  # Replace with the correct path to your dataset

# Selecting the relevant features and target
X = data[['total_votes']]  # Feature matrix containing only the 'total_votes' as input
y = data['star_rating']    # Target variable containing star ratings

# Handling potential missing values just in case
X.fillna(X.mean(), inplace=True)  # Fill missing X values with the mean of the column
y.fillna(method='ffill', inplace=True)  # Forward fill to replace missing y values

# Convert labels to categorical (one-hot encoding)
y = to_categorical(y - 1)  # Subtract 1 to shift labels to 0-index for to_categorical function

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()  # Initialize the StandardScaler
X_train = scaler.fit_transform(X_train)  # Fit and transform training data
X_test = scaler.transform(X_test)  # Transform testing data based on training data scaling

# Model configuration
model = Sequential([
    Dense(64, input_dim=1, activation='relu'),  # Input layer with 64 neurons, takes 1 feature
    Dense(64, activation='relu'),  # Hidden layer with 64 neurons, relu activation
    Dense(5, activation='softmax')  # Output layer with 5 neurons (one for each class), softmax for multi-class classification
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # Using adam optimizer and accuracy as the metric

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=10, verbose=1, validation_split=0.1)  # Training the model with 50 epochs and 10 samples per batch


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.mean(), inplace=True)  # Fill missing X values with the mean of the column
  y.fillna(method='ffill', inplace=True)  # Forward fill to replace missing y values
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m2221/2221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 798us/step - accuracy: 0.6513 - loss: 1.0852 - val_accuracy: 0.6592 - val_loss: 0.9905
Epoch 2/50
[1m2221/2221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 737us/step - accuracy: 0.6582 - loss: 0.9963 - val_accuracy: 0.6596 - val_loss: 0.9943
Epoch 3/50
[1m2221/2221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 758us/step - accuracy: 0.6623 - loss: 0.9904 - val_accuracy: 0.6596 - val_loss: 0.9986
Epoch 4/50
[1m2221/2221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 750us/step - accuracy: 0.6613 - loss: 0.9898 - val_accuracy: 0.6596 - val_loss: 0.9886
Epoch 5/50
[1m2221/2221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 751us/step - accuracy: 0.6604 - loss: 0.9918 - val_accuracy: 0.6613 - val_loss: 0.9900
Epoch 6/50
[1m2221/2221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 735us/step - accuracy: 0.6556 - loss: 1.0043 - val_accuracy: 0.6617 - val_loss: 0.9902
Epoc

<keras.src.callbacks.history.History at 0x1e451367c50>

In [3]:

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)  # Evaluate model performance on the test set
print(f'Test Accuracy: {accuracy*100:.2f}%')  # Print the test accuracy percentage

# Predicting from the model (optional step)
predictions = model.predict(X_test)  # Generate predictions for the test set
predicted_classes = predictions.argmax(axis=1) + 1  # Find class with highest probability, adjust indexing

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Model Evaluation Repeated
loss, accuracy = model.evaluate(X_test, y_test)  # Evaluate the model again for demonstration
print(f'Test Accuracy: {accuracy:.2f}')  # Print the accuracy

# Execute predictions again for clarity
predictions = model.predict(X_test)  # Redo predictions for clarity
predicted_classes = predictions.argmax(axis=1) + 1  # Recalculate predicted classes

# Output evaluation metrics
print("Classification Report:")
print(classification_report(y_test.argmax(axis=1) + 1, predicted_classes))  # Classification report for further model evaluation

# Output confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test.argmax(axis=1) + 1, predicted_classes))  # Confusion matrix to see model performance per class


[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 656us/step - accuracy: 0.6631 - loss: 0.9768
Test Accuracy: 65.79%
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 735us/step
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 609us/step - accuracy: 0.6631 - loss: 0.9768
Test Accuracy: 0.66
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 620us/step
Classification Report:
              precision    recall  f1-score   support

           1       0.36      0.37      0.37       318
           2       0.00      0.00      0.00       218
           3       0.00      0.00      0.00       454
           4       0.00      0.00      0.00      1158
           5       0.67      0.98      0.80      4022

    accuracy                           0.66      6170
   macro avg       0.21      0.27      0.23      6170
weighted avg       0.46      0.66      0.54      6170

Confusion Matrix:
[[ 119    0    0    0  199]
 [  51    0    0    0  1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
