# Load Data (Init)

In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score

import joblib


all_features = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi']
selected_features = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'mobile_wt', 'px_height', 'px_width', 'ram']
selected_features = ['battery_power', 'px_height', 'px_width', 'ram']
target = 'price_range'

In [4]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

def get_normalized_df(data, selected_columns, train=False):
    selected_data = data[selected_columns]
    normalized_data=None
    if (train):
      print("creating new scaler...")
      normalized_data = scaler.fit_transform(selected_data)
    else:
      print("using old scaler...")
      normalized_data = scaler.transform(selected_data)

    normalized_df = pd.DataFrame(normalized_data, columns=selected_columns)

    remaining_columns = data.drop(columns=selected_columns)
    result_df = pd.concat([normalized_df, remaining_columns], axis=1)

    return result_df

# Load the training data
data_train_path = '/content/data_train.csv'
data_train = pd.read_csv(data_train_path)
# data_train = discretize_features(data_train, selected_features, n_bins = 5, train = True)
data_train = get_normalized_df(data_train, selected_features, train=True)
data_validation_path = '/content/data_validation.csv'
data_validation = pd.read_csv(data_validation_path)
# data_validation = discretize_features(data_validation, selected_features, n_bins = 5)
data_validation = get_normalized_df(data_validation, selected_features)
data_test = '/content/test.csv'
data_test = pd.read_csv(data_test)
# data_test = discretize_features(data_test, selected_features, n_bins = 5)
data_test = get_normalized_df(data_test, selected_features)

creating new scaler...
using old scaler...
using old scaler...


# Naive Bayes Manual

In [None]:
import numpy as np

class GaussianNaiveBayes:
    def __init__(self):
        self.class_labels = None
        self.class_priors = None
        self.class_means = None
        self.class_variances = None
        self.gaussian_constant = None

    def fit(self, X, y):
        self.class_labels = np.unique(y)
        self.class_priors = np.zeros(len(self.class_labels))
        self.class_means = np.zeros((len(self.class_labels), X.shape[1]))
        self.class_variances = np.zeros((len(self.class_labels), X.shape[1]))

        for i, label in enumerate(self.class_labels):
            X_class = X[y == label]
            # Hitung P(y)
            self.class_priors[i] = X_class.shape[0] / X.shape[0]
            # Hitung Mean dari tiap kategori di kelas tersebut
            self.class_means[i, :] = np.mean(X_class, axis=0)
            # Hitung variance dari tiap feature di kelas tsb
            self.class_variances[i, :] = np.var(X_class, axis=0)

        self.gaussian_constant = (1/np.sqrt(2*(np.pi)*self.class_variances))

    def predict(self, X):
      predictions = []
      for values in X.values:
        predictions.append(self.predict_single(values))
      return predictions

    def predict_single(self, X):
        PX_y = self.gaussian_probability(X)
        PX_y = np.prod(PX_y,axis=1)
        Py_X = PX_y*self.class_priors
        return np.argmax(Py_X)

    def gaussian_probability(self, x):
      mean = self.class_means
      variances = self.class_variances
      b = np.exp(-np.square(x-mean)/(2*variances))
      return self.gaussian_constant*b

    def save(self, path):
      joblib.dump(self, path)

    def load(self, path):
      loaded_model = joblib.load(path)
      self.class_labels = loaded_model.class_labels
      self.class_priors = loaded_model.class_priors
      self.class_means = loaded_model.class_means
      self.class_variances = loaded_model.class_variances
      self.gaussian_constant = loaded_model.gaussian_constant


In [None]:
import pandas as pd
from sklearn.metrics import classification_report

import joblib

X_train = data_train[selected_features]
y_train = data_train['price_range']

X_validation = data_validation[selected_features]
y_validation = data_validation['price_range']

model = GaussianNaiveBayes()
model.fit(X_train, y_train)

predictions = model.predict(X_validation)

classification_rep = classification_report(predictions, data_validation['price_range'])
print("Classification Report:")
print(classification_rep)

# Extract accuracy using accuracy_score
accuracy = accuracy_score(data_validation['price_range'], predictions)
print(f"Accuracy: {accuracy:.4f}")

# Save the model
filename = f'naivebayes_ver{accuracy:.4f}.pkl'
model.save(filename)

#Naive Bayes Library

In [25]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB

# Assuming 'price_range' is the target variable
X_train = data_train[selected_features]
y_train = data_train['price_range']

X_validation = data_validation[selected_features]
y_validation = data_validation['price_range']

# Create and train the Gaussian Naive Bayes classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Make predictions on the validation set
validation_predictions = nb_classifier.predict(X_validation)

# Evaluate the model
accuracy = accuracy_score(y_validation, validation_predictions)
print(f"Accuracy on validation set: {accuracy:.4f}")

classification_rep = classification_report(y_validation, validation_predictions)
print("Classification Report:")
print(classification_rep)

Accuracy on validation set: 0.7850
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.89      0.88       142
           1       0.67      0.65      0.66       144
           2       0.68      0.72      0.70       155
           3       0.91      0.88      0.89       159

    accuracy                           0.79       600
   macro avg       0.79      0.78      0.78       600
weighted avg       0.79      0.79      0.79       600



#Kaggle

In [27]:
# Load the test data
test_df = data_test

model_updated = GaussianNaiveBayes()
model_updated.load(filename)

X_test = test_df[selected_features]
predicted_price_ranges = model_updated.predict(X_test)

submission_df = pd.DataFrame({
    'id': test_df['id'],
    'price_range': predicted_price_ranges
})

submission_file_path = "/content/submission.csv"
submission_df.to_csv(submission_file_path, index=False)

print(f"Submission file saved to {submission_file_path}")

Submission file saved to /content/submission.csv


### Compare with previous submission

In [18]:
import pandas as pd

path_to_submission_before = '/content/submission_CATBOST.csv'
path_to_submission = '/content/submission.csv'

submission_before = pd.read_csv(path_to_submission_before)
submission = pd.read_csv(path_to_submission)

if submission_before['id'].equals(submission['id']):

    differences = (submission_before['price_range'] != submission['price_range']).sum()
    total = len(submission_before)
    percentage_difference = (differences / total) * 100

    print(f'Total changes: {differences} out of {total} predictions ({percentage_difference:.2f}%)')
else:
    print('The IDs in both submission files do not match, or they are not in the same order.')

FileNotFoundError: ignored