# Tubes 2 AI


# Anggota Kelompok
## 13521087 Razzan Daksana Yoni
## 13521089 Kenneth Ezekiel Suprantoni
## 13521095 Muhammad Aji Wibisono
## 13521101 Arsa Izdihar Islam

In [77]:
from abc import ABC, abstractmethod
import pickle
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_classif
import math
from collections import Counter
import pandas as pd

In [78]:
# Data Preparation

df = pd.read_csv("data/data_train.csv")

# Numeric Attributes
numerical_columns = [
    'battery_power',
    'clock_speed',
    'fc',
    'int_memory',
    'm_dep',
    'mobile_wt',
    'n_cores',
    'pc',
    'px_height',
    'px_width',
    'ram',
    'sc_h',
    'sc_w',
    'talk_time'
]

# Ordinal Attributes
ordinal_columns = [
    'price_range'
]

# Nominal Attributes
nominal_columns = [c for c in df.columns if c not in numerical_columns and c not in ordinal_columns]

print(df['price_range'].unique())

[1 2 0 3]


In [79]:
nominal_columns

['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']

In [80]:
# check for nominal columns (boolean columns) using contingency table
from scipy.stats import chi2_contingency

def show_chi_contingency(col):
    print("column:", col)
    contingency = pd.crosstab(index=df[col], columns=df['price_range'], normalize='index')
    res = chi2_contingency(contingency)
    print("p =", res[1])
    return contingency

In [81]:
show_chi_contingency('blue')

column: blue
p = 0.9999306682714457


price_range,0,1,2,3
blue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.265162,0.265162,0.242595,0.22708
1,0.24602,0.243126,0.250362,0.260492


In [82]:
show_chi_contingency('dual_sim')

column: dual_sim
p = 0.9999360384691366


price_range,0,1,2,3
dual_sim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24569,0.25431,0.264368,0.235632
1,0.265625,0.254261,0.228693,0.25142


In [83]:
show_chi_contingency('four_g')

column: four_g
p = 0.9999600883636923


price_range,0,1,2,3
four_g,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.253799,0.24924,0.262918,0.234043
1,0.257412,0.25876,0.231806,0.252022


In [84]:
show_chi_contingency('three_g')

column: three_g
p = 0.9997915321532349


price_range,0,1,2,3
three_g,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.295522,0.223881,0.241791,0.238806
1,0.243192,0.26385,0.247887,0.24507


In [85]:
show_chi_contingency('touch_screen')

column: touch_screen
p = 0.99993024036726


price_range,0,1,2,3
touch_screen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.25035,0.239161,0.26014,0.25035
1,0.261314,0.270073,0.232117,0.236496


In [86]:
show_chi_contingency('wifi')

column: wifi
p = 0.9999157156926507


price_range,0,1,2,3
wifi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.263083,0.257426,0.256011,0.223479
1,0.248196,0.251082,0.236652,0.264069


In [87]:
from itertools import product
# ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
boolean_columns = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
combinations = list(product([True, False], repeat=len(boolean_columns)))

for combination in combinations:
    combination_columns = []
    for i, column in enumerate(combination):
        if column:
            combination_columns.append(boolean_columns[i])
            # result_df 
    df['combined_booleans'] = df[combination_columns].all(axis=1)
    print(combination_columns)
    show_chi_contingency('combined_booleans')

df = df.drop('combined_booleans', axis=1)

['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
column: combined_booleans
p = 0.990209773843712
['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen']
column: combined_booleans
p = 0.996055810961424
['blue', 'dual_sim', 'four_g', 'three_g', 'wifi']
column: combined_booleans
p = 0.9969576817681661
['blue', 'dual_sim', 'four_g', 'three_g']
column: combined_booleans
p = 0.9995504359172259
['blue', 'dual_sim', 'four_g', 'touch_screen', 'wifi']
column: combined_booleans
p = 0.990209773843712
['blue', 'dual_sim', 'four_g', 'touch_screen']
column: combined_booleans
p = 0.996055810961424
['blue', 'dual_sim', 'four_g', 'wifi']
column: combined_booleans
p = 0.9969576817681661
['blue', 'dual_sim', 'four_g']
column: combined_booleans
p = 0.9995504359172259
['blue', 'dual_sim', 'three_g', 'touch_screen', 'wifi']
column: combined_booleans
p = 0.9951136068203249
['blue', 'dual_sim', 'three_g', 'touch_screen']
column: combined_booleans
p = 0.9990435989859693
['blue', 'dual_sim', '

Berdasarkan test menggunakan chi squared tersebut terhadap fitur-fitur yang bertipe boolean, nilai p dari semua fitur lebih dari 0.99 sedangkan fitur yang baik untuk digunakan harusnya memiliki p value sekecil mungkin (sangat bagus apabila < 0.05). Maka dari itu, tidak ada fitur boolean yang dapat digunakan.

In [88]:
# Check for numerical features using anova
# Reference https://datascience.stackexchange.com/questions/74465/how-to-understand-anova-f-for-feature-selection-in-python-sklearn-selectkbest-w

anova_x = df[numerical_columns]
anova_y = df['price_range']

f_scores, p_scores = f_classif(anova_x, anova_y)

for column, f_score, p_score in zip(anova_x.columns, f_scores, p_scores):
    # if(p_score < 0.05):
    # if(p_score >= 0.05):
        print(f"{column}\t\t: p-score = {p_score}")

battery_power		: p-score = 1.199878164181477e-12
clock_speed		: p-score = 0.947345457999534
fc		: p-score = 0.9844593072179413
int_memory		: p-score = 0.02193652500628816
m_dep		: p-score = 0.4192414002698195
mobile_wt		: p-score = 0.0024343700642573767
n_cores		: p-score = 0.3031233402282094
pc		: p-score = 0.9475768095830291
px_height		: p-score = 3.634821713583304e-10
px_width		: p-score = 1.4591634777808455e-11
ram		: p-score = 0.0
sc_h		: p-score = 0.08589662330366395
sc_w		: p-score = 0.20583807634591528
talk_time		: p-score = 0.44294679598239306


Berdasarkan test menggunakan anova coefficient terhadap fitur-fitur yang bertipe numerik, nilai p dari dari fitur-fitur berikut lebih tinggi dari 0.05 dan kurang baik untuk digunakan

`[clock_speed, fc, m_dep, n_cores, pc, sc_h, sc_w, dan talk time]`

Sementara fitur-fitur berikut memiliki nilai p lebih rendah dari 0.05 dan baik untuk digunakan

`[battery_power, int_memory, mobile_wt, px_height, px_width, ram]`

In [89]:
# Detecting correlation between each other
df_high_correlation = df.drop('price_range', axis=1).corr().abs()

high_correlation_pairs = []
high_correlation_threshold = 0.4

for index, row in df_high_correlation.iterrows():
    for column, value in row.items():
        if value > high_correlation_threshold and column > index:
            high_correlation_pairs.append((index, column))

print(f"Pair of features with correlation to each other higher than threshold ({high_correlation_threshold}):")
print(high_correlation_pairs)
df_high_correlation

Pair of features with correlation to each other higher than threshold (0.4):
[('fc', 'pc'), ('four_g', 'three_g'), ('px_height', 'px_width'), ('sc_h', 'sc_w')]


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
battery_power,1.0,0.032533,0.012937,0.042946,0.005774,0.008559,0.016187,0.01701,0.011705,0.017015,0.00327,0.003244,0.019042,0.013218,0.022521,0.022134,0.038071,0.010436,0.023003,0.005806
blue,0.032533,1.0,0.038409,0.047221,0.02445,0.012109,0.045101,0.013229,0.029245,0.044718,0.003586,0.008351,0.030716,0.051306,0.006653,0.016512,0.029351,0.018932,0.02314,0.025846
clock_speed,0.012937,0.038409,1.0,0.008267,0.004902,0.039032,0.005417,0.023744,0.008879,0.011135,0.003518,0.016482,0.012462,0.022962,0.004114,0.020765,0.000566,0.046305,0.014256,0.016047
dual_sim,0.042946,0.047221,0.008267,1.0,0.033578,0.017518,0.02198,0.048199,0.016087,0.021266,0.022957,0.028301,0.016451,0.011901,0.008547,0.003648,0.047028,0.01856,0.001551,0.035774
fc,0.005774,0.02445,0.004902,0.033578,1.0,0.014253,0.041477,0.017144,0.027141,0.006261,0.644348,0.016191,0.050449,0.001307,0.010459,0.00881,0.003483,0.00702,0.004752,0.011708
four_g,0.008559,0.012109,0.039032,0.017518,0.014253,1.0,0.000638,0.037858,0.015844,0.024001,0.016605,0.051457,0.009528,0.000408,0.014643,0.015249,0.045681,0.595575,0.000143,0.026592
int_memory,0.016187,0.045101,0.005417,0.02198,0.041477,0.000638,1.0,0.041084,0.040759,0.041109,0.031596,0.026845,0.003778,0.007183,0.024976,0.005848,0.028856,0.008823,0.057298,0.036528
m_dep,0.01701,0.013229,0.023744,0.048199,0.017144,0.037858,0.041084,1.0,0.053034,0.020967,0.019481,0.008807,0.02339,0.006886,0.021827,0.023862,0.016902,0.031737,0.001893,0.024493
mobile_wt,0.011705,0.029245,0.008879,0.016087,0.027141,0.015844,0.040759,0.053034,1.0,0.006866,0.003947,0.009501,0.026278,0.04608,0.072128,0.026295,0.014507,0.019863,0.01019,0.007817
n_cores,0.017015,0.044718,0.011135,0.021266,0.006261,0.024001,0.041109,0.020967,0.006866,1.0,0.004117,0.006308,0.025477,0.007089,0.005188,0.007084,0.050324,0.003143,0.030234,0.009171


In [90]:
# Detecting correlation with target column
target_column = 'price_range'

# Negative or positive correlation only means the direction of the correlation
# What's important here is the magnitude, hence abs()
# TODO: Tweak threshold
low_correlation_threshold = 0.01

correlations = df.corr()[target_column].abs()
filtered_correlations = correlations[correlations < low_correlation_threshold].sort_values(ascending=True)

print(f"Feature with correlation to target lower than threshold ({low_correlation_threshold}):")
filtered_correlations_string = ', '.join(filtered_correlations.index)
filtered_correlations_string = "[" + filtered_correlations_string + ']'
print(filtered_correlations_string)

Feature with correlation to target lower than threshold (0.01):
[four_g, n_cores, m_dep, fc, pc]


In [91]:
# Selain feature selection, optimize juga yak, misal kaya normalisasi atau standarisasi, bisa cek pake naive bayesnya dulu aja
# Pokoknya biar improve efficiency ajah hehe

# Feature selection for KNN
# List dulu apa aja fiturnya, udah gitu tulis korelasinya disini
# fc sama clock speed jangan dipake (hapus), korelasinya mendekati 0, ilangin aja 
# ram, px_width, px_height, battery_power
# pake semua, hapus 2, pake 4, hapus 2 dan pake 4

# Feature selection for Naive Bayes

# initial template
# numerical_features = [
#     'battery_power',
#     'clock_speed',
#     'fc',
#     'int_memory',
#     'm_dep',
#     'mobile_wt',
#     'n_cores',
#     'pc',
#     'px_height',
#     'px_width',
#     'ram',
#     'sc_h',
#     'sc_w',
#     'talk_time'
# ]

# nominal_features = nominal_columns

In [92]:
### Kesimpulan feature selection
## fitur yang dapat dihilangkan
# Semua nominal feature di drop karena nilai p semua fitur terlalu tinggi
# ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']

# Feature dengan korelasi di bawah threshold
# [n_cores, m_dep, fc, pc]

# Fitur satu sama lain korelasinya cukup tinggi untuk digabung/dihilangkan
# [('px_height', 'px_width')] menjadi 'px_area'
# [('sc_w', 'sc_h')] menjadi 'sc_area'

# Fitur numerik yang bagus berdasarkan anova
# [battery_power, int_memory, mobile_wt, px_height, px_width, ram]
# Yang dapat tidak dipakai
# [clock_speed, fc, m_dep, n_cores, pc, sc_h, sc_w, dan talk time]

In [93]:
# Fitur akhir yang dipilih {TODO: Tweak chosen features}
df = pd.read_csv("data/data_train.csv")
# df['px_area'] = df['px_height'] * df['px_width']
# df['sc_area'] = df['sc_w'] * df['sc_h']

chosen_numerical_features = [
    'battery_power',
    'int_memory',
    'mobile_wt',
    'px_height',
    'px_width',
    'ram',
    
    # yang di bawah ini gak top priority tapi bisa coba untuk tweaking
    # 'clock_speed',
    # 'sc_h',
    # 'sc_w',
    # 'talk_time',
    # 'sc_area',

    # 'px_area', # ini ternyata malah nurunin akurasi
]

# Gak ada
chosen_nominal_features = [

]

In [94]:
# Check for duplicates just to be sure
print(f"Duplicates: {df.duplicated().sum()}")

Duplicates: 0


In [95]:
# Remove outlier {TODO: Tweak n_std}

# Bagusan pake rata - rata atau pake kuartil?
def remove_outliers(df_func,columns,n_std = 3):
    total_outliers = 0
    for col in columns:

        mean = df_func[col].mean()
        sd = df_func[col].std()
        
        top_limit = mean + (n_std * sd)
        bottom_limit = mean - (n_std * sd)

        upper_outliers = np.array(df_func[col] > top_limit)
        lower_outliers = np.array(df_func[col] < bottom_limit)
        total_outliers += upper_outliers.sum() + lower_outliers.sum()
        # print(f"Working on column {col}")
        # print(f"Bounds\t\t: {bottom_limit} and {top_limit}")
        # print(f"upper outliers\t: {upper_outliers.sum()}")
        # print(f"lower outliers\t: {lower_outliers.sum()}")
        # print(f"total outliers\t: {upper_outliers.sum() + lower_outliers.sum()}")
        # print("------------------------------------------")

        df_func = df_func[(df_func[col] >= bottom_limit) &
                          (df_func[col] <= top_limit)]
    print(f"Outliers: {total_outliers}")

    return df_func

df = remove_outliers(df, chosen_numerical_features, 3)

Outliers: 0


In [96]:
# Split data
#def train_test_split(dataframe, split_frac):
#    n_split = int(len(dataframe) * split_frac)
#    randomized = df.sample(frac=1)
#    return randomized[:n_split], randomized[n_split:]


# df_train, df_test = train_test_split(df, 0.8)
#x_train, x_test, y_train, y_test = train_test_split(df.drop('price_range', axis=1), df['price_range'], test_size=0.3, random_state=42)

x_train = df.drop('price_range', axis=1)
y_train = df['price_range']
df_validate = pd.read_csv("data/data_validation.csv")
x_test = df_validate.drop('price_range', axis=1)
y_test = df_validate['price_range']

In [97]:
class Model(ABC):
    @abstractmethod
    def fit(self, x_data: np.matrix, y_data: np.matrix):
        pass
    
    @abstractmethod
    def predict(self, x_data: np.matrix):
        pass
    
    def dump(self, filename: str):
        with open(filename, "wb") as f:
            pickle.dump(self, f)
    
    @staticmethod
    def load(filename: str):
        with open(filename, "rb") as f:
            return pickle.load(f)

In [98]:
class KNN(Model):
    def __init__(self, nominals, numericals, target, k=5):
        self.k = k
        self.numerics = numericals
        self.nominals = nominals
        self.target = target
        self.features_data = None
        self.target_data = None

    def fit(self, x_data, y_data):
        self.features_data = x_data
        self.target_data = y_data

    def predict(self, x_data):
        if self.features_data is None or self.target_data is None:
            raise Exception("Invalid data")

        predictions = []
        for i, row in x_data.iterrows():
            # Get the distance between the data
            ## Nominal
            nom_dist = np.sum(row[self.nominals] != self.features_data[self.nominals], axis=1)
            ## Numeric (using Euclidean)
            num_dist = np.sqrt(np.sum(np.square(row[self.numerics] - self.features_data[self.numerics]), axis=1))

            distances = nom_dist + num_dist

            # Get the k nearest neighbors
            neighbors = np.argsort(distances)[:self.k]

            # Get the most common class
            classes = Counter(self.target_data.iloc[neighbors])
            max_class = classes.most_common(1)[0][0]

            predictions.append([i, max_class])
        return pd.DataFrame(predictions, columns=['id', 'price_range'])

    def __str__(self):
        return "KNN(k={})".format(self.k)


In [107]:
knn = KNN(chosen_nominal_features, chosen_numerical_features, "price_range")
knn.fit(x_train, y_train)
df_pred = knn.predict(x_test)

print(df_pred)
accuracy = accuracy_score(y_test, df_pred['price_range'])
precision = precision_score(y_test, df_pred['price_range'], average='weighted')
recall = recall_score(y_test, df_pred['price_range'], average='weighted')

print("------------------------------------------------")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

      id  price_range
0      0            1
1      1            2
2      2            3
3      3            0
4      4            2
..   ...          ...
595  595            1
596  596            3
597  597            0
598  598            2
599  599            3

[600 rows x 2 columns]
------------------------------------------------
Accuracy: 0.925
Precision: 0.9253007631703358
Recall: 0.925


In [100]:
def calculate_probability(x, mean, std):
    """
    Calculate the probability of a value in a normal distribution.
    
    Parameters:
    - x: The value for which to calculate the probability.
    - mean: The mean of the normal distribution.
    - std: The standard deviation of the normal distribution.
    
    Returns:
    - probability: The probability of the given value in the distribution.
    """
    exponent = -((x - mean) ** 2) / (2 * std ** 2)
    coefficient = 1 / (math.sqrt(2 * math.pi) * std)
    probability = coefficient * math.exp(exponent)
    return probability

In [101]:
# Model Naive Bayes
class NaiveBayes(Model):

    def __init__(self, nominals, numericals):
        self.pv = {}
        self.table = {}
        self.table_numerical = {}
        self.nominal_features = nominals
        self.numerical_features = numericals
    
    def fit(self, x_data, y_data):
        # Get the probability P(Vi)
        target = 'price_range'
        for i in range(y_data.min(), y_data.max()+1):
            self.pv[i] = len(y_data[y_data == i])/len(y_data)

        data = x_data.join(y_data)

        # Get the probability of value a given vi for every nominal feature
        for feature in self.nominal_features:
            values = x_data[feature].unique()
            for j in values:
                j_data = data[data[feature] == j]
                for k in range(y_data.min(), y_data.max()+1):
                    k_data = j_data[j_data[target] == k]
                    self.table[(feature,j,k)] = len(k_data) / len(data) / self.pv[k]

        # Get the gaussian distribution of a value given vi for every numerical feature
        for feature in self.numerical_features:
            for k in range(y_data.min(), y_data.max() + 1):
                k_data = data[data[target] == k]
                if not k_data.empty:
                    mean = k_data[feature].mean()
                    std = k_data[feature].std()
                    self.table_numerical[(feature, k)] = {'mean': mean, 'std': std}


    def predict(self, x_data):
        predictions = []
        for i, row in x_data.iterrows():
            class_probability = []
            for k in range(min(self.pv.keys()), max(self.pv.keys()) + 1):
                p = self.pv[k]
                for feature in x_data.columns:
                    if feature in self.numerical_features:
                        value = row[feature]
                        mean = self.table_numerical[(feature, k)]['mean']
                        std = self.table_numerical[(feature, k)]['std']
                        p *= calculate_probability(value, mean, std)
                    else:
                        value = row[feature]
                        if (feature, value, k) in self.table:
                            p *= self.table[(feature, value, k)]

                class_probability.append(p)
            predictions.append([i, np.argmax(class_probability)])
        return pd.DataFrame(predictions, columns=['id', 'price_range'])

    
    def __str__(self):
        string = "P(V):\n"
        for key, value in self.pv.items():
            string += f"P(V{key}): {value}\n"
        string += "Table:\n"
        for key, value in self.table.items():
            string += f"{key[0]} P({key[1]} | {key[2]}): {value}\n"
        string += "Numerical Table:\n"
        for key, value in self.table_numerical.items():
            string += f"P({key[0]} | {key[1]}): {value}\n"
        return string
    

In [102]:
nb = NaiveBayes(chosen_nominal_features, chosen_numerical_features)
nb.fit(x_train, y_train)
print(str(nb))

df_pred = nb.predict(x_test)
print(df_pred)
accuracy = accuracy_score(y_test, df_pred['price_range'])
precision = precision_score(y_test, df_pred['price_range'], average='weighted')
recall = recall_score(y_test, df_pred['price_range'], average='weighted')

print("------------------------------------------------")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

P(V):
P(V0): 0.2557142857142857
P(V1): 0.2542857142857143
P(V2): 0.24642857142857144
P(V3): 0.24357142857142858
Table:
Numerical Table:
P(battery_power | 0): {'mean': 1123.0893854748604, 'std': 408.3168468849179}
P(battery_power | 1): {'mean': 1242.991573033708, 'std': 426.2944858442114}
P(battery_power | 2): {'mean': 1219.9797101449276, 'std': 441.2213652938581}
P(battery_power | 3): {'mean': 1368.1524926686218, 'std': 409.8500302132808}
P(int_memory | 0): {'mean': 31.782122905027933, 'std': 18.280034997621417}
P(int_memory | 1): {'mean': 32.24438202247191, 'std': 17.366451287504116}
P(int_memory | 2): {'mean': 29.785507246376813, 'std': 18.57058560774613}
P(int_memory | 3): {'mean': 34.058651026392965, 'std': 18.258988679187105}
P(mobile_wt | 0): {'mean': 141.24022346368716, 'std': 35.920131633007095}
P(mobile_wt | 1): {'mean': 141.21629213483146, 'std': 35.60211836970086}
P(mobile_wt | 2): {'mean': 141.768115942029, 'std': 34.252374271608275}
P(mobile_wt | 3): {'mean': 133.076246334

In [103]:
# Implementasi dengan library sklearn untuk perbandingan
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
import pandas as pd

#df_train = pd.read_csv("data/data_train.csv")
#df_validation = pd.read_csv("data/data_validation.csv")
a = x_train
a = a[chosen_numerical_features]
d = x_test
d = d[chosen_numerical_features]

print("#-1-KNN---------------------------------------------")
knn_library = KNeighborsClassifier() # TODO: Tweak

knn_library.fit(a, y_train)
pred = knn_library.predict(d)

accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average='weighted')
recall = recall_score(y_test, pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

print("#-2-Naive-Bayes-------------------------------------")
nb_library = GaussianNB()

nb_library.fit(a, y_train)
pred = nb_library.predict(d)

accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average='weighted')
recall = recall_score(y_test, pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

#-1-KNN---------------------------------------------
Accuracy: 0.925
Precision: 0.9253007631703358
Recall: 0.925
#-2-Naive-Bayes-------------------------------------
Accuracy: 0.7933333333333333
Precision: 0.7946776384842273
Recall: 0.7933333333333333


In [104]:
# For kaggle submission
test = pd.read_csv("data/test.csv")
knn = KNN(chosen_nominal_features, chosen_numerical_features, "price_range")
knn.fit(x_train, y_train)
prediction_knn = knn.predict(test)
prediction_knn.to_csv("submission_knn.csv", index=False)

nb = NaiveBayes(chosen_nominal_features, chosen_numerical_features)
nb.fit(x_train, y_train)
prediction_nb = nb.predict(test)
prediction_nb.to_csv("submission_nb.csv", index=False)