# Task 1.1 Naïve Bayes
## Machine Learning

Diberikan sebuah Trainset berupa himpunan data berisi 160 objek data yang memiliki 7 atribut input (age, workclass, education, marital-status, occupation, relationship, hours-per-week) dan 1 output (label kelas income) yang memiliki 2 kelas/label (>50K dan <=50K). Bangunlah sebuah sistem klasifikasi menggunakan metode Naïve Bayes untuk menentukan kelas/label data testing dalam Testset. Sistem membaca masukan berupa file TrainsetTugas1ML.csv dan TestsetTugas1ML.csv, lalu mengeluarkan output berupa file TebakanTugas1ML.csv yang terdiri dari satu kolom berisi 40 baris yang menyatakan kelas/label baris yang bersesuaian pada file TestsetTugas1ML.csv.

### Data Preparation

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the labelled training file; the 'id' column is dropped before modelling.
data = pd.read_csv('TrainsetTugas1ML.csv').drop(columns=['id'])

# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
data_train, data_validation = train_test_split(data, random_state=0, test_size=0.3)

#### Data Train

In [2]:
# Preview the first five rows of the training split.
data_train.head(n=5)

Unnamed: 0,age,workclass,education,marital-status,occupation,hours-per-week,income
118,young,Private,Bachelors,Married-civ-spouse,Exec-managerial,normal,>50K
95,adult,Private,Some-college,Never-married,Exec-managerial,normal,<=50K
55,young,Private,Bachelors,Married-civ-spouse,Prof-specialty,normal,>50K
109,young,Private,Bachelors,Never-married,Prof-specialty,normal,>50K
18,adult,Private,Some-college,Married-civ-spouse,Prof-specialty,normal,>50K


#### Data Validation

In [3]:
# Show only a short preview of the validation split instead of rendering the whole frame.
data_validation.head(10)

Unnamed: 0,age,workclass,education,marital-status,occupation,hours-per-week,income
110,adult,Local-gov,Some-college,Married-civ-spouse,Prof-specialty,normal,>50K
112,adult,Private,Bachelors,Married-civ-spouse,Prof-specialty,normal,>50K
143,young,Private,Bachelors,Married-civ-spouse,Exec-managerial,normal,>50K
7,adult,Private,Bachelors,Married-civ-spouse,Prof-specialty,normal,>50K
44,young,Private,HS-grad,Never-married,Craft-repair,low,<=50K
101,adult,Private,Bachelors,Married-civ-spouse,Craft-repair,normal,>50K
122,adult,Private,Some-college,Married-civ-spouse,Craft-repair,normal,>50K
66,young,Private,HS-grad,Married-civ-spouse,Exec-managerial,normal,>50K
85,young,Private,Bachelors,Divorced,Exec-managerial,normal,>50K
86,young,Private,HS-grad,Married-civ-spouse,Craft-repair,normal,>50K


#### Data Test

In [4]:
# Load the unlabelled test file, discarding the 'id' and 'relationship' columns.
data_test = pd.read_csv('TestsetTugas1ML.csv').drop(columns=['id', 'relationship'])
data_test.head(n=5)

Unnamed: 0,age,workclass,education,marital-status,occupation,hours-per-week
0,young,Private,HS-grad,Never-married,Craft-repair,normal
1,young,Private,Bachelors,Divorced,Exec-managerial,normal
2,young,Private,Bachelors,Married-civ-spouse,Prof-specialty,normal
3,adult,Private,Some-college,Divorced,Prof-specialty,normal
4,young,Private,HS-grad,Married-civ-spouse,Exec-managerial,many


### Create the Model

In [5]:
# Name of the label column; every other column of `data` is treated as an input feature.
target = 'income'
features = data.columns[~data.columns.isin([target])]
# Distinct class labels, in order of first appearance in `data`.
classes = pd.unique(data[target])

In [6]:
# Likelihood tables and class priors, both estimated from the training split.
#   probabilities[label][feature][value] = fraction of training rows of class
#                                          `label` whose `feature` equals `value`
#   kelas_probs[label]                   = fraction of training rows with class `label`
probabilities = {}
kelas_probs = {}

for output_kelas in classes:
    # Training rows of this class, restricted to the feature columns.
    class_rows = data_train[data_train[target] == output_kelas][features]
    occurrences = len(class_rows)

    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    # Only values actually observed for this class get an entry (no smoothing is
    # applied), so a value unseen for a class is simply absent from its table.
    probabilities[output_kelas] = {
        kolom: {
            value: count / occurrences
            for value, count in class_rows[kolom].value_counts().items()
        }
        for kolom in class_rows.columns
    }
    kelas_probs[output_kelas] = occurrences / len(data_train)

In [7]:
# Class priors: share of training rows that belong to each income class.
kelas_probs

{'>50K': 0.75, '<=50K': 0.25}

In [8]:
# Nested likelihood tables: class label -> feature -> observed value -> relative
# frequency of that value among the training rows of the class.
probabilities

{'>50K': {'age': {'young': 0.5476190476190477,
   'adult': 0.44047619047619047,
   'old': 0.011904761904761904},
  'workclass': {'Private': 0.8690476190476191,
   'Local-gov': 0.07142857142857142,
   'Self-emp-not-inc': 0.05952380952380952},
  'education': {'Bachelors': 0.5238095238095238,
   'Some-college': 0.23809523809523808,
   'HS-grad': 0.23809523809523808},
  'marital-status': {'Married-civ-spouse': 0.8928571428571429,
   'Never-married': 0.05952380952380952,
   'Divorced': 0.047619047619047616},
  'occupation': {'Exec-managerial': 0.40476190476190477,
   'Prof-specialty': 0.35714285714285715,
   'Craft-repair': 0.23809523809523808},
  'hours-per-week': {'normal': 0.9523809523809523,
   'low': 0.03571428571428571,
   'many': 0.011904761904761904}},
 '<=50K': {'age': {'adult': 0.5,
   'young': 0.4642857142857143,
   'old': 0.03571428571428571},
  'workclass': {'Private': 0.8214285714285714,
   'Self-emp-not-inc': 0.14285714285714285,
   'Local-gov': 0.03571428571428571},
  'educa

#### Naive Bayes Function

In [9]:
def naive_bayes(data, probabilities, output_prob):
    """Predict a class label for every row of ``data`` with categorical Naive Bayes.

    For each row and each class, the score is the class prior multiplied by the
    conditional probability of every feature value in the row. The class with
    the highest score is predicted.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to classify. Must contain a column for every feature present in
        ``probabilities``; extra columns (e.g. the label column) are ignored.
    probabilities : dict
        ``probabilities[label][feature][value]`` is the conditional probability
        of ``value`` for ``feature`` given ``label``.
    output_prob : dict
        ``output_prob[label]`` is the prior probability of ``label``.

    Returns
    -------
    list
        One predicted label per row of ``data``, in row order.

    Raises
    ------
    ValueError
        If ``output_prob`` contains no classes.
    KeyError
        If ``data`` lacks a column for one of the features in ``probabilities``.
    """
    output_labels = list(output_prob.keys())
    if not output_labels:
        raise ValueError("output_prob must contain at least one class")

    predicted = []
    # Row dicts avoid label-based lookups, so duplicate index labels are harmless.
    for row in data.to_dict(orient="records"):
        best_label, best_score = None, -1.0
        for label in output_labels:
            score = output_prob[label]
            # The feature list comes from the tables themselves rather than from a
            # notebook-level global, so the function works on its own.
            for feature, value_probs in probabilities[label].items():
                # A value never observed for this class has no table entry; treat
                # its probability as 0 instead of raising KeyError.
                score *= value_probs.get(row[feature], 0.0)
            # ">=" lets the later class win ties, matching the original two-class
            # rule (first class only when strictly greater).
            if score >= best_score:
                best_label, best_score = label, score
        predicted.append(best_label)

    return predicted

### Accuracy

In [10]:
from sklearn.metrics import accuracy_score

# Evaluate the model on the held-out validation split.
valid_predict = naive_bayes(data_validation, probabilities, kelas_probs)
# accuracy_score is documented as (y_true, y_pred): ground truth first, predictions second.
akurasi = accuracy_score(data_validation['income'], valid_predict) * 100
print("Akurasi terhadap data validasi: {}%".format(akurasi))

Akurasi terhadap data validasi: 83.33333333333334%


In [13]:
# Accuracy over the whole labelled dataset. This includes the rows the model was
# trained on, so it is an optimistic figure and not a validation score; the
# printed label says so explicitly.
data_predict = naive_bayes(data, probabilities, kelas_probs)
# accuracy_score is documented as (y_true, y_pred): ground truth first, predictions second.
akurasi = accuracy_score(data['income'], data_predict) * 100
print("Akurasi terhadap seluruh data (train + validasi): {}%".format(akurasi))

Akurasi terhadap data validasi: 85.625%


### Classification of Data Test Prediction

In [11]:
# Classify the unlabelled test rows with the tables and priors learned from the training split.
test_predict = naive_bayes(
    data=data_test,
    probabilities=probabilities,
    output_prob=kelas_probs,
)
test_predict

['<=50K',
 '>50K',
 '>50K',
 '>50K',
 '<=50K',
 '>50K',
 '>50K',
 '<=50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '<=50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '<=50K',
 '<=50K',
 '<=50K',
 '<=50K',
 '>50K',
 '<=50K',
 '>50K',
 '<=50K',
 '>50K',
 '>50K',
 '>50K']