# Task 1.1 Naïve Bayes
## Machine Learning

Diberikan sebuah Trainset berupa himpunan data berisi 160 objek data yang memiliki 7 atribut input (age, workclass, education, marital-status, occupation, relationship, hours-per-week) dan 1 output (label kelas income) yang memiliki 2 kelas/label (>50K dan <=50K). Bangunlah sebuah sistem klasifikasi menggunakan metode Naïve Bayes untuk menentukan kelas/label data testing dalam Testset. Sistem membaca masukan file TrainsetTugas1ML.csv dan TestsetTugas1ML.csv dan mengeluarkan output berupa file TebakanTugas1ML.csv berupa satu kolom berisi 40 baris yang menyatakan kelas/label baris yang bersesuaian pada file TestsetTugas1ML.csv.

### Data Preparation

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the labelled set and drop its `id` column in a single step.
data = pd.read_csv('TrainsetTugas1ML.csv').drop(columns=['id'])

# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
data_train, data_validation = train_test_split(
    data,
    test_size=0.3,
    random_state=0,
)

#### Data Train

In [2]:
# Preview the first five rows of the training split.
data_train.head(n=5)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,hours-per-week,income
118,young,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,normal,>50K
95,adult,Private,Some-college,Never-married,Exec-managerial,Not-in-family,normal,<=50K
55,young,Private,Bachelors,Married-civ-spouse,Prof-specialty,Husband,normal,>50K
109,young,Private,Bachelors,Never-married,Prof-specialty,Own-child,normal,>50K
18,adult,Private,Some-college,Married-civ-spouse,Prof-specialty,Husband,normal,>50K


#### Data Validation

In [3]:
# Preview the first five rows of the validation split.
data_validation.head(n=5)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,hours-per-week,income
110,adult,Local-gov,Some-college,Married-civ-spouse,Prof-specialty,Husband,normal,>50K
112,adult,Private,Bachelors,Married-civ-spouse,Prof-specialty,Husband,normal,>50K
143,young,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,normal,>50K
7,adult,Private,Bachelors,Married-civ-spouse,Prof-specialty,Husband,normal,>50K
44,young,Private,HS-grad,Never-married,Craft-repair,Own-child,low,<=50K


### Create the Model

In [4]:
# Name of the column holding the class label.
target = 'income'

# Every column other than the target is treated as an input attribute.
is_feature = data.columns != target
features = data.columns[is_feature]

# Distinct class labels found in the target column.
classes = data[target].unique()

In [5]:
# Estimate the Naive Bayes parameters by relative frequency:
#   kelas_probs[label]                  -> prior P(label)
#   probabilities[label][column][value] -> likelihood P(column = value | label)
# NOTE(review): the counts are taken over the full `data` frame, which still
# contains the rows split off as `data_validation`, so accuracy measured on
# that split is not a true held-out estimate.
probabilities = {}
kelas_probs = {}

for output_kelas in classes:
    # Feature columns of the rows that belong to this class.
    class_rows = data[data[target] == output_kelas][features]
    occurrences = len(class_rows)
    attribute_probs = {}
    for kolom in class_rows.columns:
        # `Series.items()` replaces `Series.iteritems()`, which was removed in
        # pandas 2.0. A value never observed for this class gets no entry.
        attribute_probs[kolom] = {
            value: count / occurrences
            for value, count in class_rows[kolom].value_counts().items()
        }
    probabilities[output_kelas] = attribute_probs
    kelas_probs[output_kelas] = occurrences / len(data)

In [6]:
# Class priors: fraction of the rows of `data` that carry each label.
kelas_probs

{'>50K': 0.75, '<=50K': 0.25}

In [7]:
# Conditional probabilities, keyed as probabilities[label][column][value].
probabilities

{'>50K': {'age': {'young': 0.55,
   'adult': 0.44166666666666665,
   'old': 0.008333333333333333},
  'workclass': {'Private': 0.875,
   'Local-gov': 0.06666666666666667,
   'Self-emp-not-inc': 0.058333333333333334},
  'education': {'Bachelors': 0.5416666666666666,
   'HS-grad': 0.23333333333333334,
   'Some-college': 0.225},
  'marital-status': {'Married-civ-spouse': 0.9,
   'Never-married': 0.058333333333333334,
   'Divorced': 0.041666666666666664},
  'occupation': {'Exec-managerial': 0.39166666666666666,
   'Prof-specialty': 0.3416666666666667,
   'Craft-repair': 0.26666666666666666},
  'relationship': {'Husband': 0.8916666666666667,
   'Not-in-family': 0.09166666666666666,
   'Own-child': 0.016666666666666666},
  'hours-per-week': {'normal': 0.9666666666666667,
   'low': 0.025,
   'many': 0.008333333333333333}},
 '<=50K': {'age': {'young': 0.5, 'adult': 0.475, 'old': 0.025},
  'workclass': {'Private': 0.8, 'Self-emp-not-inc': 0.175, 'Local-gov': 0.025},
  'education': {'HS-grad': 0.

#### Naive Bayes Function

In [8]:
def naive_bayes(data, probabilities, output_prob):
    """Classify every row of ``data`` with a categorical Naive Bayes model.

    Each class is scored as its prior multiplied by the conditional
    probability of every feature value in the row; the label with the
    highest score is the prediction.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to classify. It must contain one column per feature named in
        ``probabilities``; any other column (e.g. the target) is ignored.
    probabilities : dict
        ``probabilities[label][feature][value]`` is P(feature = value | label).
    output_prob : dict
        ``output_prob[label]`` is the prior P(label).

    Returns
    -------
    list
        One predicted label per row, in row order.

    Notes
    -----
    * The feature names are read from ``probabilities`` (first label), so the
      function does not rely on any module-level variable.
    * A value that has no entry for a class contributes probability 0 for
      that class instead of raising ``KeyError``.
    * Rows are visited by position, so a non-unique index is handled.
    * Any number of classes is supported. On a tie the label that comes last
      in ``output_prob`` wins, which matches the two-class rule
      "first label only if strictly greater".
    """
    if len(data) == 0:
        return []

    output_labels = list(output_prob.keys())
    # All classes are assumed to describe the same features; read them once.
    feature_names = list(probabilities[output_labels[0]].keys())
    # Plain lists give cheap positional access without touching the index.
    columns = [data[feature].tolist() for feature in feature_names]
    # max() keeps the first maximal item, so scanning the labels in reverse
    # resolves ties toward the last label.
    tie_break_order = output_labels[::-1]

    predicted = []
    for pos in range(len(data)):
        scores = {}
        for label in output_labels:
            score = output_prob[label]
            for feature, column in zip(feature_names, columns):
                score *= probabilities[label][feature].get(column[pos], 0.0)
            scores[label] = score
        predicted.append(max(tie_break_order, key=scores.__getitem__))

    return predicted

### Accuracy

In [9]:
from sklearn.metrics import accuracy_score

# Score the model on the rows split off by train_test_split.
# NOTE(review): `probabilities` was counted over the full `data` frame, which
# includes these rows, so this figure is optimistic rather than held-out.
valid_predict = naive_bayes(data_validation, probabilities, kelas_probs)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
akurasi = accuracy_score(data_validation['income'], valid_predict) * 100
print("Akurasi terhadap data validasi: {}%".format(akurasi))

Akurasi terhadap data validasi: 85.41666666666666%


In [10]:
# Score the model on the complete labelled frame `data`, i.e. the same rows the
# probabilities were estimated from.
data_predict = naive_bayes(data, probabilities, kelas_probs)
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
akurasi = accuracy_score(data['income'], data_predict) * 100
print("Akurasi terhadap data train: {}%".format(akurasi))

Akurasi terhadap data train: 83.125%


### Classification of Data Test Prediction

#### Data Test

In [11]:
# Read the test set, discard its `id` column and preview the first five rows.
data_test = pd.read_csv('TestsetTugas1ML.csv').drop(columns=['id'])
data_test.head(n=5)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,hours-per-week
0,young,Private,HS-grad,Never-married,Craft-repair,Not-in-family,normal
1,young,Private,Bachelors,Divorced,Exec-managerial,Not-in-family,normal
2,young,Private,Bachelors,Married-civ-spouse,Prof-specialty,Husband,normal
3,adult,Private,Some-college,Divorced,Prof-specialty,Not-in-family,normal
4,young,Private,HS-grad,Married-civ-spouse,Exec-managerial,Husband,many


#### Prediction

In [12]:
# Predict one income label per test row and display the resulting list.
test_predict = naive_bayes(
    data=data_test,
    probabilities=probabilities,
    output_prob=kelas_probs,
)
test_predict

['<=50K',
 '<=50K',
 '>50K',
 '<=50K',
 '>50K',
 '>50K',
 '<=50K',
 '<=50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '<=50K',
 '>50K',
 '>50K',
 '>50K',
 '<=50K',
 '>50K',
 '<=50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '>50K',
 '<=50K',
 '<=50K',
 '<=50K',
 '>50K',
 '>50K',
 '<=50K',
 '>50K',
 '<=50K',
 '>50K',
 '>50K',
 '>50K']