In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [4]:
data = pd.read_csv ('/content/drive/MyDrive/ML 6/personality_dataset.csv')
data

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert
...,...,...,...,...,...,...,...,...
2895,3.0,No,7.0,6.0,No,6.0,6.0,Extrovert
2896,3.0,No,8.0,3.0,No,14.0,9.0,Extrovert
2897,4.0,Yes,1.0,1.0,Yes,4.0,0.0,Introvert
2898,11.0,Yes,1.0,,Yes,2.0,0.0,Introvert


In [5]:
# 1. Handle Missing Values (using the same approach as before)
print("Missing Values Sebelum:")
print(data.isnull().sum())

numerical_cols = data.select_dtypes(include=['float64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

num_imputer = SimpleImputer(strategy='mean')
data[numerical_cols] = num_imputer.fit_transform(data[numerical_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = cat_imputer.fit_transform(data[categorical_cols])

print("\nMissing Values Setelah:")
print(data.isnull().sum())

Missing Values Sebelum:
Time_spent_Alone             63
Stage_fear                   73
Social_event_attendance      62
Going_outside                66
Drained_after_socializing    52
Friends_circle_size          77
Post_frequency               65
Personality                   0
dtype: int64

Missing Values Setelah:
Time_spent_Alone             0
Stage_fear                   0
Social_event_attendance      0
Going_outside                0
Drained_after_socializing    0
Friends_circle_size          0
Post_frequency               0
Personality                  0
dtype: int64


In [6]:
# 2. Encode Categorical Features (including the target 'Personality')
# Identify all categorical columns again as imputation might change dtype slightly
categorical_cols_after_imputation = data.select_dtypes(include=['object']).columns
print("\nKolom Kategorikal Setelah Imputasi:", list(categorical_cols_after_imputation))

le = LabelEncoder()

# Encode all categorical columns, including the target 'Personality'
for col in categorical_cols_after_imputation:
    data[col] = le.fit_transform(data[col])

print("\nData Setelah Encoding (5 teratas):")
print(data.head())
print("\nHasil Encoding Personality:", data['Personality'].unique())


Kolom Kategorikal Setelah Imputasi: ['Stage_fear', 'Drained_after_socializing', 'Personality']

Data Setelah Encoding (5 teratas):
   Time_spent_Alone  Stage_fear  Social_event_attendance  Going_outside  \
0               4.0           0                      4.0            6.0   
1               9.0           1                      0.0            0.0   
2               9.0           1                      1.0            2.0   
3               0.0           0                      6.0            7.0   
4               3.0           0                      9.0            4.0   

   Drained_after_socializing  Friends_circle_size  Post_frequency  Personality  
0                          0                 13.0             5.0            0  
1                          1                  0.0             3.0            1  
2                          1                  5.0             2.0            1  
3                          0                 14.0             8.0            0  
4           

In [7]:
# Pisahkan fitur (X) dan label (y) from the preprocessed data
X = data.drop('Personality', axis=1)   # Semua kolom kecuali 'Personality'
y = data['Personality']                # Label/target

# Scaling fitur (karena Naive Bayes Bernoulli butuh data biner atau 0-1)
scaler = StandardScaler()
# Apply scaler on the numerical X after preprocessing
X = scaler.fit_transform(X)

# Membagi data menjadi training dan testing (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [8]:
# Inisialisasi dan training model Naive Bayes
model_nb = BernoulliNB()
model_nb.fit(X_train, y_train)

In [9]:
# Prediksi pada data test
y_pred_nb = model_nb.predict(X_test)
y_pred_nb

array([1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,

In [10]:
y_test

Unnamed: 0,Personality
141,1
1557,0
485,1
1712,1
2250,1
...,...
435,0
676,1
237,1
1592,0


In [11]:
# Memeriksa antara hasil prediksi dan data aktual
df = pd.DataFrame({'Prediksi': y_pred_nb, 'Aktual': y_test})
df

Unnamed: 0,Prediksi,Aktual
141,1,1
1557,0,0
485,1,1
1712,1,1
2250,1,1
...,...,...
435,0,0
676,1,1
237,1,1
1592,0,0


In [12]:
# Evaluasi model Naive Bayes
print("===== Naive Bayes Classification Report =====")
print(classification_report(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred_nb))

===== Naive Bayes Classification Report =====
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       302
           1       0.92      0.94      0.93       278

    accuracy                           0.93       580
   macro avg       0.93      0.93      0.93       580
weighted avg       0.93      0.93      0.93       580

Confusion Matrix:
 [[278  24]
 [ 17 261]]

Accuracy Score: 0.9293103448275862
