In [None]:
!pip install pandas scikit-learn streamlit




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [None]:
# Read the dataset from the CSV file and show it as the cell output.
csv_path = 'data.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,Age,Gender,Education,Introversion Score,Sensing Score,Thinking Score,Judging Score,Interest,Personality
0,21.0,Female,1,5.89208,2.144395,7.32363,5.462224,Arts,ENTP
1,24.0,Female,1,2.48366,3.206188,8.06876,3.765012,Unknown,INTP
2,26.0,Female,1,7.02910,6.469302,4.16472,5.454442,Others,ESFP
3,30.0,Male,0,5.46525,4.179244,2.82487,5.080477,Sports,ENFJ
4,31.0,Female,0,3.59804,6.189259,5.31347,3.677984,Others,ISFP
...,...,...,...,...,...,...,...,...,...
43739,26.0,Male,1,8.88656,5.118399,8.48784,5.331942,Arts,ENTP
43740,29.0,Male,0,2.71470,6.851956,0.25519,3.623678,Unknown,ISFP
43741,25.0,Female,0,2.00982,6.544036,6.63911,6.201555,Technology,ISTJ
43742,23.0,Male,0,7.23553,4.876780,2.41671,4.654016,Arts,ENFP


# Understanding data

In [None]:
# Print the first five rows as plain text.
preview = df.head(5)
print(preview)

    Age  Gender  Education  Introversion Score  Sensing Score  Thinking Score  \
0  21.0  Female          1             5.89208       2.144395         7.32363   
1  24.0  Female          1             2.48366       3.206188         8.06876   
2  26.0  Female          1             7.02910       6.469302         4.16472   
3  30.0    Male          0             5.46525       4.179244         2.82487   
4  31.0  Female          0             3.59804       6.189259         5.31347   

   Judging Score Interest Personality  
0       5.462224     Arts        ENTP  
1       3.765012  Unknown        INTP  
2       5.454442   Others        ESFP  
3       5.080477   Sports        ENFJ  
4       3.677984   Others        ISFP  


In [None]:
# Show the general summary: column names, non-null counts and dtypes.
# DataFrame.info() writes the report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43744 entries, 0 to 43743
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 43744 non-null  float64
 1   Gender              43744 non-null  object 
 2   Education           43744 non-null  int64  
 3   Introversion Score  43744 non-null  float64
 4   Sensing Score       43744 non-null  float64
 5   Thinking Score      43744 non-null  float64
 6   Judging Score       43744 non-null  float64
 7   Interest            43744 non-null  object 
 8   Personality         43744 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 3.0+ MB
None


## Cek semua nilai unik pada kolom bertipe object

In [None]:
# List the distinct values of every object-typed column.
object_cols = df.select_dtypes(include=['object']).columns
for col in object_cols:
    print(f'Unique values for column {col}: {df[col].unique()}')


Unique values for column Gender: ['Female' 'Male']
Unique values for column Interest: ['Arts' 'Unknown' 'Others' 'Sports' 'Technology']
Unique values for column Personality: ['ENTP' 'INTP' 'ESFP' 'ENFJ' 'ISFP' 'ISFJ' 'ESTJ' 'INFP' 'ESTP' 'ENFP'
 'INTJ' 'ESFJ' 'ISTJ' 'INFJ' 'ISTP' 'ENTJ']


# Preprocessing

## missing value

In [None]:
# Count missing values per column (isnull is the pandas alias of isna).
df.isnull().sum()

Unnamed: 0,0
Age,0
Gender,0
Education,0
Introversion Score,0
Sensing Score,0
Thinking Score,0
Judging Score,0
Interest,0
Personality,0


## duplicated

In [None]:
# Count fully duplicated rows; every repeat after the first occurrence is flagged.
df.duplicated(keep='first').sum()

np.int64(1028)

In [None]:
# Remove duplicated rows from df in place, keeping the first occurrence of each.
# The index is not reset, so it keeps gaps where rows were dropped.
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [None]:
# Re-count duplicated rows after the drop; the expected result is zero.
df.duplicated(subset=None, keep='first').sum()

np.int64(0)

## Ubah tipe data Age ke int




In [None]:
# Cast the Age column from float to integer; all other columns keep their dtype.
df = df.astype({'Age': int})


In [None]:
# Convert every object-typed column to the pandas 'string' dtype.
object_cols = df.select_dtypes(include='object').columns
df = df.astype(dict.fromkeys(object_cols, 'string'))


In [None]:
# Print the frame summary again to confirm the dtypes after the casts above
# (Age as integer, former object columns as string).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42716 entries, 0 to 43743
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 42716 non-null  int64  
 1   Gender              42716 non-null  string 
 2   Education           42716 non-null  int64  
 3   Introversion Score  42716 non-null  float64
 4   Sensing Score       42716 non-null  float64
 5   Thinking Score      42716 non-null  float64
 6   Judging Score       42716 non-null  float64
 7   Interest            42716 non-null  string 
 8   Personality         42716 non-null  string 
dtypes: float64(4), int64(2), string(3)
memory usage: 3.3 MB


## normalisasi

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns that get rescaled.
num_features = [
    'Age',
    'Introversion Score',
    'Sensing Score',
    'Thinking Score',
    'Judging Score',
]

# Fit a MinMaxScaler (default settings) on the numeric columns of the full
# frame and compute the rescaled values.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[num_features])

# Write the rescaled values into a copy so that df itself stays unscaled.
# NOTE(review): data_scaled is not referenced by any later cell visible here.
data_scaled = df.copy()
data_scaled[num_features] = scaled_values

data_scaled.head()


Unnamed: 0,Age,Gender,Education,Introversion Score,Sensing Score,Thinking Score,Judging Score,Interest,Personality
0,0.088235,Female,1,0.589207,0.21873,0.732371,0.546222,Arts,ENTP
1,0.176471,Female,1,0.248357,0.327034,0.806888,0.376501,Unknown,INTP
2,0.235294,Female,1,0.702911,0.659874,0.416463,0.545444,Others,ESFP
3,0.352941,Male,0,0.546523,0.426287,0.282471,0.508048,Sports,ENFJ
4,0.382353,Female,0,0.359797,0.63131,0.531344,0.367798,Others,ISFP


In [None]:
# Names of the categorical feature columns.
categorical_cols = [
    'Gender',
    'Interest',
]


In [None]:
# One LabelEncoder per text column, kept under its own name so each can be
# saved and reused later.
gender_encoder = LabelEncoder()
interest_encoder = LabelEncoder()
target_encoder = LabelEncoder()

# Fit each encoder on its column and overwrite that column with integer codes.
# NOTE: this overwrites df, so re-running the cell would refit the encoders on
# the integer codes instead of the original labels.
for column, encoder in (
    ('Gender', gender_encoder),
    ('Interest', interest_encoder),
    ('Personality', target_encoder),
):
    df[column] = encoder.fit_transform(df[column])


In [None]:
# Display df after label encoding (Gender, Interest and Personality are now integer codes).
df

Unnamed: 0,Age,Gender,Education,Introversion Score,Sensing Score,Thinking Score,Judging Score,Interest,Personality
0,21,0,1,5.89208,2.144395,7.32363,5.462224,0,3
1,24,0,1,2.48366,3.206188,8.06876,3.765012,4,11
2,26,0,1,7.02910,6.469302,4.16472,5.454442,1,5
3,30,1,0,5.46525,4.179244,2.82487,5.080477,2,0
4,31,0,0,3.59804,6.189259,5.31347,3.677984,1,13
...,...,...,...,...,...,...,...,...,...
43738,37,0,0,4.15540,6.318498,2.76299,5.458352,0,5
43739,26,1,1,8.88656,5.118399,8.48784,5.331942,0,3
43740,29,1,0,2.71470,6.851956,0.25519,3.623678,4,13
43742,23,1,0,7.23553,4.876780,2.41671,4.654016,0,1


## Random Forest tanpa normalisasi

In [None]:
# Separate the target column from the feature columns.
target_col = 'Personality'
y = df[target_col]
X = df.drop(columns=target_col)


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# A random forest is itself randomised, so the model is seeded as well.
# Without random_state the reported accuracy changes on every run.
model = RandomForestClassifier(random_state=42)

# Fit on the training portion only.
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.8959503745318352


## simpan model

In [None]:
# joblib is imported here because this is the first cell that uses it; without
# the import a fresh-kernel "Run All" stops at this cell with a NameError.
import joblib

# Persist the trained model together with the three fitted encoders, so that
# inputs can be encoded and predictions decoded outside this session.
joblib.dump(model, 'random_forest_model.pkl')
joblib.dump(gender_encoder, 'gender_encoder.pkl')
joblib.dump(interest_encoder, 'interest_encoder.pkl')
joblib.dump(target_encoder, 'target_encoder.pkl')

['target_encoder.pkl']

## test sistem

In [None]:
import numpy as np
import pandas as pd
import joblib

# Reload the saved model and encoders from disk.
# NOTE: joblib.load unpickles its input -- only load files you trust.
model = joblib.load('random_forest_model.pkl')
gender_encoder = joblib.load('gender_encoder.pkl')
interest_encoder = joblib.load('interest_encoder.pkl')
target_encoder = joblib.load('target_encoder.pkl')

# One hand-written sample, with values listed in the order of feature_columns.
input_data = (21, 'Male', 1, 2.71470, 4.876780, 7.32363, 5.462224, 'Sports')
feature_columns = [
    'Age', 'Gender', 'Education',
    'Introversion Score', 'Sensing Score',
    'Thinking Score', 'Judging Score', 'Interest',
]

# Wrap the sample in a single-row DataFrame.
input_df = pd.DataFrame([input_data], columns=feature_columns)

# Encode the two text columns with the already-fitted encoders.
for column, encoder in (('Gender', gender_encoder), ('Interest', interest_encoder)):
    input_df[column] = encoder.transform(input_df[column])

# Predict the encoded class, then map it back to its original label.
prediction = model.predict(input_df)
decoded_prediction = target_encoder.inverse_transform(prediction)

# Show the decoded label for the single sample.
print("Hasil prediksi:", decoded_prediction[0])

Hasil prediksi: INTP


In [None]:
from google.colab import files

In [None]:
# Trigger a browser download of the saved model file from the Colab runtime.
# NOTE(review): only the model is downloaded here; the three encoder .pkl files
# written earlier are not -- confirm whether they are needed alongside it.
model_file = "random_forest_model.pkl"
files.download(model_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>