In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the penguins dataset from the CSV file into a DataFrame.
DATA_PATH = '/content/penguins.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Data exploration: preview the first five rows of the dataset.
df.head(n=5)

In [None]:
# Print pandas' structural summary of the DataFrame (its info() report).
df.info()

In [None]:
# Show pandas' descriptive statistics table for the DataFrame.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Data visualisation: a 2x2 grid of box plots, one per body measurement,
# each grouped by species.
boxplot_panels = [
    ('bill_length_mm', 'Panjang Paruh berdasarkan Spesies'),
    ('bill_depth_mm', 'Kedalaman Paruh berdasarkan Spesies'),
    ('flipper_length_mm', 'Panjang Sirip berdasarkan Spesies'),
    ('body_mass_g', 'Massa Tubuh berdasarkan Spesies'),
]

plt.figure(figsize=(15,10))
for panel_position, (measurement_col, panel_title) in enumerate(boxplot_panels, start=1):
    # Panels are numbered 1..4, filling the grid row by row.
    plt.subplot(2, 2, panel_position)
    sns.boxplot(x='species', y=measurement_col, data=df)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Data preprocessing
# Handle missing values: drop every row that contains one (filling with the
# mean/median would be an alternative).
df = df.dropna()

# Encode the categorical feature columns as integer codes.
# Each column gets its own LabelEncoder. Refitting one shared encoder on
# 'island' after 'sex' would overwrite the 'sex' mapping, so it could no
# longer be used to encode new data or decode the stored codes.
sex_encoder = LabelEncoder()
island_encoder = LabelEncoder()
df['sex'] = sex_encoder.fit_transform(df['sex'])
df['island'] = island_encoder.fit_transform(df['island'])
# All fitted feature encoders, keyed by the column they belong to.
encoders = {'sex': sex_encoder, 'island': island_encoder}
# Backward compatibility: `le` has always ended up fitted on 'island', and
# later cells refer to it by that name.
le = island_encoder

# Separate the features from the target
X = df.drop(['species'], axis=1)
y = df['species']

# Split the data into training and test sets. Stratifying on the target keeps
# the species proportions the same in both sets, so the test metrics are not
# skewed by an unlucky draw of the less frequent classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Feature scaling: the scaler is fitted on the training set only and then
# applied unchanged to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Modelling with a random forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_scaled, y_train)

# Model evaluation on the held-out test set
y_pred = rf.predict(X_test_scaled)

print("\nAkurasi Model:", accuracy_score(y_test, y_pred))
print("\nLaporan Klasifikasi:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature importance visualisation, most important feature first.
# X.columns supplies the labels because the scaled arrays carry no column names.
feature_imp = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(10,6))
sns.barplot(x=feature_imp, y=feature_imp.index)
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Feature Importance")
plt.show()



In [None]:
# Hyperparameter tuning with GridSearchCV.
# The grid below has 3 * 3 * 3 * 3 = 81 parameter combinations, each scored
# with 5-fold cross-validation on the scaled training set.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

print("\nParameter Terbaik:", grid_search.best_params_)

In [None]:
# Evaluate the tuned model on the held-out test set, then persist the artifacts.
import joblib

best_rf = grid_search.best_estimator_
y_pred_tuned = best_rf.predict(X_test_scaled)

tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
print("\nAkurasi Model Setelah Tuning:", tuned_accuracy)

print("\nLaporan Klasifikasi Setelah Tuning:")
tuned_report = classification_report(y_test, y_pred_tuned)
print(tuned_report)

# Save the model and its preprocessing objects for deployment (optional).
# NOTE(review): by the time this cell runs, `le` is fitted on the 'island'
# column only, so label_encoder.pkl alone cannot encode or decode 'sex'.
# Confirm whether the deployment code needs that mapping as well.
joblib.dump(best_rf, 'penguin_species_rf_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(le, 'label_encoder.pkl')