# **Praktikum 1**

### **Langkah 1 - Import Library**

In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE, SelectKBest, chi2, f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

### **Langkah 2 - Load Data dan Pengelompokan Data**

In [None]:
# Load the Titanic passenger records from the CSV file.
df = pd.read_csv("Titanic-Dataset.csv")

# Separate the predictors from the target column "Survived" (cast to int).
X = df.drop("Survived", axis=1)
y = df.loc[:, "Survived"].astype(int)

# Column groups used later by the preprocessing / feature-selection steps.
# "Name" is deliberately left out of both groups because it is not relevant.
cat_cols = ["Pclass", "Sex", "Embarked"]
num_cols = ["Age", "SibSp", "Parch", "Fare"]

### **Langkah 3 - Ekstraksi Fitur**

In [None]:
# Feature extraction expressed as scikit-learn pipelines.

# Numeric columns: fill missing values with the median, then standardize.
num_tf = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform time.
cat_tf = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

#### **Langkah 3.1 - Feature Construction**

In [None]:
# Build the FamilySize feature: SibSp + Parch + 1, with missing counts treated as 0.
X["FamilySize"] = (
    X["SibSp"].fillna(0)
    .add(X["Parch"].fillna(0))
    .add(1)
)

# Route columns to their transformers; FamilySize joins the numeric group.
preprocess = ColumnTransformer(
    transformers=[
        ("num", num_tf, [*num_cols, "FamilySize"]),
        ("cat", cat_tf, cat_cols),
    ]
)

### **Langkah 4 - Seleksi Fitur**

In [None]:
# Filter-style feature selection with SelectKBest.
# The score function f_classif ranks each preprocessed feature with the
# ANOVA (analysis of variance) F-test against the class labels; only the
# k=5 highest-scoring features are passed on to the classifier.
# Read: https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection
selector_filter = SelectKBest(score_func=f_classif, k=5)

# Assemble the final pipeline. This only DEFINES the steps; no data is
# processed until the pipeline is fitted.
pipe_filter = Pipeline([
    ("prep", preprocess),  # preprocessing (imputation, scaling, one-hot encoding)
    ("sel", selector_filter),  # univariate feature selection
    ("clf", LogisticRegression(max_iter=1000)),  # simple baseline model: Logistic Regression
])

### **Langkah 5 - Uji dengan Model**

In [None]:
# Hold out 20% of the rows for testing, stratified on the target labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the whole pipeline on the training split, then predict the held-out rows.
pred = pipe_filter.fit(X_train, y_train).predict(X_test)

# Report accuracy and the per-class metrics on the test split.
print("=== Filter (ANOVA) + LR ===")
print("Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

=== Filter (ANOVA) + LR ===
Accuracy: 0.776536312849162
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       110
           1       0.73      0.67      0.70        69

    accuracy                           0.78       179
   macro avg       0.77      0.76      0.76       179
weighted avg       0.77      0.78      0.77       179



In [None]:
# 1) Feature names produced by the fitted preprocessing step
feat_names = pipe_filter["prep"].get_feature_names_out()
print("Nama fitur:", feat_names)
print("\n")

# 2) Support mask and scores of the features kept by SelectKBest
sel = pipe_filter["sel"]
mask = sel.get_support()
selected_names = feat_names[mask]
selected_scores = sel.scores_[mask]

# Rank the kept features by descending score and keep at most ten of them.
top = list(zip(selected_names, selected_scores))
top.sort(key=lambda pair: pair[1], reverse=True)
top = top[:10]
print("Top fitur:", top)

Nama fitur: ['num__Age' 'num__SibSp' 'num__Parch' 'num__Fare' 'num__FamilySize'
 'cat__Pclass_1' 'cat__Pclass_2' 'cat__Pclass_3' 'cat__Sex_female'
 'cat__Sex_male' 'cat__Embarked_C' 'cat__Embarked_Q' 'cat__Embarked_S']


Top fitur: [('cat__Sex_female', np.float64(306.5932488951883)), ('cat__Sex_male', np.float64(306.59324889518797)), ('cat__Pclass_3', np.float64(80.33862734392042)), ('cat__Pclass_1', np.float64(73.99727564291717)), ('num__Fare', np.float64(58.31490728198491))]
