# Tugas Group Project 2

- Rafindra Nabiel Fawwaz
- Akhtar Zia Faizarobbi
- Naufal Maula Nabil
- Sinta Dewi Rahmawati

## Pemilahan Data

### Import Dataset

In [None]:
import pandas as pd

In [None]:
# Read the train and test tables from CSV files given by relative path.
train_csv_path = 'app_train_cleaned_encoded.csv'
test_csv_path = 'app_test_cleaned_encoded.csv'
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [None]:
# Show (rows, columns) of both tables. A notebook cell only echoes its last
# bare expression, so each shape is printed explicitly to make both visible.
print("train_df shape:", train_df.shape)
print("test_df shape:", test_df.shape)

### Splitting Data

In [None]:
# y is the 'TARGET' column; X is every remaining column except 'SK_ID_CURR'
# (presumably a row identifier rather than a feature -- confirm with the data).
non_feature_columns = ['TARGET', 'SK_ID_CURR']
y = train_df['TARGET']
X = train_df.drop(columns=non_feature_columns)

In [None]:
from sklearn.model_selection import StratifiedKFold, train_test_split
# Hold out 25% of the rows as a test set. The split is stratified on y and
# uses a fixed seed so that it is repeatable between runs.
split_options = {'test_size': 0.25, 'stratify': y, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

#### Statistik data

In [None]:
# Relative class frequencies (normalize=True) for the full label vector and
# for each part of the split, printed in the same layout as three sections.
overall_ratio = y.value_counts(normalize=True)
train_ratio = y_train.value_counts(normalize=True)
test_ratio = y_test.value_counts(normalize=True)

# A trailing empty string with sep="\n" yields the blank separator line.
print("Jumlah data sebelum split:", overall_ratio, "", sep="\n")
print("Jumlah data setelah split:", "Train set:", train_ratio, "", sep="\n")
print("Test set:", test_ratio, sep="\n")

#### K-Fold

In [None]:
# Shared cross-validation splitter: 10 stratified folds, shuffled with a
# fixed seed so fold membership is the same on every run.
kfold = StratifiedKFold(
    random_state=42,
    shuffle=True,
    n_splits=10,
)

## Eksperimen Model Klasifikasi

### 1. KNN

### 2. Naive Bayes

### 3. Logistic Regression

### 4. SVM

### 5. Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree with default hyperparameters and a fixed seed.
dt_model = DecisionTreeClassifier(random_state=1)
# Fit on the training split only: X / y still contain the rows that make up
# X_test, so fitting on them would leak the held-out data into the model and
# make any score computed from y_pred meaningless.
dt_model.fit(X_train, y_train)
# Predicted labels for the held-out test split.
y_pred = dt_model.predict(X_test)

#### Max Depth

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Tree depths to sweep: the integers 1 through 32.
max_depths = np.linspace(1, 32, 32, endpoint=True, dtype=int)

# One list per metric; entry i belongs to max_depths[i].
metric_names = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
train_results = {name: [] for name in metric_names}
test_results = {name: [] for name in metric_names}


def _evaluate_split(model, features, labels):
    """Score a fitted model on one data split.

    Returns a tuple of (predicted labels, column 1 of predict_proba,
    dict mapping metric name to score).
    """
    predicted = model.predict(features)
    class1_prob = model.predict_proba(features)[:, 1]
    scores = {
        'accuracy': accuracy_score(labels, predicted),
        'precision': precision_score(labels, predicted, zero_division=0),
        'recall': recall_score(labels, predicted, zero_division=0),
        'f1': f1_score(labels, predicted, zero_division=0),
        'roc_auc': roc_auc_score(labels, class1_prob),
    }
    return predicted, class1_prob, scores


# Fit one tree per depth on the training split and score it on both splits.
for depth in max_depths:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train, y_train)

    train_pred, train_prob, train_scores = _evaluate_split(dt, X_train, y_train)
    test_pred, test_prob, test_scores = _evaluate_split(dt, X_test, y_test)

    for name in metric_names:
        train_results[name].append(train_scores[name])
        test_results[name].append(test_scores[name])

# Draw every metric for both splits on a single figure.
plt.figure(figsize=(12, 8))

# (metric key, train line format, train label, test line format, test label),
# listed in drawing order so the legend order stays the same.
curve_specs = [
    ('roc_auc', 'b', 'Train ROC AUC', 'r', 'Test ROC AUC'),
    ('accuracy', 'g--', 'Train Accuracy', 'y--', 'Test Accuracy'),
    ('precision', 'c-.', 'Train Precision', 'm-.', 'Test Precision'),
    ('recall', 'k:', 'Train Recall', 'orange', 'Test Recall'),
    ('f1', 'purple', 'Train F1-Score', 'brown', 'Test F1-Score'),
]
for metric_key, train_fmt, train_label, test_fmt, test_label in curve_specs:
    plt.plot(max_depths, train_results[metric_key], train_fmt, label=train_label)
    plt.plot(max_depths, test_results[metric_key], test_fmt, label=test_label)

# Axis labels, title, legend and grid.
plt.xlabel('Tree Depth')
plt.ylabel('Score')
plt.title('Decision Tree Performance Metrics vs Tree Depth')
plt.legend(loc='best')
plt.grid()
plt.show()


In [None]:
# Candidate depths to compare using the shared stratified K-fold splitter.
max_depths = [14, 15]

metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Summary table (mean/std per metric) for each depth, keyed by max_depth.
cv_summaries = {}

# Evaluate each depth independently.
for depth in max_depths:
    print(f"\nEvaluating max_depth={depth}")
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)

    # Start from empty lists for every depth. If the lists were shared across
    # depths, the summary of a later depth would also average the folds of
    # all earlier depths.
    train_results = {name: [] for name in metric_names}
    test_results = {name: [] for name in metric_names}

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, y_train), 1):
        # Positional row selection for the current fold
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Refit the tree on this fold's training part
        dt.fit(X_tr, y_tr)

        # Predictions on the training part of the fold
        train_pred = dt.predict(X_tr)
        train_prob = dt.predict_proba(X_tr)[:, 1]

        # Metrics on the training part of the fold
        train_accuracy = accuracy_score(y_tr, train_pred)
        train_precision = precision_score(y_tr, train_pred, zero_division=0)
        train_recall = recall_score(y_tr, train_pred, zero_division=0)
        train_f1 = f1_score(y_tr, train_pred, zero_division=0)
        train_roc_auc = roc_auc_score(y_tr, train_prob)

        # Store training metrics
        train_results['accuracy'].append(train_accuracy)
        train_results['precision'].append(train_precision)
        train_results['recall'].append(train_recall)
        train_results['f1'].append(train_f1)
        train_results['roc_auc'].append(train_roc_auc)

        # Predictions on the validation part of the fold
        val_pred = dt.predict(X_val)
        val_prob = dt.predict_proba(X_val)[:, 1]

        # Metrics on the validation part of the fold
        val_accuracy = accuracy_score(y_val, val_pred)
        val_precision = precision_score(y_val, val_pred, zero_division=0)
        val_recall = recall_score(y_val, val_pred, zero_division=0)
        val_f1 = f1_score(y_val, val_pred, zero_division=0)
        val_roc_auc = roc_auc_score(y_val, val_prob)

        # Store validation metrics (kept under the name test_results)
        test_results['accuracy'].append(val_accuracy)
        test_results['precision'].append(val_precision)
        test_results['recall'].append(val_recall)
        test_results['f1'].append(val_f1)
        test_results['roc_auc'].append(val_roc_auc)

        # Print metrics for the current fold
        print(f"Fold {fold}")
        print(f"  Train Metrics: Accuracy={train_accuracy:.4f}, Precision={train_precision:.4f}, Recall={train_recall:.4f}, F1={train_f1:.4f}, ROC AUC={train_roc_auc:.4f}")
        print(f"  Test Metrics:  Accuracy={val_accuracy:.4f}, Precision={val_precision:.4f}, Recall={val_recall:.4f}, F1={val_f1:.4f}, ROC AUC={val_roc_auc:.4f}")

    # Mean and standard deviation of each metric over this depth's folds.
    # Dedicated names are used so the train_df / test_df datasets loaded
    # earlier in the notebook are not overwritten by metric tables.
    train_metrics_df = pd.DataFrame(train_results)
    test_metrics_df = pd.DataFrame(test_results)

    summary = {
        'Train Mean': train_metrics_df.mean(),
        'Train Std': train_metrics_df.std(),
        'Test Mean': test_metrics_df.mean(),
        'Test Std': test_metrics_df.std()
    }

    summary_df = pd.DataFrame(summary)
    cv_summaries[depth] = summary_df
    print("\nSummary of 10-Fold Cross-Validation Metrics:")
    print(summary_df.round(4))


### 6. Back Propagation Neural Network (BPNN)