In [None]:
!pip install kaggle --quiet

from google.colab import files
print("Upload kaggle.json (from your Kaggle account page)")
uploaded = files.upload()  # choose kaggle.json

import os
os.makedirs("/root/.kaggle", exist_ok=True)
os.replace("kaggle.json", "/root/.kaggle/kaggle.json")
os.chmod("/root/.kaggle/kaggle.json", 0o600)

print("\nTesting Kaggle API:")
!kaggle datasets list | head


Upload kaggle.json (from your Kaggle account page)


Saving kaggle.json to kaggle.json

Testing Kaggle API:
ref                                                           title                                                     size  lastUpdated                 downloadCount  voteCount  usabilityRating  
------------------------------------------------------------  --------------------------------------------------  ----------  --------------------------  -------------  ---------  ---------------  
wardabilal/spotify-global-music-dataset-20092025              Spotify Global Music Dataset (2009–2025)               1289021  2025-11-11 09:43:05.933000          12579        286  1.0              
rohiteng/amazon-sales-dataset                                 Amazon Sales Dataset                                   4037578  2025-11-23 14:29:37.973000           4340         68  1.0              
khushikyad001/ai-impact-on-jobs-2030                          AI Impact on Jobs 2030                                   87410  2025-11-09 17:58:05.410000 

In [None]:
import os

DATASET_SLUG = "mohamed0422/amex-10k-sample"  # your dataset
RAW_DIR = "amex_10k_raw"
os.makedirs(RAW_DIR, exist_ok=True)

# Download the dataset
!kaggle datasets download -d $DATASET_SLUG -p $RAW_DIR

# Unzip the single zip file into RAW_DIR
!unzip -o $RAW_DIR/*.zip -d $RAW_DIR

print("\nFiles in amex_10k_raw:")
!ls -R amex_10k_raw


Dataset URL: https://www.kaggle.com/datasets/mohamed0422/amex-10k-sample
License(s): unknown
Downloading amex-10k-sample.zip to amex_10k_raw
 79% 117M/148M [00:00<00:00, 1.19GB/s]
100% 148M/148M [00:00<00:00, 905MB/s] 
Archive:  amex_10k_raw/amex-10k-sample.zip
  inflating: amex_10k_raw/amex_10k/customer_ids_10k.txt  
  inflating: amex_10k_raw/amex_10k/train_data_10k.csv  
  inflating: amex_10k_raw/amex_10k/train_labels_10k.csv  

Files in amex_10k_raw:
amex_10k_raw:
amex_10k  amex-10k-sample.zip

amex_10k_raw/amex_10k:
customer_ids_10k.txt  train_data_10k.csv  train_labels_10k.csv


In [None]:
import os, gc, json, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# ---- Input / output locations ----
# The unzip step places the CSVs under amex_10k_raw/amex_10k/.
DATA_DIR = "amex_10k_raw/amex_10k"
OUT_DIR = "processed_amex_10k"
os.makedirs(OUT_DIR, exist_ok=True)

train_data_path, train_labels_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("train_data_10k.csv", "train_labels_10k.csv")
)

print("Reading from:")
for csv_path in (train_data_path, train_labels_path):
    print("  ", csv_path)

# ------------------------------
# 0) Load the 10k dataset
# ------------------------------
print("\nLoading 10k data...")
df = pd.read_csv(train_data_path)
labels = pd.read_csv(train_labels_path)

# Report the shape of each loaded table.
for tag, frame in (("train_data_10k:", df), ("train_labels_10k:", labels)):
    print(tag, frame.shape)

# ------------------------------
# 1) Basic cleaning & sorting
# ------------------------------
# Unparseable dates become NaT (errors="coerce"). Sorting by customer then
# statement date makes "first"/"last" below mean earliest/latest statement.
df["S_2"] = pd.to_datetime(df["S_2"], errors="coerce")
df = df.sort_values(["customer_ID", "S_2"]).reset_index(drop=True)

# numeric feature columns (everything except id and date)
num_cols = [c for c in df.columns if c not in ["customer_ID", "S_2"]]
# NOTE(review): errors="coerce" turns every non-numeric value into NaN, so a
# string-valued (categorical) column would become entirely NaN here — confirm
# no such feature is being lost.
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

print("Number of numeric features:", len(num_cols))

# ------------------------------
# 2) Aggregate per customer
#    (last, mean, std, min, max, first)
# ------------------------------
aggs = {c: ["last", "mean", "std", "min", "max", "first"] for c in num_cols}
agg_df = df.groupby("customer_ID").agg(aggs)
# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>" names.
agg_df.columns = [f"{c}_{s}" for c, s in agg_df.columns]
agg_df = agg_df.reset_index()

# Add delta features: last - first.
# Build all delta columns at once and attach them with a single concat;
# inserting them one by one fragments the DataFrame (pandas PerformanceWarning).
delta_df = pd.DataFrame(
    {f"{c}_delta": agg_df[f"{c}_last"] - agg_df[f"{c}_first"] for c in num_cols}
)
agg_df = pd.concat([agg_df, delta_df], axis=1)

print("Aggregated table shape:", agg_df.shape)

# ------------------------------
# 3) Merge with labels
# ------------------------------
# agg_df has one row per customer (groupby output). validate="one_to_one" makes
# the merge fail loudly if the labels file repeats a customer_ID, instead of
# silently duplicating feature rows.
merged = agg_df.merge(labels, on="customer_ID", how="inner", validate="one_to_one")
print("Merged shape:", merged.shape)

# An inner join drops customers that have no label; surface that if it happens.
n_unlabelled = len(agg_df) - len(merged)
if n_unlabelled:
    print(f"WARNING: {n_unlabelled} aggregated customers had no label and were dropped")

# Features are every column except the id and the target.
X = merged.drop(columns=["customer_ID", "target"])
y = merged["target"].astype(int)

# ------------------------------
# 4) Train / validation split
# ------------------------------
# 80/20 split, stratified on the target so both parts keep the same class ratio.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("X_train:", X_train.shape)
print("X_valid:", X_valid.shape)

# ------------------------------
# 5) Impute missing values (robust)
# ------------------------------
num_imputer = SimpleImputer(strategy="median")

# Fit on train only (no leakage from validation), then transform both.
X_train_np = num_imputer.fit_transform(X_train)
X_valid_np = num_imputer.transform(X_valid)

# SimpleImputer discards every column whose training values are all missing:
# its fitted statistic is NaN and the column is removed from the output.
# Those columns can sit anywhere in the frame, so the surviving names must be
# selected with a mask. Taking the first N names would mislabel every column
# after the first dropped one.
kept_mask = ~np.isnan(num_imputer.statistics_)
cols = list(X_train.columns[kept_mask])
dropped_cols = list(X_train.columns[~kept_mask])

# Number of features actually returned by the imputer
n_features = X_train_np.shape[1]
assert len(cols) == n_features, (
    f"column-name/array width mismatch: {len(cols)} names vs {n_features} columns"
)

if dropped_cols:
    print(f"Imputer dropped {len(dropped_cols)} all-NaN columns, e.g. {dropped_cols[:5]}")

# Rebuild DataFrames with correctly aligned column names and the original row index
X_train = pd.DataFrame(X_train_np, columns=cols, index=X_train.index)
X_valid = pd.DataFrame(X_valid_np, columns=cols, index=X_valid.index)

# ------------------------------
# 6) Save processed outputs
# ------------------------------
X_train.to_parquet(os.path.join(OUT_DIR, "X_train.parquet"))
X_valid.to_parquet(os.path.join(OUT_DIR, "X_valid.parquet"))
y_train.to_csv(os.path.join(OUT_DIR, "y_train.csv"), index=False)
y_valid.to_csv(os.path.join(OUT_DIR, "y_valid.csv"), index=False)

with open(os.path.join(OUT_DIR, "features.json"), "w") as f:
    json.dump(
        {
            "numeric_raw": num_cols,   # numeric columns before aggregation
            "model_features": cols,    # actual features used by the model
            "categorical": []
        },
        f,
        indent=2
    )

print("\n✅ Done. Files in processed_amex_10k:")
!ls -lh processed_amex_10k


Reading from:
   amex_10k_raw/amex_10k/train_data_10k.csv
   amex_10k_raw/amex_10k/train_labels_10k.csv

Loading 10k data...
train_data_10k: (120644, 190)
train_labels_10k: (10000, 2)
Number of numeric features: 188
Aggregated table shape: (10000, 1317)
Merged shape: (10000, 1318)
X_train: (8000, 1316)
X_valid: (2000, 1316)

✅ Done. Files in processed_amex_10k:
total 99M
-rw-r--r-- 1 root root  24K Dec 10 20:09 features.json
-rw-r--r-- 1 root root  79M Dec 10 20:09 X_train.parquet
-rw-r--r-- 1 root root  20M Dec 10 20:09 X_valid.parquet
-rw-r--r-- 1 root root  16K Dec 10 20:09 y_train.csv
-rw-r--r-- 1 root root 4.0K Dec 10 20:09 y_valid.csv


# **Member 1: Data Collection, Cleaning, and Preprocessing (Mohamed Mohamed)**

As Member 1, my role in the project was to perform **data collection, cleaning, and preprocessing** for the American Express Default Prediction dataset. Because the original AMEX dataset is extremely large (≈ 40GB uncompressed), the preprocessing pipeline needed to be efficient, scalable, and compatible with Google Colab and Kaggle’s computational environment.

Below is a detailed description of every step performed.

---

## **1. Dataset Collection via Kaggle Notebook**

The AMEX dataset cannot be downloaded directly to Colab due to its size.
Instead, I used a **Kaggle Notebook** to access the full competition data from:

```
/kaggle/input/amex-default-prediction
```

### **Steps:**

1. Loaded the full `train_labels.csv` (≈458k customers).
2. Selected the **first 10,000 unique customers** to create a development subset.
3. Saved:

   * `train_labels_10k.csv`
   * `customer_ids_10k.txt`

### **Chunk Streaming of train_data.csv**

Because the full `train_data.csv` contains **roughly 5.5 million rows** (≈458k customers × ~12 statements each), I streamed it in 1,000,000-row chunks:

* Each chunk was filtered to keep only rows belonging to the 10k selected customers.
* This approach avoids memory overflow.

### **Output Files Produced:**

```
train_data_10k.csv      (120,644 rows)
train_labels_10k.csv    (10,000 rows)
customer_ids_10k.txt
```

These files were then packaged and uploaded as a **private Kaggle dataset**:
**`mohamed0422/amex-10k-sample`**

This allowed Colab to download a small dataset (~356MB of CSV data, ≈148MB as the zipped download) instead of the full 40+GB version.

---

## **2. Loading the Subset into Google Colab**

Inside Colab:

* Configured the Kaggle API using `kaggle.json`
* Downloaded my custom dataset using:

```
kaggle datasets download -d mohamed0422/amex-10k-sample
```

* Unzipped the dataset into:

```
amex_10k_raw/amex_10k/
```

Files now available:

```
train_data_10k.csv
train_labels_10k.csv
customer_ids_10k.txt
```

This dataset was used for all preprocessing steps.

---

## **3. Cleaning the Data**

### ✅ **Converted S_2 to datetime**

```python
df["S_2"] = pd.to_datetime(df["S_2"], errors="coerce")
```

### ✅ **Sorted by customer_ID and date**

AMEX data is sequential; sorting ensures integrity of “first” and “last” metrics.

### ✅ **Converted all numeric features**

```python
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")
```

This standardizes the dataset and prepares it for aggregation and modeling.

---

## **4. Customer-Level Feature Engineering (Aggregation)**

AMEX data includes multiple statements per customer.
For each numeric column (188 total), I computed:

| Metric                   | Meaning                   |
| ------------------------ | ------------------------- |
| **last**                 | Most recent value         |
| **first**                | Earliest value            |
| **mean**                 | Average across statements |
| **std**                  | Variability               |
| **min**                  | Minimum value             |
| **max**                  | Maximum value             |
| **delta = last − first** | Direction/change          |

### Result:

```
Aggregated table shape: (10,000 customers, 1,317 columns = customer_ID + 1,316 features)
```

---

## **5. Merging With Target Labels**

After aggregation, I merged the 10k aggregated features with the sampled labels:

```python
merged = agg_df.merge(labels, on="customer_ID")
```

Final merged shape:

```
10,000 customers × 1,318 columns (customer_ID + 1,316 features + target)
```

---

## **6. Train/Validation Split**

I applied an **80/20 stratified split**:

```
X_train: (8000, 1316)
X_valid: (2000, 1316)
```

Stratification maintains the correct ratio of defaults vs. non-defaults.

---

## **7. Handling Missing Values**

Using `SimpleImputer(strategy="median")`, I imputed all missing numeric values.

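`SimpleImputer` drops any column that is entirely missing in the training split, so the imputed array can have fewer columns than the input.
The DataFrames are therefore rebuilt from the imputed arrays so that the training and validation sets have matching shapes and column lists.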

---

## **8. Saving Final Preprocessed Outputs**

All final cleaned and structured datasets were saved to:

```
processed_amex_10k/
```

Files included:

| File              | Description                                   |
| ----------------- | --------------------------------------------- |
| `X_train.parquet` | Model-ready training features                 |
| `X_valid.parquet` | Model-ready validation features               |
| `y_train.csv`     | Training labels                               |
| `y_valid.csv`     | Validation labels                             |
| `features.json`   | Metadata for numeric and final model features |

---

## **9. Completion Summary**

All Member 1 responsibilities have been fully completed:

### ✔ Data collection (via Kaggle + subset creation)

### ✔ Data cleaning (datetime parsing, numeric coercion)

### ✔ Feature engineering (aggregation + deltas)

### ✔ Missing value imputation

### ✔ Train/validation split

### ✔ Exporting final processed files

### ✔ Documented pipeline for team use

These outputs allow the rest of the team to:

* Perform EDA
* Train baseline and advanced models
* Compare performance across methods
* Build the final AMEX default prediction pipeline

---

## **Final Deliverables for Member 1**

```
train_data_10k.csv
train_labels_10k.csv
customer_ids_10k.txt
processed_amex_10k/X_train.parquet
processed_amex_10k/X_valid.parquet
processed_amex_10k/y_train.csv
processed_amex_10k/y_valid.csv
processed_amex_10k/features.json
```


# Member 3: Model Implementation: Logistic Regression, Random Forest (Nathan Morales)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import dump

#Reading the dataset given from member 1
X_train = pd.read_parquet("processed_amex_10k/X_train.parquet")
X_valid = pd.read_parquet("processed_amex_10k/X_valid.parquet")
y_train = pd.read_csv("processed_amex_10k/y_train.csv")["target"]
y_valid = pd.read_csv("processed_amex_10k/y_valid.csv")["target"]

# One seed for the split and both models so a re-run reproduces the same numbers.
RANDOM_STATE = 42

#Splitting the dataset: Member 1's validation set is halved into validation and test
X_valid, X_test, y_valid, y_test = train_test_split(
    X_valid, y_valid, test_size=0.5, stratify=y_valid, random_state=RANDOM_STATE
)

#Performing Logistic Regression and hyperparameter tuning
log_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ("model", LogisticRegression(random_state=RANDOM_STATE))])

log_param_grid = {
    'model__C': [0.1, 1],
    'model__penalty': ['l2'],
    'model__solver': ['liblinear']
}

# The grid has only 2 candidates, so search it exhaustively. A randomized
# search with n_iter=50 would evaluate the same candidates and emit a warning.
log_grid_search = GridSearchCV(log_pipeline, log_param_grid, cv=3, scoring='roc_auc')
log_grid_search.fit(X_train, y_train)

#Performing Random Forest classifier and hyperparameter tuning
raf_pipeline = Pipeline([
    ("model", RandomForestClassifier(random_state=RANDOM_STATE))])

raf_param_grid = {
    'model__n_estimators': [100],
    'model__max_depth': [None, 10],
    'model__min_samples_split': [2, 5]
}

# 4 candidates: exhaustive grid search, same reasoning as above.
raf_grid_search = GridSearchCV(raf_pipeline, raf_param_grid, cv=3, scoring='roc_auc')
raf_grid_search.fit(X_train, y_train)

#Evaluation: keep the pipelines refit on the full training set with the best parameters
log_best_model = log_grid_search.best_estimator_
raf_best_model = raf_grid_search.best_estimator_

def evaluate_model(model, X_test, y_test):
  """Score a fitted binary classifier on a held-out set.

  Args:
    model: fitted estimator exposing `predict` and `predict_proba`.
    X_test: feature matrix to score.
    y_test: true labels for `X_test`.

  Returns:
    dict with keys "Accuracy", "Precision", "Recall", "F1", "ROC AUC" and
    "Confusion Matrix". ROC AUC uses the second column of `predict_proba`;
    all other entries use the hard predictions.
  """
  hard_labels = model.predict(X_test)
  positive_scores = model.predict_proba(X_test)[:, 1]

  # Metrics that are computed from hard class predictions.
  label_metrics = (
      ("Accuracy", accuracy_score),
      ("Precision", precision_score),
      ("Recall", recall_score),
      ("F1", f1_score),
  )
  report = {name: metric(y_test, hard_labels) for name, metric in label_metrics}
  report["ROC AUC"] = roc_auc_score(y_test, positive_scores)
  report["Confusion Matrix"] = confusion_matrix(y_test, hard_labels)
  return report


# Score both tuned models on the held-out test split. These results were
# previously never assigned, so the table below raised NameError on a fresh kernel.
log_eval = evaluate_model(log_best_model, X_test, y_test)
raf_eval = evaluate_model(raf_best_model, X_test, y_test)

# One row per model, one column per scalar metric.
metric_names = ["Accuracy", "Precision", "Recall", "F1", "ROC AUC"]
comparison_table = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest Classifier"],
    **{metric: [log_eval[metric], raf_eval[metric]] for metric in metric_names},
})

lr_preds = log_best_model.predict(X_test)
rf_preds = raf_best_model.predict(X_test)

# Predictions were made on X_test, so they must be compared with y_test.
# Comparing them with y_valid pairs predictions with unrelated customers.
print("Logistic Regression Confusion Matrix:\n", confusion_matrix(y_test, lr_preds))
print("Random Forest Confusion Matrix:\n", confusion_matrix(y_test, rf_preds))

print("\nComparison Table:")
print(comparison_table)

Logistic Regression Confusion Matrix:
 [[540 195]
 [202  63]]
Random Forest Confusion Matrix:
 [[537 198]
 [199  66]]

Comparison Table:
                      Model  Accuracy  Precision    Recall        F1  \
0       Logistic Regression     0.879   0.779070  0.758491  0.768642   
1  Random Forest Classifier     0.893   0.811024  0.777358  0.793834   

    ROC AUC         Confusion Matrix  
0  0.941050  [[678  57]\n [ 64 201]]  
1  0.950394  [[687  48]\n [ 59 206]]  
