# **Used Library**

In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
from category_encoders import TargetEncoder
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# **Load Dataset**

In [3]:
train = pd.read_csv("copparisk/train.csv")
target = pd.read_csv("copparisk/target.csv")
test = pd.read_csv("copparisk/test.csv")
submission = pd.read_csv("copparisk/submission_format.csv")

X = train.copy()
y = target["coppaRisk"]
X_test = test.drop(columns=["ID"])

# **Data cleaning & Feature Engineering**

In [4]:
def clean_and_engineer(df):
    df = df.copy()

    # Missing sebagai NaN
    df['developerCountry'] = df['developerCountry'].replace([
        "ADDRESS NOT LISTED IN PLAYSTORE", 
        "CANNOT IDENTIFY COUNTRY"], np.nan)

    # Parse downloads
    def parse_downloads(x):
        if isinstance(x, str) and "-" in x:
            try:
                return int(x.split("-")[1].strip().replace(" ", ""))
            except:
                return np.nan
        try:
            return int(x)
        except:
            return np.nan
    df['downloads'] = df['downloads'].apply(parse_downloads)

    # Log transform
    df['userRatingCount'] = pd.to_numeric(df['userRatingCount'], errors='coerce')
    df['downloads_log'] = np.log1p(df['downloads'])
    df['userRatingCount_log'] = np.log1p(df['userRatingCount'])

    # Rasio dan interaksi
    df['rating_per_download'] = df['userRatingCount'] / (df['downloads'] + 1)
    df['country_genre'] = df['developerCountry'].astype(str) + "_" + df['primaryGenreName'].astype(str)
    df["rating_x_brand_safety"] = df["averageUserRating"] * df["appContentBrandSafetyRating"].map({'low': 0, 'medium': 1, 'high': 2})
    df["log_adspent"] = np.log1p(df["adSpent"])
    df["is_old_app"] = (df["appAge"] > 365).astype(int)

    # Map rating teks ke ordinal
    rating_map = {'low': 0, 'medium': 1, 'high': 2}
    for col in ['hasTermsOfServiceLinkRating', 'appContentBrandSafetyRating', 
                'appDescriptionBrandSafetyRating', 'mfaRating']:
        df[col] = df[col].map(rating_map)

    return df

X = clean_and_engineer(X)
X_test = clean_and_engineer(X_test)

# **Imputate the Missing Values**

In [5]:
numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(exclude=np.number).columns.tolist()

num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

X[numeric_cols] = pd.DataFrame(num_imputer.fit_transform(X[numeric_cols]), columns=numeric_cols, index=X.index)
X_test[numeric_cols] = pd.DataFrame(num_imputer.transform(X_test[numeric_cols]), columns=numeric_cols, index=X_test.index)

X[categorical_cols] = pd.DataFrame(cat_imputer.fit_transform(X[categorical_cols]), columns=categorical_cols, index=X.index)
X_test[categorical_cols] = pd.DataFrame(cat_imputer.transform(X_test[categorical_cols]), columns=categorical_cols, index=X_test.index)

# **Target Encoding**

In [6]:
encoder = TargetEncoder()
X = encoder.fit_transform(X, y)
X_test = encoder.transform(X_test)

# **Model Training**

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Inisialisasi model
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
    X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]
    
    rf_model.fit(X_train_fold, y_train_fold)
    
    val_pred = rf_model.predict_proba(X_val_fold)[:, 1]
    test_pred = rf_model.predict_proba(X_test)[:, 1]
    
    oof_preds[val_idx] = val_pred
    test_preds += test_pred / skf.n_splits
    
    fold_auc = roc_auc_score(y_val_fold, val_pred)
    print(f"Fold {fold+1} AUC: {fold_auc:.5f}")

# Evaluasi keseluruhan
overall_auc = roc_auc_score(y, oof_preds)
print(f"\nOverall OOF AUC: {overall_auc:.5f}")


Fold 1 AUC: 0.88704
Fold 2 AUC: 0.91018
Fold 3 AUC: 0.88621
Fold 4 AUC: 0.87673
Fold 5 AUC: 0.87512

Overall OOF AUC: 0.88679


# **Threshold Optimization**

In [8]:
# AUC Score
auc_score = roc_auc_score(y, oof_preds)
print(f"OOF AUC Score: {auc_score:.5f}")

# Cari Threshold terbaik
fpr, tpr, thresholds = roc_curve(y, oof_preds)
best_threshold = thresholds[np.argmax(tpr - fpr)]
print(f"Best Threshold: {best_threshold:.4f}")

OOF AUC Score: 0.88679
Best Threshold: 0.0550


# **Create the Submission File**

In [9]:
submission["coppaRisk"] = (test_preds > best_thresh).astype(bool)
submission.to_csv("submission.csv", index=False)

NameError: name 'best_thresh' is not defined