In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load data
df = pd.read_csv("counterfeit_products_renamed.csv")

# --- Data Cleaning ---
# Map boolean / string labels to 0/1; any value not listed in the mapping becomes NaN.
df['fraud_indicator'] = df['fraud_indicator'].map({True: 1, False: 0, 'True': 1, 'False': 0})
# Rows without a usable label cannot be used for supervised training, so drop
# them here instead of letting NaN targets fail later when the models are fit.
df = df.dropna(subset=['fraud_indicator']).reset_index(drop=True)
df['fraud_indicator'] = df['fraud_indicator'].astype(int)

# Impute missing values: median for numeric, most frequent for categorical
num_cols = df.select_dtypes(include=[np.number]).columns.drop('fraud_indicator')
cat_cols = df.select_dtypes(include=['object', 'bool']).columns

for cols, strategy in ((num_cols, 'median'), (cat_cols, 'most_frequent')):
    for col in cols:
        is_missing = df[col].isna()
        # Skip columns with nothing to impute, and columns with no observed
        # value to derive a fill value from.
        if not is_missing.any() or is_missing.all():
            continue
        # fit_transform returns a 2-D (n_rows, 1) array; flatten it so it can
        # be assigned to a single column (pandas does not accept a 2-D object
        # array there).
        df[col] = SimpleImputer(strategy=strategy).fit_transform(df[[col]]).ravel()

# Outlier capping using IQR: values beyond 1.5 * IQR from the quartiles are
# clipped to the fence rather than removed, so no rows are lost.
for col in num_cols:
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower, upper)

# --- Feature Engineering ---
# Calendar features from the posting timestamp. Unparseable timestamps are
# coerced to NaT, which shows up as 0 in the month / year columns.
post_date = pd.to_datetime(df['post_timestamp'], errors='coerce')
df['post_date'] = post_date
df['post_month'] = post_date.dt.month.fillna(0)
df['post_year'] = post_date.dt.year.fillna(0)

# Integer-encode the categorical columns listed below; a listed column that is
# absent from the frame is skipped.
label_cols = ['product_type', 'manufacturer', 'vendor_code', 'vendor_nation', 'dispatch_loc']
for col in label_cols:
    if col not in df.columns:
        continue
    as_text = df[col].astype(str)
    df[col] = LabelEncoder().fit_transform(as_text)

# --- Feature Selection ---
# Remove non-features (target, identifier and raw / parsed timestamp columns)
X_all = df.drop(['fraud_indicator', 'item_ref', 'post_timestamp', 'post_date'], axis=1, errors='ignore')
y = df['fraud_indicator']

# Rank features on the training portion only. Fitting the ranking forest on
# every row would let validation / test labels influence which features are
# kept. The split parameters (test_size, stratify, random_state) are the same
# as in the Data Split section, so the same rows are held out in both places.
ranking_split = train_test_split(X_all, y, test_size=0.3, stratify=y, random_state=42)
X_rank, y_rank = ranking_split[0], ranking_split[2]

rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_rank, y_rank)
importances = pd.Series(rf.feature_importances_, index=X_all.columns).sort_values(ascending=False)
# Keep the 10 highest-ranked features for the downstream models.
top_features = importances.head(10).index.tolist()

# --- Data Split ---
# 70 % train / 15 % validation / 15 % test, stratified on the target.
# The raw rows are split first and the scaler is fitted on the training rows
# only, so validation / test statistics do not leak into the scaling.
X = df[top_features]
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_temp_raw, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_temp = scaler.transform(X_temp_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)
# Full feature matrix scaled with the training statistics, kept for the
# cross-validation step that runs on the whole dataset.
X_scaled = scaler.transform(X)

# --- Baseline Models ---
# Two untuned baselines trained on the training split; the fixed random_state
# keeps runs reproducible. fit() returns the estimator itself, so construction
# and training are chained.
logreg = LogisticRegression(random_state=42).fit(X_train, y_train)
rf_base = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# --- Validation Results ---
# Report AUC, accuracy, F1 and the full classification report for each
# baseline on the validation split.
models = {'Logistic Regression': logreg, 'Random Forest': rf_base}
for name, model in models.items():
    y_pred = model.predict(X_val)
    # ROC AUC measures how well the model ranks samples, so it must be fed the
    # predicted probability of the positive class (second column of
    # predict_proba), not the thresholded 0/1 predictions.
    y_score = model.predict_proba(X_val)[:, 1]
    print(f'== {name} ==')
    print('AUC:', roc_auc_score(y_val, y_score))
    print('Accuracy:', accuracy_score(y_val, y_pred))
    print('F1:', f1_score(y_val, y_pred))
    print('Classification Report:')
    print(classification_report(y_val, y_pred))

# --- Cross-Validation ---
# 5-fold stratified CV on the full scaled feature matrix. All three metrics are
# computed in a single cross_validate run per model, so each model is fitted
# once per fold instead of once per fold per metric.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_metrics = {'AUC CV:': 'roc_auc', 'Accuracy CV:': 'accuracy', 'F1 CV:': 'f1'}
for name, model in models.items():
    print(f'== CV Results: {name} ==')
    cv_results = cross_validate(model, X_scaled, y, cv=cv, scoring=list(cv_metrics.values()))
    for label, metric in cv_metrics.items():
        # cross_validate stores per-fold test scores under 'test_<metric>'.
        print(label, cv_results[f'test_{metric}'].mean())


ValueError: 2