In [None]:
import numpy as np
import polars as pl

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE

: 

In [4]:
# Location of the transactions CSV (a Kaggle input path).
DATA_PATH = "/kaggle/input/credit-card-fraud-detection-dataset/credit_card_fraud_10k.csv"

# Read the file into a Polars DataFrame and preview the first rows.
df = pl.read_csv(DATA_PATH)
df.head()

NameError: name 'pl' is not defined

In [5]:
# Count the null entries in every column (one row, one value per column).
df.select(pl.all().null_count())

NameError: name 'df' is not defined

In [6]:
# Summary statistics for each column of the frame.
summary_stats = df.describe()
summary_stats

NameError: name 'df' is not defined

In [7]:
# Bar chart of how many rows fall into each value of the target column.
fig, ax = plt.subplots()
sns.countplot(data=df.to_pandas(), x="is_fraud", ax=ax)
ax.set_title("Fraud vs Non-Fraud Distribution")
plt.show()

NameError: name 'plt' is not defined

In [8]:
# Columns whose distributions are plotted below.
num_cols = [
    "amount",
    "transaction_hour",
    "device_trust_score",
    "velocity_last_24h",
    "cardholder_age",
]

# One histogram with a KDE overlay per column, each drawn on its own figure.
for col in num_cols:
    fig, ax = plt.subplots()
    sns.histplot(data=df.select(col).to_pandas(), x=col, kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.show()

NameError: name 'plt' is not defined

In [9]:
# Boolean predicates behind the two engineered flags.
is_night = pl.col("transaction_hour").is_in([0, 1, 2, 3])  # hours 0-3
is_high_amount = pl.col("amount") > 900                      # amount above 900

# Add both flags in a single pass, each stored as a 0/1 Int64 column.
df = df.with_columns(
    is_night.cast(pl.Int64).alias("night_transaction"),
    is_high_amount.cast(pl.Int64).alias("high_amount"),
)

NameError: name 'df' is not defined

In [10]:
# Replace the `merchant_category` values with integer codes.
# LabelEncoder is already imported in the notebook's import cell, so it is not
# re-imported here. `le` is kept so the code -> original value mapping stays
# available through `le.classes_`.
# NOTE(review): integer codes impose an arbitrary ordering on the categories,
# which linear models may read as meaningful - consider one-hot encoding.
le = LabelEncoder()
merchant_codes = le.fit_transform(df["merchant_category"].to_list())

# A Series named like an existing column overwrites that column in place.
df = df.with_columns(pl.Series("merchant_category", merchant_codes))

NameError: name 'df' is not defined

In [None]:
# Split features/target and hold out the test set BEFORE any resampling or
# scaling. Fitting SMOTE or the scaler on the full dataset lets information
# from the test rows (including synthetic points interpolated from them) leak
# into training and inflates every metric computed afterwards.
X = df.drop("is_fraud")
y = df["is_fraud"].to_numpy()

# NOTE(review): every remaining column is used as a feature; if the CSV carries
# identifier-like columns they should be dropped here - confirm against schema.
# `stratify=y` keeps the class proportions the same in both splits.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X.to_numpy(), y, test_size=0.2, random_state=42, stratify=y
)

# Oversample on the training portion only; the test set keeps the original
# class balance so the reported scores reflect the real class prevalence.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)

# Learn the scaling statistics from training data only, then apply the same
# transformation to the held-out rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)
X_train, y_train = X_scaled, y_resampled
X_test = scaler.transform(X_test_raw)

# Class counts in the (resampled) training set.
np.unique(y_train, return_counts=True)

In [None]:
# Candidate classifiers, keyed by the display name used in the results table.
# Every estimator with a stochastic component gets a fixed seed so a fresh
# "Restart & Run All" reproduces the same scores.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
}


In [None]:
# Column label -> scoring function, in the order the columns should appear.
metric_fns = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
}

# Fit each model on the training split and score it on the test split.
# After the loop `y_pred` holds the predictions of the last model in `models`.
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    row = {"Model": name}
    row.update({label: score(y_test, y_pred) for label, score in metric_fns.items()})
    results.append(row)

# One row per model, one column per metric.
results_df = pl.DataFrame(results)
results_df

In [None]:
# Plot the confusion matrix for one explicitly chosen, already-fitted model
# rather than for whatever `y_pred` happened to be left over from the training
# loop, and say in the title which model it is.
cm_model_name = "Random Forest"
y_pred_cm = models[cm_model_name].predict(X_test)

# Rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred_cm)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title(f"Confusion Matrix - {cm_model_name}")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
# Feature importances of the fitted Random Forest, highest first.
rf = models["Random Forest"]

# pl.Series takes the name as its FIRST positional argument, so passing the
# array positionally together with name=... raises
# "TypeError: got multiple values for argument 'name'". Name first, values second.
feature_importance = pl.Series("importance", rf.feature_importances_)

# Feature names are the frame's columns minus the target.
fi_df = pl.DataFrame({
    "feature": df.drop("is_fraud").columns,
    "importance": feature_importance,
}).sort("importance", descending=True)

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(fi_df["feature"].to_list(), fi_df["importance"].to_list())
ax.tick_params(axis="x", labelrotation=90)  # long feature names: rotate to avoid overlap
ax.set_title("Feature Importance - Random Forest")
plt.show()