In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os

# Ensure the output folders for the saved model and figures exist.
# exist_ok=True keeps reruns from failing when a folder is already present.
for output_dir in ("../models", "../images"):
    os.makedirs(output_dir, exist_ok=True)

# 1. Data Loading and Initial Exploration
# Read the raw CSV, then report its dimensions, the dtype of each column and
# the number of missing values per column.
df = pd.read_csv("../data/blood.csv")
column_types = df.dtypes
missing_per_column = df.isna().sum()
print("Dataset shape:", df.shape)
print("\nData types:\n", column_types)
print("\nMissing values:\n", missing_per_column)

# 2. Data Cleaning
# Remove exact duplicate rows and report the resulting dimensions.
df = df.drop_duplicates()
print("\nDuplicate rows removed. New shape:", df.shape)

# Remove the "Recency" column; errors="ignore" makes this a no-op when the
# column is not present.
df = df.drop(columns=["Recency"], errors="ignore")

# Bar chart of the number of rows per "Class" value, saved and then shown.
class_counts = df["Class"].value_counts()
fig, ax = plt.subplots(figsize=(8, 5))
class_counts.plot(kind="bar", color=["skyblue", "salmon"], rot=0, ax=ax)
ax.set_title("Target Variable Distribution")
ax.set_xlabel("Risk Class")
ax.set_ylabel("Count")
fig.savefig("../images/target_distribution.png")
plt.show()

# 3. Feature Analysis
# Print descriptive statistics as produced by DataFrame.describe().
print("\nNumerical features summary:")
summary_stats = df.describe()
print(summary_stats)

# Annotated heatmap of pairwise correlations between the numeric columns;
# the diverging colormap is centered on zero.
correlations = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Feature Correlation Heatmap")
fig.tight_layout()
fig.savefig("../images/correlation_heatmap.png")
plt.show()

# Pairplot for feature relationships
# Pairwise plots of the columns, colored by "Class", with KDE curves on the
# diagonal.
sns.pairplot(df, hue="Class", palette="Set2", diag_kind="kde")
# y=1.02 places the title slightly above the top edge of the figure so it
# does not overlap the top row of subplots.
plt.suptitle("Feature Relationships by Risk Class", y=1.02)
# bbox_inches="tight" grows the saved area to include the title; without it
# the title lies outside the figure bounds and is cut off in the PNG.
plt.savefig("../images/pairplot.png", bbox_inches="tight")
plt.show()

# 4. Data Preparation for Modeling
# The target is the "Class" column; every remaining column is a feature.
y = df["Class"]
X = df.drop(columns="Class")

# Hold out 20% of the rows for testing, stratified on the target, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Pipeline: standardize the features, then fit a logistic regression.
# The step names are looked up later, so they must stay as they are.
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("classifier", LogisticRegression(random_state=42)),
]
pipeline = Pipeline(pipeline_steps)

# 5. Model Training
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline (scaler + classifier) to disk.
joblib.dump(pipeline, "../models/logistic_regression_pipeline.pkl")

# 6. Model Evaluation
# Predict on the held-out test set and print the text metrics.
y_pred = pipeline.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Compute the confusion matrix once and reuse it for printing and plotting.
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

# Annotated heatmap of the confusion matrix (rows: actual, columns: predicted).
# NOTE(review): the tick labels are hard-coded and assume exactly two classes
# whose sorted order is low risk then high risk - confirm against the data.
risk_labels = ["Low Risk", "High Risk"]
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=risk_labels,
    yticklabels=risk_labels,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_ylabel("Actual")
ax.set_xlabel("Predicted")
fig.tight_layout()
fig.savefig("../images/confusion_matrix.png")
plt.show()

# 7. Feature Importance Analysis
# Take the first row of the fitted logistic regression's coefficient matrix
# and pair it with the feature column names.
# NOTE(review): presumably the target is binary, so this row is the only
# one - confirm if more classes are ever present.
classifier = pipeline.named_steps["classifier"]
coefficients = classifier.coef_[0]
features = X.columns

# One bar per feature showing its coefficient, saved and then shown.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=features, y=coefficients, palette="viridis", ax=ax)
ax.set_title("Feature Importance (Logistic Regression Coefficients)")
ax.set_ylabel("Coefficient Value")
ax.set_xlabel("Features")
fig.tight_layout()
fig.savefig("../images/feature_importance.png")
plt.show()

print("\nAnalysis complete! Visualizations and model saved.")