In [None]:
# Logistic Regression with Synthetic Dataset (5000+ records, three-way split)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt

# 1. Build the synthetic dataset: n rows, two standard-normal features and a
#    binary label drawn from a known logistic model.
np.random.seed(42)  # fixed seed: every run draws the same sample
n = 5000

# The draw order (X1, then X2, then Y) determines the random stream, so it
# must not be rearranged.
X1 = np.random.normal(loc=0, scale=1, size=n)
X2 = np.random.normal(loc=0, scale=1, size=n)

# Ground-truth model: P(Y = 1) = sigmoid(w1*X1 + w2*X2 + bias)
w1 = 2.0
w2 = -1.5
bias = 0.5
logits = X1 * w1 + X2 * w2 + bias
exp_neg_logits = np.exp(-logits)
probs = 1 / (1 + exp_neg_logits)

# One Bernoulli trial per row, using that row's probability.
Y = np.random.binomial(n=1, p=probs)

columns = {"X1": X1, "X2": X2, "Y": Y}
df = pd.DataFrame(columns)

# Persist the generated sample (no index column) and report the row count.
df.to_csv("logistic_dataset.csv", index=False)
print("✅ Dataset saved as logistic_dataset.csv with", len(df), "rows")

# 2. Three-way split: 60% train / 20% validation / 20% test
X, y = df[["X1", "X2"]], df["Y"]

# Stage one: hold out 20% of all rows as the test set. Stratifying on y keeps
# the class ratio the same in both parts.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Stage two: a quarter of the remaining 80% becomes the validation set
# (0.25 * 0.80 = 0.20 of the full data), leaving 60% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    stratify=y_train_val,
    random_state=42,
)

print(f"Train size: {len(X_train)}, Validation size: {len(X_val)}, Test size: {len(X_test)}")

# 3. Standardise the features
# The scaler learns its statistics from the training split only; the same
# fitted scaler is then applied to all three splits so no information from
# the validation or test rows leaks into the preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# 4. Fit the classifier on the standardised training split
# (fit() returns the estimator itself, so construction and training chain).
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# 5. Validation-set metrics (the split one would use for hyperparameter
#    tuning; no tuning is performed here)
y_val_pred = model.predict(X_val_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)
print("\nValidation Accuracy:", val_accuracy)
print("Validation Report:\n", val_report)

# 6. Held-out test-set metrics (final, unbiased estimate)
y_test_pred = model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
print("\nTest Accuracy:", test_accuracy)
print("Test Report:\n", test_report)

# 7. Scatter plot of the test points in the original (unscaled) feature
#    space, coloured by the class the model predicted for each point.
marker_style = {"c": y_test_pred, "cmap": "coolwarm", "alpha": 0.6}
plt.scatter(X_test["X1"], X_test["X2"], **marker_style)
plt.xlabel("X1")
plt.ylabel("X2")
plt.title("Prediction results (Test set)")
plt.show()

# 8. Predict on new data
# Two unseen samples, one per row, with columns in the same order as the
# training features.
new_data = np.array([[0.5, -1.2], [2.0, 1.0]])

# The scaler was fitted on a DataFrame, so it remembers the feature names.
# Handing it a bare ndarray makes scikit-learn emit a
# "X does not have valid feature names" UserWarning and prevents it from
# checking the column order. Wrapping the samples in a DataFrame that
# carries the training column names avoids both problems.
new_data_df = pd.DataFrame(new_data, columns=X_train.columns)
new_data_scaled = scaler.transform(new_data_df)

preds = model.predict(new_data_scaled)
# NOTE: this rebinds `probs`, which earlier held the per-row probabilities
# used to generate Y; from here on it is the (n_samples, n_classes) output
# of predict_proba for the new samples.
probs = model.predict_proba(new_data_scaled)

print("\nNew Predictions:")
for i, (x, pred, prob) in enumerate(zip(new_data, preds, probs), start=1):
    # prob[1] is the probability of the second class (label 1 for this 0/1 target).
    print(f"Sample {i} (X1={x[0]}, X2={x[1]}): Pred={pred}, Prob={prob[1]:.4f}")
