In [None]:
# Logistic Regression on a Synthetic Dataset (5000 records, Stratified 5-Fold CV)

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

# 1. Generate a synthetic dataset (n records, 2 features, binary label)
# The legacy global seed is kept so the generated CSV stays reproducible
# run-to-run; the order of the random draws below must not change.
np.random.seed(42)
n = 5000

X1 = np.random.normal(0, 1, n)
X2 = np.random.normal(0, 1, n)

# Ground-truth logistic relationship: P(Y=1) = sigmoid(w1*X1 + w2*X2 + bias)
w1, w2, bias = 2.0, -1.5, 0.5
logits = w1 * X1 + w2 * X2 + bias
# Stored under its own name so the predicted probabilities computed in
# step 7 (`probs`) do not overwrite the generating probabilities.
true_probs = 1 / (1 + np.exp(-logits))

# Sample one Bernoulli label per record from its generating probability.
Y = np.random.binomial(1, true_probs)

df = pd.DataFrame({"X1": X1, "X2": X2, "Y": Y})
df.to_csv("logistic_dataset.csv", index=False)
print("✅ Dataset saved as logistic_dataset.csv with", len(df), "rows")

# 2. Features and labels
X = df[["X1", "X2"]]
y = df["Y"]

# 3. Define pipeline (scaling + logistic regression)
# Scaling lives inside the pipeline so that, during cross-validation, the
# scaler is fitted on each training fold only.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=1000)),
])

# 4. Define K-Fold cross-validation (stratified to keep class balance per fold)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 5. Evaluate with cross_val_score (one accuracy value per fold)
scores = cross_val_score(pipeline, X, y, cv=kfold, scoring="accuracy")

print("\nK-Fold Cross Validation Results:")
print("Scores:", scores)
print(f"Mean Accuracy: {scores.mean():.4f}")
print(f"Std Dev: {scores.std():.4f}")

# 6. Train final model on the entire dataset
# cross_val_score fits clones, so `pipeline` itself is still unfitted here.
pipeline.fit(X, y)

# 7. Predict on new data
new_data = np.array([[0.5, -1.2], [2.0, 1.0]])
# The pipeline was fitted on a DataFrame; passing a bare ndarray makes
# scikit-learn warn that X lacks the feature names seen during fit. Wrap the
# array with the training column names before predicting.
new_frame = pd.DataFrame(new_data, columns=X.columns)
preds = pipeline.predict(new_frame)
probs = pipeline.predict_proba(new_frame)

print("\nNew Predictions:")
for i, (x, pred, prob) in enumerate(zip(new_data, preds, probs), start=1):
    # prob[1] is the second predict_proba column, i.e. the probability of label 1.
    print(f"Sample {i} (X1={x[0]}, X2={x[1]}): Pred={pred}, Prob={prob[1]:.4f}")

# 8. Visualization (optional, full dataset predictions)
# Points are coloured by the model's predicted label, not by the true label Y.
plt.scatter(X["X1"], X["X2"], c=pipeline.predict(X), cmap="coolwarm", alpha=0.6)
plt.title("Prediction results (K-Fold trained model)")
plt.xlabel("X1")
plt.ylabel("X2")
plt.show()
