In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

# Pin the global NumPy RNG so every draw below is repeatable.
np.random.seed(123)

# Design matrix: n_samples rows of two independent standard-normal features.
n_samples = 10000
X = np.random.standard_normal(size=(n_samples, 2))

# Ground-truth weights that the fitted model is later compared against.
true_coefficients = np.array([2.0, -1.5])

# Linear predictor, then the logistic (sigmoid) transform to obtain P(y = 1 | x).
logits = X @ true_coefficients
probs = 1.0 / (1.0 + np.exp(-logits))

# One Bernoulli draw per row, using that row's own success probability.
y = np.random.binomial(1, probs)

# Keep 25% of the rows aside for evaluation; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# The labels were simulated without an intercept term, so none is fitted.
model = LogisticRegression(fit_intercept=False).fit(X_train, y_train)

# Compare the recovered weights with the ones used to simulate the data.
estimated_coefficients = model.coef_[0]
print("True Coefficients:", true_coefficients)
print("Estimated Coefficients:", estimated_coefficients)

# Per-class precision / recall / F1 on the held-out rows.
y_pred = model.predict(X_test)
report = classification_report(
    y_test,
    y_pred,
    target_names=["Class 0", "Class 1"],
)
print("\nClassification Report:", report, sep="\n")


True Coefficients: [ 2.  -1.5]
Estimated Coefficients: [ 2.01128805 -1.53752281]

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.80      0.80      0.80      1222
     Class 1       0.81      0.80      0.81      1278

    accuracy                           0.80      2500
   macro avg       0.80      0.80      0.80      2500
weighted avg       0.80      0.80      0.80      2500



In [None]:
import numpy as np
import pandas as pd
from scipy.special import expit  # Sigmoid (inverse logit) function
from scipy.optimize import root_scalar

# Simulation settings.
n_samples = 10_000      # number of rows to simulate
n_features = 5          # number of feature columns
desired_ratio = 0.1     # target share of positive (spam) labels, i.e. 10%

# All draws below come from NumPy's global RNG, so they continue from whatever
# state earlier cells left it in. Uncomment to seed this cell on its own:
# np.random.seed(42)

# Step 1: feature matrix of independent standard-normal entries,
# shape (n_samples, n_features).
X = np.random.standard_normal(size=(n_samples, n_features))

# Step 2: one standard-normal coefficient per feature.
beta = np.random.standard_normal(size=n_features)

# Step 3: linear predictor as the matrix-vector product z = X @ beta,
# giving one value per row.
z = X.dot(beta)

# Step 4: residual function used to calibrate the intercept against a target mean probability.
def calibrate_intercept(c, z, desired):
    """
    Calibration residual for a candidate intercept.

    Shifts the linear predictor `z` by `c`, maps the shifted values through
    the sigmoid (`expit`), and returns the mean of those probabilities minus
    `desired`. A return value of zero means the candidate intercept makes the
    average probability equal to the target.
    """
    return np.mean(expit(c + z)) - desired

# Solve mean(expit(z + c)) = desired_ratio for the intercept c by bisection.
#
# Bisection needs a bracket whose endpoints give residuals of opposite sign.
# A fixed bracket such as [-20, 0] only contains the root while it is negative:
# once desired_ratio exceeds mean(expit(z)) both endpoints give a negative
# residual and root_scalar raises ValueError. The bracket is therefore derived
# from the data. With t = logit(desired_ratio):
#   c = t - max(z) - 1  ->  every z_i + c <= t - 1  ->  mean probability < desired_ratio
#   c = t - min(z) + 1  ->  every z_i + c >= t + 1  ->  mean probability > desired_ratio
# so the residual is guaranteed to change sign between the two endpoints.
if not 0 < desired_ratio < 1:
    raise ValueError(
        f"desired_ratio must lie strictly between 0 and 1, got {desired_ratio!r}"
    )

target_logit = np.log(desired_ratio / (1 - desired_ratio))
bracket = [target_logit - np.max(z) - 1.0, target_logit - np.min(z) + 1.0]
sol = root_scalar(
    lambda c: calibrate_intercept(c, z, desired_ratio),
    bracket=bracket,
    method='bisect',
)
# root_scalar reports non-convergence through the result object, so check it
# explicitly rather than silently using an inaccurate intercept.
if not sol.converged:
    raise RuntimeError(f"Intercept calibration did not converge: {sol.flag}")
c = sol.root

# Shift the linear predictor by the calibrated intercept and convert it to
# per-row probabilities.
z_adjusted = z + c
p = expit(z_adjusted)

# Report the intercept and confirm that the mean probability matches the target.
print("Calibrated intercept:", c)
print("Mean probability after calibration:", np.mean(p))

# Step 5: one Bernoulli draw per row using that row's calibrated probability.
y = np.random.binomial(n=1, p=p, size=n_samples)

# The realised share of positives differs from desired_ratio only by sampling noise.
achieved_ratio = np.mean(y)
print("Achieved positive ratio from Bernoulli sampling:", achieved_ratio)

# Optional: wrap the raw arrays in labelled pandas objects for easier viewing.
# Feature columns are named X1 ... X<n_features>; the labels become a Series
# called 'target'.
feature_columns = ["X" + str(idx) for idx in range(1, n_features + 1)]
X = pd.DataFrame(data=X, columns=feature_columns)
y = pd.Series(data=y, name='target')

# Text previews: the first five feature rows and the first ten labels.
print("\nFirst 5 rows of features:", X.head(), sep="\n")
print("\nFirst 10 target values:", y.head(10), sep="\n")

# Final bare expression so the notebook displays the class proportions.
y.value_counts(normalize=True)