# Logistic Regression Implementation from Scratch

## Stage 1: Exploratory Data Analysis

We'll start by exploring and visualizing our data before implementation.

In [None]:
# Essential imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Set random seed for reproducibility
np.random.seed(42)

# Configure plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases removed the old names, so the bare 'seaborn' style raises
# OSError there. Try the current name first and fall back to the legacy name
# on older installations.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette('husl')

### 1.1 Data Generation

Let's create synthetic data for binary classification to understand the problem better.

In [None]:
def generate_data(n_samples=1000, n_features=2, n_classes=2, n_clusters_per_class=1):
    """Generate synthetic data for classification (binary by default).

    Thin wrapper around ``sklearn.datasets.make_classification`` that pins
    ``random_state=42`` so repeated calls produce the same dataset.

    Args:
        n_samples: Number of samples to generate.
        n_features: Total number of features per sample.
        n_classes: Number of target classes.
        n_clusters_per_class: Number of clusters per class.

    Returns:
        The ``(X, y)`` tuple produced by ``make_classification``: the feature
        matrix and the class label of each sample.
    """
    # make_classification defaults to 2 informative + 2 redundant features and
    # raises ValueError when their sum exceeds n_features, so the default
    # n_features=2 could never succeed. Cap both counts so they always fit;
    # for n_features >= 4 this resolves to the library defaults (2 and 2).
    n_informative = min(2, n_features)
    n_redundant = min(2, n_features - n_informative)

    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=n_redundant,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters_per_class,
        random_state=42
    )
    return X, y

# Build the synthetic dataset using generate_data's default settings
X, y = generate_data()

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 1.2 Data Visualization

In [None]:
def plot_data_distribution(X, y):
    """Visualize the distribution of features and classes.

    Draws a two-panel figure: a scatter plot of the first two feature columns
    coloured by class, and a bar chart of the number of samples per class.

    Args:
        X: 2-D array of samples; only columns 0 and 1 are plotted.
        y: Class label of each sample, used for colouring and counting.
    """
    plt.figure(figsize=(12, 5))

    # Scatter plot of features
    plt.subplot(121)
    scatter = plt.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', alpha=0.6)
    plt.colorbar(scatter)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Feature Distribution by Class')

    # Class distribution
    plt.subplot(122)
    # Pass the labels as x so the classes sit on the x-axis and agree with the
    # 'Class' x-label; with y=... the bars were horizontal and the x-axis
    # actually showed the counts.
    sns.countplot(x=y)
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.title('Class Distribution')

    plt.tight_layout()
    plt.show()

# Render the scatter plot and class counts for the full dataset
plot_data_distribution(X=X, y=y)

### 1.3 Feature Analysis

In [None]:
def analyze_features(X, y):
    """Plot per-feature density curves beside the feature correlation heatmap.

    Args:
        X: 2-D array of samples with one column per feature.
        y: Class labels. Not used by this function.
    """
    plt.figure(figsize=(12, 5))

    # Left panel: one kernel density estimate per feature column
    plt.subplot(1, 2, 1)
    for feature_no, column in enumerate(X.T, start=1):
        sns.kdeplot(data=column, label=f'Feature {feature_no}')
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.title('Feature Distributions')
    plt.legend()

    # Right panel: pairwise correlation of the feature columns
    plt.subplot(1, 2, 2)
    correlation = np.corrcoef(X, rowvar=False)
    sns.heatmap(correlation, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Feature Correlation Matrix')

    plt.tight_layout()
    plt.show()

# Plot feature densities and the correlation matrix for the full dataset
analyze_features(X=X, y=y)

### 1.4 Data Statistics

In [None]:
def print_data_stats(X, y):
    """Print a summary of the dataset to stdout.

    Reports the sample, feature and class counts, the number of samples per
    class, and a per-feature table of mean, standard deviation, minimum and
    maximum.

    Args:
        X: 2-D array of samples with one column per feature.
        y: Class label of each sample.
    """
    rule = "-" * 20
    n_features = X.shape[1]
    class_counts = pd.Series(y).value_counts().to_dict()

    print("Dataset Statistics:")
    print(rule)
    print(f"Number of samples: {X.shape[0]}")
    print(f"Number of features: {n_features}")
    print(f"Number of classes: {len(np.unique(y))}")
    print(f"Class distribution:\n{class_counts}")

    # Column-wise aggregates, one table row per feature
    aggregations = {'Mean': np.mean, 'Std': np.std, 'Min': np.min, 'Max': np.max}
    row_labels = [f'Feature {i+1}' for i in range(n_features)]
    feature_stats = pd.DataFrame(
        {name: reducer(X, axis=0) for name, reducer in aggregations.items()},
        index=row_labels,
    )

    print("\nFeature Statistics:")
    print(rule)
    print(feature_stats)

# Report summary statistics for the full dataset
print_data_stats(X=X, y=y)

### 1.5 Data Preprocessing Check

In [None]:
def check_preprocessing_needs(X):
    """Print which basic data-quality checks are triggered by ``X``.

    Four conditions are evaluated and reported as 'Needed' / 'Not needed':
    any NaN entry, any infinite entry, any column whose standard deviation
    exceeds 10, and any column whose standard deviation is exactly 0.

    Args:
        X: Numeric 2-D array of samples with one column per feature.
    """
    column_std = X.std(axis=0)

    # (label, triggered) pairs, in the order they are reported
    findings = [
        ('Missing values', np.isnan(X).any()),
        ('Infinity values', np.isinf(X).any()),
        ('Scale differences', (np.abs(column_std) > 10).any()),
        ('Zero variance', (column_std == 0).any()),
    ]

    print("Preprocessing Checks:")
    print("-" * 20)
    for label, triggered in findings:
        if triggered:
            verdict = 'Needed'
        else:
            verdict = 'Not needed'
        print(f"{label}: {verdict}")

# Run the data-quality checks on the full feature matrix
check_preprocessing_needs(X=X)

### 1.6 Decision Boundary Visualization (Preview)

In [None]:
def plot_decision_regions_preview(X, y):
    """Overlay the samples on the two regions split by the line x + y = 0.

    The regions come from a fixed threshold on the sum of the two grid
    coordinates, not from a fitted model.

    Args:
        X: 2-D array of samples; only columns 0 and 1 are plotted.
        y: Class label of each sample, used to colour the points.
    """
    plt.figure(figsize=(10, 6))

    # 100 x 100 grid covering the data range padded by 1 on every side
    first, second = X[:, 0], X[:, 1]
    grid_x = np.linspace(first.min() - 1, first.max() + 1, 100)
    grid_y = np.linspace(second.min() - 1, second.max() + 1, 100)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # Region label per grid point: 1 where the coordinate sum is positive, else 0
    Z = np.greater(xx + yy, 0).astype(int)

    # Shaded regions first, then the samples on top
    plt.contourf(xx, yy, Z, alpha=0.4, cmap='viridis')
    plt.scatter(first, second, c=y, cmap='viridis', alpha=0.8)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Preview of Decision Regions (Simple Threshold)')
    plt.colorbar()
    plt.show()

# Draw the fixed-threshold region preview for the full dataset
plot_decision_regions_preview(X=X, y=y)

### Summary of EDA Findings

1. **Data Structure**:
   - Binary classification problem
   - Two features for visualization purposes
   - Balanced class distribution

2. **Feature Characteristics**:
   - No missing or infinite values
   - Features are on similar scales
   - Non-zero variance in all features

3. **Visualization Insights**:
   - Classes are somewhat separable
   - A non-linear decision boundary might be needed (logistic regression itself learns a linear boundary)
   - Some overlap between classes

4. **Next Steps**:
   - Implement logistic regression
   - Consider feature scaling for optimization
   - Evaluate model performance
   - Compare with non-linear classifiers