
# Linear & Logistic Regression with scikit-learn  
*Library-level deep dive with syntax, parameters, and examples*


In [None]:

import numpy as np
import pandas as pd

from sklearn.datasets import make_regression, make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    r2_score,
    accuracy_score,
    classification_report,
    confusion_matrix
)


## Linear Regression in scikit-learn

In [None]:

# Example data (synthetic): 200 samples, 5 features, small additive noise
X_reg, y_reg = make_regression(
    n_samples=200, n_features=5, noise=0.5, random_state=0
)

# Split (no test_size given, so train_test_split uses its default hold-out share)
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_reg, y_reg, random_state=0
)

# Instantiate & fit
# NOTE: the `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2, so passing it raises TypeError on current releases. Its old default
# was False, so omitting it gives the same model everywhere. To standardise
# features, put a StandardScaler in front of the estimator in a Pipeline.
lr = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=None)
lr.fit(Xr_train, yr_train)

# Predict & evaluate on the held-out rows
y_reg_pred = lr.predict(Xr_test)
print("R² on test:", r2_score(yr_test, y_reg_pred))
print("Coefficients:", lr.coef_)
print("Intercept:", lr.intercept_)


In [None]:

# Same regression, but with feature standardisation chained in front of the
# estimator so scaling statistics are learned from the training split only.
regression_steps = [
    ('scaler', StandardScaler()),
    ('linreg', LinearRegression()),
]
pipe_lr = Pipeline(steps=regression_steps, memory=None)

# fit() returns the pipeline itself, so fitting and predicting can be chained.
y_reg_pred2 = pipe_lr.fit(Xr_train, yr_train).predict(Xr_test)

# The fitted estimator is reachable by the step name given above; its
# coefficients are expressed in the scaled feature space.
fitted_linreg = pipe_lr.named_steps['linreg']

print("R² via pipeline:", r2_score(yr_test, y_reg_pred2))
print("Coefficients (pipeline):", fitted_linreg.coef_)
print("Intercept (pipeline):", fitted_linreg.intercept_)


## Logistic Regression in scikit-learn

In [None]:

# Synthetic binary classification problem: 5 features, of which 3 are
# informative and 1 is a redundant combination of the informative ones.
X_clf, y_clf = make_classification(
    n_samples=200,
    n_features=5,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    random_state=0,
)

# Hold out a test split (seeded so the split is repeatable)
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_clf, y_clf, random_state=0
)

# Hyperparameters collected in one place so they are easy to tweak
logreg_params = {
    'penalty': 'l2',     # regularisation type
    'C': 1.0,            # inverse regularisation strength
    'solver': 'lbfgs',   # optimiser
    'max_iter': 100,     # iteration cap for the solver
    'random_state': 0,
}

# Instantiate & fit in one go (fit returns the estimator itself)
clf = LogisticRegression(**logreg_params).fit(Xc_train, yc_train)

# Three views of the model output on the test split:
# hard labels, per-class probabilities, and raw decision scores.
y_clf_pred = clf.predict(Xc_test)
y_clf_proba = clf.predict_proba(Xc_test)
scores = clf.decision_function(Xc_test)

print("Accuracy:", accuracy_score(yc_test, y_clf_pred))
print("Classification report:\n", classification_report(yc_test, y_clf_pred))
print("Confusion matrix:\n", confusion_matrix(yc_test, y_clf_pred))
print("First 5 predicted probabilities:\n", y_clf_proba[:5])
print("Coefficients:", clf.coef_)
print("Intercept:", clf.intercept_)


In [None]:

# Logistic regression with pipeline & hyperparameter tuning
# 'saga' is used because it supports the 'l1', 'l2' and 'elasticnet'
# penalties searched below. It shuffles the data while optimising, so
# random_state is fixed to make the search repeatable.
pipe_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(solver='saga', max_iter=500, random_state=0))
])

# l1_ratio only has an effect with penalty='elasticnet'. Crossing it with
# 'l1'/'l2' in one flat grid refits identical models several times and emits
# an "l1_ratio is only used when penalty is 'elasticnet'" warning per fit.
# A list of sub-grids keeps each parameter with the penalty it belongs to.
# (l1_ratio=0.0 and 1.0 correspond to the pure L2 and pure L1 endpoints.)
C_VALUES = [0.01, 0.1, 1, 10]
param_grid = [
    {
        'logreg__C': C_VALUES,
        'logreg__penalty': ['l1', 'l2'],
    },
    {
        'logreg__C': C_VALUES,
        'logreg__penalty': ['elasticnet'],
        'logreg__l1_ratio': [0.0, 0.5, 1.0],
    },
]

grid = GridSearchCV(
    estimator=pipe_clf,
    param_grid=param_grid,
    cv=5,                # 5-fold cross-validation on the training split
    scoring='accuracy',
    n_jobs=-1,           # use all available cores
    refit=True           # refit the best configuration on the full training split
)

grid.fit(Xc_train, yc_train)

# With refit=True, best_estimator_ is the whole pipeline (scaler + model)
# trained with the winning hyperparameters.
best = grid.best_estimator_
print("Best hyperparameters:", grid.best_params_)
print("Test set accuracy:", best.score(Xc_test, yc_test))

y_best_pred = best.predict(Xc_test)
print("Classification report for best model:\n",
      classification_report(yc_test, y_best_pred))



## Exercises
1. Try different `fit_intercept` values for `LinearRegression` and compare results.  
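2. Tune `C` and `penalty` for logistic regression and observe coefficient sparsity (the `'l1'` penalty requires `solver='liblinear'` or `solver='saga'`; the default `'lbfgs'` supports only `'l2'`).  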
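3. Create a multiclass dataset and test `multi_class='multinomial'` vs `'ovr'` (on scikit-learn ≥ 1.5, where `multi_class` is deprecated, compare a plain `LogisticRegression` with `OneVsRestClassifier(LogisticRegression())` instead).  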
4. Visualize coefficient magnitudes for different `C` values.
