In [3]:
import pandas as pd

# Load the Titanic training data from the CSV next to the notebook
df = pd.read_csv("train.csv")

# Explore shape and summary.
# A notebook only renders the value of a cell's LAST expression, so a bare
# `df.head()` / `df.describe()` in the middle of the cell is computed and then
# thrown away. `display(...)` (a builtin inside IPython/Jupyter sessions)
# renders them explicitly; `df.info()` prints on its own.
print(df.shape)
display(df.head())
df.info()
display(df.describe())

# Null value analysis: missing-value count per column.
# Kept as the final expression so it is rendered as the cell's output.
df.isnull().sum()


(891, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Drop columns that are not used as model features.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' skips labels that are already gone, so running this cell a
# second time is a no-op rather than a KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [5]:
# Impute the gaps: Age gets the column median, Embarked its most frequent value.
# NOTE(review): both statistics are computed on the full frame, i.e. before the
# train/test split performed later in this notebook, so held-out rows contribute
# to them — confirm this is acceptable for the evaluation.
age_fill = df['Age'].median()
embarked_fill = df['Embarked'].mode().iloc[0]  # first entry of the mode result

df['Age'] = df['Age'].fillna(age_fill)
df['Embarked'] = df['Embarked'].fillna(embarked_fill)

In [6]:
# Encode categorical variables as integer codes.
# Series.map turns every value that is missing from the dict into NaN, so a
# second run of this cell (when the columns already hold 0/1/2) would silently
# wipe both columns. Each column is therefore mapped only while it still holds
# raw labels, and the mapped result is checked so that an unexpected or missing
# category fails here instead of surfacing later as NaN inside the model.
category_codes = {
    'Sex': {'male': 0, 'female': 1},
    'Embarked': {'S': 0, 'C': 1, 'Q': 2},
}
for column, codes in category_codes.items():
    if df[column].isin(list(codes.values())).all():
        continue  # already encoded by an earlier run of this cell
    encoded = df[column].map(codes)
    unmapped = df.loc[encoded.isna(), column].unique()
    if len(unmapped):
        raise ValueError(f"Unexpected values in {column!r}: {list(unmapped)}")
    df[column] = encoded

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [8]:
# Separate the target column from the predictors:
# y is the "Survived" column, X is every remaining column of df.
y = df["Survived"]
X = df.drop(columns=["Survived"])

In [9]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Fit a logistic-regression classifier on the training split.
# max_iter=1000 caps the number of solver iterations.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X=X_train, y=y_train)

In [11]:
# Score the held-out split: overall accuracy first, then the per-class
# precision / recall / F1 table.
y_pred = logreg.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

report = classification_report(y_test, y_pred)
print(report)

Accuracy: 0.7988826815642458
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       105
           1       0.77      0.73      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [33]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score

In [34]:
# Load scikit-learn's breast cancer dataset and wrap it in pandas objects:
# a DataFrame of features (columns named after data.feature_names) and a
# Series holding the target labels.
# NOTE: this rebinds X and y, replacing the Titanic features/labels that the
# earlier cells of this notebook stored under the same names.
data = load_breast_cancer()
X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.Series(data["target"])

In [37]:
# Stratified 80/20 split: stratify=y keeps the class proportions the same in
# the train and test parts; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [38]:
# ---------------------------------------------------------------------------
# 1. Scale features
# ---------------------------------------------------------------------------
# StandardScaler standardises each feature as z = (x - mean) / std.
# The means/stds are learned from the TRAIN split only (avoids leakage); the
# very same statistics are then applied to the test split without refitting.
# Both transforms return NumPy arrays with the same shape as their input.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------------------------------------------------------
# 2. Train logistic regression
# ---------------------------------------------------------------------------
# - solver='liblinear': suited to small binary datasets; supports L1 and L2.
# - max_iter=10000: generous iteration cap so the solver can converge.
# - penalty and C are left at their defaults (L2 penalty, C=1.0; C is the
#   inverse regularisation strength, so a smaller C regularises harder).
# fit() learns one weight per feature plus an intercept and returns the
# estimator itself, which is why the call can be chained.
clf = LogisticRegression(solver='liblinear', max_iter=10000).fit(X_train_scaled, y_train)

# ---------------------------------------------------------------------------
# 3. Predict on the held-out split
# ---------------------------------------------------------------------------
# Hard 0/1 labels (default 0.5 probability threshold).
y_pred = clf.predict(X_test_scaled)

# predict_proba yields one column per class; column 1 is P(y=1 | x), which is
# what ROC AUC / PR curves need.
y_proba = clf.predict_proba(X_test_scaled)[:, 1]

# ---------------------------------------------------------------------------
# 4. Evaluate
# ---------------------------------------------------------------------------
# Confusion matrix layout for labels (0, 1):
#   [[TN, FP],
#    [FN, TP]]
test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", test_confusion)

# Per-class precision, recall, F1 and support, plus the averaged rows.
print(classification_report(y_test, y_pred))

# ROC AUC is threshold-independent and is computed from the probabilities,
# not from the hard labels. 1.0 is perfect; 0.5 is random guessing.
test_auc = roc_auc_score(y_test, y_proba)
print("ROC AUC:", test_auc)

# ---------------------------------------------------------------------------
# 5. Inspect coefficients (direction and relative weight of each feature)
# ---------------------------------------------------------------------------
# clf.coef_ has shape (1, n_features) for a binary problem, so flattening it
# gives one weight per feature. Scaling does not reorder columns, so X.columns
# lines up with the weights. Sorting uses |coefficient|, largest first.
coefs = pd.Series(clf.coef_.ravel(), index=X.columns)
coefs = coefs.sort_values(key=abs, ascending=False)

# Top-10 features by |coefficient|: a positive weight raises the log-odds of
# class 1, a negative weight lowers them.
print(coefs.head(10))

Confusion Matrix:
 [[41  1]
 [ 1 71]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

ROC AUC: 0.9957010582010581
worst texture          -1.242272
radius error           -1.087929
worst area             -0.979282
area error             -0.958096
worst radius           -0.946000
worst concave points   -0.945296
worst symmetry         -0.928729
worst concavity        -0.827180
worst perimeter        -0.764807
worst smoothness       -0.759567
dtype: float64
