1. Predict whether a student gets admitted to graduate school using GRE, GPA, and rank (prestige of undergraduate institution).

In [None]:
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, confusion_matrix

# Load the graduate-admissions dataset (columns: admit, gre, gpa, rank).
url = "https://stats.idre.ucla.edu/stat/stata/dae/binary.dta"
data = pd.read_stata(url)

# View dataset
print(data.head())

# Define predictors and target.
# `rank` is categorical, so it is one-hot encoded. drop_first=True drops the
# dummy of the first (lowest) rank, which makes that rank the reference level
# and keeps the dummies from being collinear with the constant added below.
# dtype=int emits 0/1 integer columns directly, so no bool -> int conversion
# pass is needed before handing the matrix to statsmodels.
X = pd.get_dummies(
    data[['gre', 'gpa', 'rank']],
    columns=['rank'],
    drop_first=True,
    dtype=int,
)
y = data['admit']

# Add an explicit intercept column; sm.Logit uses the design matrix as given.
X = sm.add_constant(X)

# Logistic Regression using statsmodels (maximum-likelihood fit)
model = sm.Logit(y, X).fit()
print(model.summary())

# Predict and evaluate.
# NOTE: predictions are made on the same rows the model was fitted on, so the
# confusion matrix and accuracy below are in-sample figures.
pred_probs = model.predict(X)
predictions = (pred_probs >= 0.5).astype(int)  # classify at a 0.5 threshold

print("Confusion Matrix:\n", confusion_matrix(y, predictions))
print("Accuracy:", accuracy_score(y, predictions))

   admit    gre   gpa  rank
0    0.0  380.0  3.61   3.0
1    1.0  660.0  3.67   3.0
2    1.0  800.0  4.00   1.0
3    1.0  640.0  3.19   4.0
4    0.0  520.0  2.93   4.0
Optimization terminated successfully.
         Current function value: 0.573147
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  admit   No. Observations:                  400
Model:                          Logit   Df Residuals:                      394
Method:                           MLE   Df Model:                            5
Date:                Wed, 23 Jul 2025   Pseudo R-squ.:                 0.08292
Time:                        05:31:45   Log-Likelihood:                -229.26
converged:                       True   LL-Null:                       -249.99
Covariance Type:            nonrobust   LLR p-value:                 7.578e-08
                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------

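Rank is treated as a categorical variable and dummy-encoded; because `drop_first=True` drops the first level, rank 1 is the reference category and the remaining rank coefficients are interpreted relative to it.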

Statsmodels provides a detailed regression output including p-values, coefficients, and more.

2. Predict whether a person has a disease based on input variables (age, bp, cholesterol).

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Toy dataset: three numeric health indicators per person plus a binary
# `disease` label (1 = has the disease, 0 = does not).
data = pd.DataFrame(
    {
        'age': [25, 45, 35, 50, 23, 33, 42, 36, 29, 60],
        'bp': [120, 140, 130, 150, 110, 125, 135, 128, 119, 155],
        'cholesterol': [180, 220, 200, 240, 170, 190, 210, 195, 185, 250],
        'disease': [0, 1, 0, 1, 0, 0, 1, 0, 0, 1],
    }
)

# Every column except the label is a predictor.
target_col = 'disease'
X = data.drop(columns=target_col)
y = data[target_col]

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a logistic regression with default settings (fit() returns the estimator).
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out rows and report the results.
preds = model.predict(X_test)
accuracy = accuracy_score(y_test, preds)
report = classification_report(y_test, preds)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



Predicts likelihood of having a disease based on health indicators.

Uses scikit-learn for training and evaluation.

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Small simulated table: age, bp and cholesterol readings together with a
# 0/1 `disease` outcome for each of ten people.
data = pd.DataFrame(
    {
        'age': [25, 45, 35, 50, 23, 33, 42, 36, 29, 60],
        'bp': [120, 140, 130, 150, 110, 125, 135, 128, 119, 155],
        'cholesterol': [180, 220, 200, 240, 170, 190, 210, 195, 185, 250],
        'disease': [0, 1, 0, 1, 0, 0, 1, 0, 0, 1],
    }
)

# Separate the predictor columns from the outcome column.
feature_cols = ['age', 'bp', 'cholesterol']
X = data.loc[:, feature_cols]
y = data['disease']

# 70/30 train/test partition with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Train a default logistic-regression classifier on the training rows.
model = LogisticRegression()
model = model.fit(X_train, y_train)  # fit() hands back the same estimator

# Predict the held-out rows, then print accuracy and the per-class report.
preds = model.predict(X_test)
test_accuracy = accuracy_score(y_test, preds)
print("Accuracy:", test_accuracy)
class_report = classification_report(y_test, preds)
print("Classification Report:\n", class_report)


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



3. Online Retail Product Prediction

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Simulated dataset: customer age, income, whether they viewed product A, and
# whether they went on to purchase it (the label).
data = pd.DataFrame({
    'age': [21, 34, 25, 45, 31, 29, 41, 38, 22, 27],
    'income': [30000, 60000, 35000, 75000, 50000, 40000, 68000, 62000, 32000, 45000],
    'viewed_product_A': [1, 0, 1, 0, 1, 1, 0, 0, 1, 1],
    'purchased_product_A': [1, 0, 1, 0, 1, 1, 0, 0, 0, 1]
})

X = data[['age', 'income', 'viewed_product_A']]
y = data['purchased_product_A']

# Train/Test split: 30% of the 10 rows are held out, so the metrics below are
# computed on only 3 rows and are necessarily coarse.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train model.
# `income` is several orders of magnitude larger than the other features; fitted
# on the raw values, the default solver stops at its iteration limit before
# converging (ConvergenceWarning). Standardizing the features first fixes the
# conditioning. Doing it inside a pipeline means the scaler's mean/variance are
# learned from the training rows only and re-applied automatically at predict time.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

# Predict and evaluate on the held-out rows.
preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print("Classification Report:\n", classification_report(y_test, preds))


Accuracy: 0.6666666666666666
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Features such as age, income, and whether the customer viewed the product are used to predict whether the customer buys it.

This uses logistic regression to classify purchase decisions.