# Ch03 Assignment: Linear Regression on College (ISLP)


1) Load `College` via ISLP; display first 10 rows; list all columns; briefly describe key variables.
2) Simple regression: `Outstate ~ Top10perc`; report coefficients and interpret sign/size.
3) Multiple regression: `Outstate ~ Top10perc + Room.Board + PhD`; report and interpret.
4) Add quadratic term for `Top10perc`; report and interpret the quadratic effect.
5) Compare linear vs quadratic with `anova_lm`; report F-statistic, p-value, and conclusion.



In [None]:
# Install ISLP if missing; import libraries
import sys, subprocess

def ensure(pkg):
    """Make sure the module named *pkg* can be imported, installing it if not.

    The module is imported once as a probe. If that raises ``ImportError``,
    ``pip install <pkg>`` is run with the current interpreter so the package
    lands in the environment this notebook is executing in.

    Args:
        pkg: Name used both as the import name and as the pip requirement.

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    # Local import keeps this cell self-contained.
    import importlib

    try:
        importlib.import_module(pkg)
    except ImportError:
        print(f"Installing {pkg}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "--quiet"])
        # Packages installed while the interpreter is running may be invisible
        # to the cached path finders until the caches are invalidated.
        importlib.invalidate_caches()

# Check each dependency in turn; anything after a "[" (a pip extras marker)
# is dropped so only the bare name is handed to ensure().
for requirement in ("ISLP", "pandas", "numpy", "statsmodels", "matplotlib", "seaborn"):
    bare_name, _, _ = requirement.partition("[")
    ensure(bare_name)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
from ISLP import load_data

# Render floats with thousands separators and two decimals in pandas output.
pd.options.display.float_format = "{:,.2f}".format
# Record the library versions used for this run.
print(f"Versions -> pandas: {pd.__version__} , statsmodels: {sm.__version__}")


In [None]:
# Read the College data set through ISLP and take a first look at it
college = load_data('College')
print(f"Shape: {college.shape}")
# Keep the first ten rows; the bare name on the last line makes the
# notebook render them as the cell output.
first10 = college.iloc[:10]
first10


In [None]:
# Collect every column label into a plain Python list and display it
columns = college.columns.tolist()
columns


### Brief dataset description and key variables

The `College` dataset contains information about U.S. colleges, including tuition, admissions selectivity, expenditures, and faculty characteristics.

- **Outstate**: Out-of-state tuition (USD).
- **Private**: Whether the college is private (`Yes`) or public (`No`).
- **Room.Board**: Annual room and board costs (USD).
- **PhD**: Percentage of faculty with PhD degrees.
- **Top10perc**: Percentage of new students from the top 10% of their high school class.


In [None]:
# Simple linear regression: Outstate ~ Top10perc
model_simple = smf.ols('Outstate ~ Top10perc', data=college).fit()
print(model_simple.summary())

# Coefficient report: pull the estimates (and the slope's p-value) out of the fit
intercept = model_simple.params['Intercept']
slope = model_simple.params['Top10perc']
slope_p = model_simple.pvalues['Top10perc']
print(f"\nEstimated coefficients:\n  Intercept: {intercept:,.2f}\n  Top10perc: {slope:,.2f} (USD per 1% increase)")

# Build the interpretation from the fitted numbers so the text always matches
# the estimate's sign and size. A one-predictor model adjusts for nothing, so
# the wording avoids any "holding other factors constant" claim.
direction = "higher" if slope > 0 else "lower"
print(
    f"Interpretation: each additional percentage point of Top10perc is associated with "
    f"an expected Outstate tuition that is {abs(slope):,.2f} USD {direction} "
    f"(p-value: {slope_p:.4g}). The intercept, {intercept:,.2f} USD, is the expected "
    f"Outstate when Top10perc is 0. This simple model does not adjust for any other variable."
)


In [None]:
# Multiple linear regression: Outstate ~ Top10perc + Room.Board + PhD
# Formula terms are evaluated as Python expressions, so a bare `Room.Board`
# would be read as attribute `Board` of an undefined name `Room`. Q("...")
# looks the column up by its literal name instead. The same string is reused
# below because it is also the label of the coefficient in `params`.
room_board = 'Q("Room.Board")'
model_multi = smf.ols(f'Outstate ~ Top10perc + {room_board} + PhD', data=college).fit()
print(model_multi.summary())

print("\nCoefficient interpretations (holding others constant):")
# (label shown to the reader, term name inside the fitted model)
for label, term in [('Top10perc', 'Top10perc'), ('Room.Board', room_board), ('PhD', 'PhD')]:
    print(f"  {label}: {model_multi.params[term]:,.2f} (USD change in Outstate per one-unit increase)")


In [None]:
# Quadratic term for Top10perc
# `Room.Board` must be quoted with Q("...") because formula terms are
# evaluated as Python expressions and the dot would be read as attribute access.
room_board = 'Q("Room.Board")'
# I(...) protects ** from formula syntax; this spelling is also the label the
# coefficient is stored under in `params` / `pvalues`.
quad_term = 'I(Top10perc ** 2)'
model_quad = smf.ols(f'Outstate ~ Top10perc + {quad_term} + {room_board} + PhD', data=college).fit()
print(model_quad.summary())

quad_coef = model_quad.params[quad_term]
quad_p = model_quad.pvalues[quad_term]
print(f"\nQuadratic term coefficient (Top10perc^2): {quad_coef:,.4f} (p-value: {quad_p:.4g})")

# Only claim curvature when the term is significant at the 5% level. The sign
# of the squared term says whether the marginal effect (b1 + 2*b2*Top10perc)
# shrinks or grows as Top10perc rises.
if quad_p < 0.05:
    trend = "decreases" if quad_coef < 0 else "increases"
    print(
        "Interpretation: The quadratic term is statistically significant, suggesting a nonlinear "
        f"relationship between Top10perc and Outstate; the marginal effect of Top10perc {trend} as Top10perc rises."
    )
else:
    print(
        "Interpretation: The quadratic term is not statistically significant at the 5% level, "
        "so there is no clear evidence of curvature in the Top10perc-Outstate relationship."
    )


In [None]:
# ANOVA comparison: linear vs quadratic models
# `Room.Board` must be quoted with Q("...") because formula terms are
# evaluated as Python expressions and the dot would be read as attribute access.
room_board = 'Q("Room.Board")'
model_linear = smf.ols(f'Outstate ~ Top10perc + {room_board} + PhD', data=college).fit()
# Restricted model first, then the model that adds the squared term.
anova_results = anova_lm(model_linear, model_quad)
print(anova_results)

# The second row (position 1) belongs to the second model passed in, i.e. the
# test of adding the quadratic term. Positional .iloc avoids relying on how the
# table's index happens to be labelled.
f_stat = anova_results['F'].iloc[1]
p_val = anova_results['Pr(>F)'].iloc[1]
print(f"\nF-statistic: {f_stat:,.3f}, p-value: {p_val:.4g}")
print("Conclusion:", "Quadratic term significantly improves the model." if p_val < 0.05 else "No significant improvement from quadratic term.")
