<a href="https://colab.research.google.com/github/JinzhiT/project-1/blob/main/project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Math 5750/6880: Mathematics of Data Science \\
Project 1

# 3. Python and Google Colab
Project Euler Problem 1: find the sum of all multiples of 3 or 5 below 1000  
https://projecteuler.net/

In [None]:
# Project Euler #1: add up every natural number below 1000 that is
# divisible by 3 or by 5.
multiples_of_3 = set(range(0, 1000, 3))
multiples_of_5 = set(range(0, 1000, 5))
# The set union keeps multiples of 15 from being counted twice.
total = sum(multiples_of_3 | multiples_of_5)
print(total)

# 4. Regression Analysis
California housing data  
https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the California housing dataset as pandas objects
# (features come back as a DataFrame, the target as a Series).
cal = fetch_california_housing(as_frame=True)
X = cal.data
y = cal.target
feature_names = X.columns
print(feature_names)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')


In [2]:
import os
import tarfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# -------------------------------
# 1. Extract and load the dataset
# -------------------------------

# Path to tgz file (make sure cal_housing.tgz is in the same folder as notebook)
tgz_path = "cal_housing.tgz"
data_file = "./CaliforniaHousing/cal_housing.data"

# Only unpack when the data file is not on disk yet. This keeps the cell
# idempotent: a re-run neither re-extracts nor needs the archive again.
if not os.path.exists(data_file):
    if not os.path.exists(tgz_path):
        # Fail early with an actionable message instead of tarfile's bare
        # "No such file or directory".
        raise FileNotFoundError(
            f"'{tgz_path}' was not found in {os.getcwd()!r}. "
            "Upload the archive next to this notebook "
            "(in Colab: Files panel -> Upload) and re-run this cell."
        )

    # Extract tgz
    with tarfile.open(tgz_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # The "data" filter rejects members that would escape the target
            # directory (absolute paths, "..", unsafe links) and avoids the
            # DeprecationWarning raised on Python 3.12+ when no filter is given.
            tar.extractall(path=".", filter="data")
        else:
            # Older Python without extraction filters: fall back to the
            # original behaviour. Only use archives from a trusted source.
            tar.extractall(path=".")

# Check extracted files
print("Extracted files in ./CaliforniaHousing:", os.listdir("./CaliforniaHousing"))

# Load dataset: the raw file has no header row, so column names are supplied here.
cols = [
    "Longitude", "Latitude", "HouseAge", "TotalRooms",
    "TotalBedrooms", "Population", "Households",
    "MedInc", "MedHouseVal"
]
df = pd.read_csv(data_file, header=None, names=cols).dropna()

print("Dataset shape:", df.shape)
print(df.head())

# -------------------------------
# 2. Train/test split
# -------------------------------
feature_cols = [
    "MedInc", "HouseAge", "TotalRooms", "TotalBedrooms",
    "Population", "Households", "Latitude", "Longitude"
]
X = df[feature_cols]
y = df["MedHouseVal"]

# 80/20 split with a fixed seed so results are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

def metrics(y_true, y_pred):
    """Compute regression quality metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    dict
        ``{"R2": ..., "MAE": ..., "RMSE": ...}`` where RMSE is the square
        root of the mean squared error.
    """
    return {
        "R2": r2_score(y_true, y_pred),
        "MAE": mean_absolute_error(y_true, y_pred),
        # `mean_squared_error(..., squared=False)` was deprecated in
        # scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
        # Taking the square root explicitly works on every version.
        "RMSE": float(np.sqrt(mean_squared_error(y_true, y_pred))),
    }

# -------------------------------
# 3. Models
# -------------------------------

def _train_and_test_scores(fitted_model):
    """Score an already-fitted model on the training split and the held-out split."""
    return (
        metrics(y_train, fitted_model.predict(X_train)),
        metrics(y_test, fitted_model.predict(X_test)),
    )

# Ordinary least-squares baseline
linreg = LinearRegression()
linreg.fit(X_train, y_train)
lin_train, lin_test = _train_and_test_scores(linreg)

# Random forest: 300 trees, fixed seed, all available CPU cores
rf = RandomForestRegressor(
    n_estimators=300,
    random_state=0,
    n_jobs=-1,
).fit(X_train, y_train)
rf_train, rf_test = _train_and_test_scores(rf)

# Gradient-boosted trees with default hyper-parameters and a fixed seed
gbr = GradientBoostingRegressor(random_state=0)
gbr.fit(X_train, y_train)
gbr_train, gbr_test = _train_and_test_scores(gbr)

# -------------------------------
# 4. Pick best model and plot
# -------------------------------
model_preds = {
    label: estimator.predict(X_test)
    for label, estimator in (
        ("Linear", linreg),
        ("RandomForest", rf),
        ("GradientBoosting", gbr),
    )
}
# The winner is the model with the highest R2 on the test split.
test_r2 = {label: r2_score(y_test, pred) for label, pred in model_preds.items()}
best_name = max(test_r2, key=test_r2.get)
best_pred = model_preds[best_name]

print("\nBest model based on Test R2:", best_name)

# Scatterplot of predictions against ground truth
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, best_pred, alpha=0.3)
lower = min(y_test.min(), best_pred.min())
upper = max(y_test.max(), best_pred.max())
lims = [lower, upper]
ax.plot(lims, lims, 'r--')  # 45-degree reference line: perfect predictions fall on it
ax.set_xlabel("True Median House Value (USD)")
ax.set_ylabel("Predicted Median House Value (USD)")
ax.set_title(f"Predicted vs True ({best_name})")
fig.savefig("scatter_pred_true.png", bbox_inches="tight", dpi=150)
plt.close(fig)  # written to disk only; not shown inline

# Histogram of signed errors (prediction minus truth)
errors = best_pred - y_test
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(errors, bins=40, edgecolor="black")
ax.set_xlabel("Prediction Error (USD)")
ax.set_ylabel("Frequency")
ax.set_title(f"Error Histogram ({best_name})")
fig.savefig("error_histogram.png", bbox_inches="tight", dpi=150)
plt.close(fig)

print("Saved plots: scatter_pred_true.png, error_histogram.png")

# -------------------------------
# 5. Summaries
# -------------------------------
# One row per (model, split) pair, one column per metric.
scores_by_model = {
    "Linear": (lin_train, lin_test),
    "RandomForest": (rf_train, rf_test),
    "GradientBoosting": (gbr_train, gbr_test),
}
summary = pd.DataFrame({
    (model_label, split_label): split_scores
    for model_label, (train_scores, test_scores) in scores_by_model.items()
    for split_label, split_scores in (("Train", train_scores), ("Test", test_scores))
}).T

print("\nRegression metrics (R2, MAE, RMSE):")
print(summary.round(3))

# Feature importance (best model)
if best_name not in ("RandomForest", "GradientBoosting"):
    # Linear model: refit on standardized features so that coefficient
    # magnitudes are comparable across features.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    coefs = LinearRegression().fit(X_train_scaled, y_train).coef_
    raw_importances = np.abs(coefs)
else:
    # Tree ensembles expose impurity-based importances directly.
    best_model = {"RandomForest": rf, "GradientBoosting": gbr}[best_name]
    raw_importances = best_model.feature_importances_

importances = pd.Series(raw_importances, index=feature_cols).sort_values(ascending=False)

print("\nFeature importance (best model):")
print(importances.round(4))


FileNotFoundError: [Errno 2] No such file or directory: 'cal_housing.tgz'

# 5. Classification Analysis
Diagnostic Wisconsin Breast Cancer Database  
https://scikit-learn.org/stable/datasets/toy_dataset.html#breast-cancer-dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the Breast Cancer Wisconsin dataset as pandas objects
data = load_breast_cancer(as_frame=True)
X, y = data.data, data.target    # target encoding: 0 = malignant, 1 = benign
feature_names = X.columns
label_names = {0: "malignant", 1: "benign"}
print(feature_names)

# Stratified 80/20 split so both subsets keep the original class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
    stratify=y,
)

# Standardize features. The scaler learns mean/std from the training rows
# only, and those same statistics are then applied to the test rows.
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')


In [None]:
# your code here