# **Licenciatura em Ciências da Computação**

### Aprendizagem Computacional 25/26

## Types of Supervised Learning Problems

### 1. Regression
- **Target variable is continuous**
- The model predicts a numeric value
- Examples:
  - House price prediction
  - Temperature forecasting
- Common metrics:
  - Mean Squared Error (MSE)
  - Mean Absolute Error (MAE)
  - R² score

---

### 2. Binary Classification
- **Target variable has two classes**
- The model predicts one of two possible categories
- Examples:
  - Spam vs Not Spam
  - Disease vs No Disease
- Common metrics:
  - Accuracy
  - Precision / Recall
  - F1-score
  - ROC-AUC

---

### 3. Multiclass Classification
- **Target variable has more than two classes**
- The model predicts one class out of multiple categories
- Examples:
  - Handwritten digit recognition (0–9)
  - Species classification (e.g., 3 flower types)
- Common metrics:
  - Accuracy
  - Macro / Weighted Precision, Recall, F1-score

In [None]:
import pandas as pd

# Read the housing CSV straight from GitHub (requires network access).
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
df = pd.read_csv(url)

# Last expression of the cell: the notebook renders the first rows of df.
df.head()

In [None]:
# Show the distinct values present in the ocean_proximity column.
df["ocean_proximity"].unique()

In [None]:
def drop_na_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame that keeps only the fully populated rows of ``df``.

    Any row holding at least one missing value is discarded and the surviving
    rows are renumbered from 0, so the original index is not preserved.
    Intended as a simple baseline for dealing with missing data.
    """
    complete_rows = df.dropna(axis="index", how="any")
    complete_rows = complete_rows.reset_index(drop=True)
    return complete_rows

# Apply the baseline NaN removal to the loaded data.
df_no_na = drop_na_rows(df)
# Cell output: shapes before and after, which shows how many rows were dropped.
df.shape, df_no_na.shape

In [None]:
# WARNING: apply this only to the feature columns (X); never one-hot encode the label/target y!
def one_hot_encode_pandas(df: pd.DataFrame, drop_first: bool = False) -> pd.DataFrame:
    """One-hot encode every ``object``/``category`` column of ``df``.

    The detected column names are printed, then passed to
    ``pandas.get_dummies``; columns of any other dtype are carried over as-is.

    Args:
        df: Input frame; it is left unmodified.
        drop_first: Forwarded to ``get_dummies``; when true, the first level
            of each encoded column gets no indicator column.

    Returns:
        A new frame in which the selected columns are replaced by their
        indicator columns.
    """
    categorical_frame = df.select_dtypes(include=["object", "category"])
    categorical_names = categorical_frame.columns
    print('Categorical Columns: ', categorical_names)
    encoded = pd.get_dummies(df, columns=categorical_names, drop_first=drop_first)
    return encoded

# Encode the NaN-free frame; drop_first=True omits the first level of each
# encoded column.
df_ohe = one_hot_encode_pandas(df_no_na, drop_first=True)
# Cell output: shape of the encoded frame.
df_ohe.shape

In [None]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

def make_preprocessor(X: pd.DataFrame) -> ColumnTransformer:
    """Build an unfitted ColumnTransformer that imputes and encodes ``X``'s columns.

    The column groups are decided from the dtypes of ``X`` at call time:

    - numeric columns: missing values are replaced by the column median;
    - object/category/bool columns: missing values are replaced by the most
      frequent value, then the column is one-hot encoded.

    Args:
        X: Feature frame whose dtypes determine the two column groups. Only
            its dtypes and column names are read; nothing is fitted here.

    Returns:
        A ColumnTransformer that has not been fitted yet. Columns belonging to
        neither group are dropped from its output.
    """
    # Imported locally so the function does not depend on another cell having
    # imported Pipeline before this one is called.
    from sklearn.pipeline import Pipeline

    num_cols = X.select_dtypes(include=[np.number]).columns
    cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns

    numeric = SimpleImputer(strategy="median")
    categorical = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": categories not seen during fit do not raise
        # at transform time.
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ])

    return ColumnTransformer(
        transformers=[
            ("num", numeric, num_cols),
            ("cat", categorical, cat_cols),
        ],
        remainder="drop"
    )

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def regression_metrics(y_true, y_pred) -> dict:
    """Summarise regression quality for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        A dict with the keys ``"MAE"``, ``"MSE"``, ``"RMSE"`` and ``"R2"``.
        RMSE is the square root of the MSE, converted to a plain ``float``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    absolute_error = mean_absolute_error(y_true, y_pred)
    root_squared_error = float(np.sqrt(squared_error))
    r_squared = r2_score(y_true, y_pred)

    report = {
        "MAE": absolute_error,
        "MSE": squared_error,
        "RMSE": root_squared_error,
        "R2": r_squared,
    }
    return report

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

# Name of the column the regression model has to predict.
TARGET = "median_house_value"

# Separate the target from the features. The split uses df itself (not
# df_no_na or df_ohe), so any missing values and raw categorical columns are
# left for the pipeline's imputers and encoder to handle.
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Hold out 20% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the preprocessing step from the dtypes of the training features.
preprocessor = make_preprocessor(X_train)

# Chain preprocessing and a Ridge regressor so both are fitted together.
model = Pipeline([
    ("prep", preprocessor),
    ("reg", Ridge(alpha=1.0)),
])

# Fit on the training rows only, then predict the held-out rows.
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Cell output: MAE / MSE / RMSE / R2 on the test set.
regression_metrics(y_test, pred)

In [None]:
# Plot true vs. predicted values; the dashed red line marks perfect predictions (y = x).
import matplotlib.pyplot as plt
# One point per test row: x is the true target, y is the model's prediction.
plt.scatter(y_test, pred)
# Dashed red diagonal spanning the range of the true values; points on this
# line are predicted exactly.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.show()

In [None]:
# Download a second CSV from Google Drive by file id (requires network access).
file_id = "1FHH_gE2c9tWqO2yBCibMgOSdgq3hSF_7"
url = f"https://drive.google.com/uc?id={file_id}"

# NOTE: this rebinds df (and url), replacing the housing frame loaded earlier.
df = pd.read_csv(url)

# Last expression of the cell: the notebook renders the first rows of df.
df.head()

In [None]:
def drop_na_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame that keeps only the fully populated rows of ``df``.

    Any row holding at least one missing value is discarded and the surviving
    rows are renumbered from 0, so the original index is not preserved.
    Intended as a simple baseline for dealing with missing data.
    """
    complete_rows = df.dropna(axis="index", how="any")
    complete_rows = complete_rows.reset_index(drop=True)
    return complete_rows

# Drop incomplete rows from the newly loaded data and report the shapes before
# and after. The shapes are printed explicitly because this is not the last
# expression of the cell, so a bare tuple here would be evaluated and discarded.
df_no_na = drop_na_rows(df)
print(df.shape, df_no_na.shape)


def one_hot_encode_pandas(df: pd.DataFrame, drop_first: bool = False) -> pd.DataFrame:
    """One-hot encode every ``object``/``category`` column of ``df``.

    The detected column names are printed, then passed to
    ``pandas.get_dummies``; columns of any other dtype are carried over as-is.

    Args:
        df: Input frame; it is left unmodified.
        drop_first: Forwarded to ``get_dummies``; when true, the first level
            of each encoded column gets no indicator column.

    Returns:
        A new frame in which the selected columns are replaced by their
        indicator columns.
    """
    categorical_frame = df.select_dtypes(include=["object", "category"])
    categorical_names = categorical_frame.columns
    print('Categorical Columns: ', categorical_names)
    encoded = pd.get_dummies(df, columns=categorical_names, drop_first=drop_first)
    return encoded

# Encode the NaN-free frame; drop_first=True omits the first level of each
# encoded column. This rebinds df_ohe from the earlier housing cell.
df_ohe = one_hot_encode_pandas(df_no_na, drop_first=True)
# Cell output: shape of the encoded frame.
df_ohe.shape

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


print("Generating pair plot...")
# pairplot is a figure-level function that builds its own figure, so no
# plt.figure() call is made beforehand: it would only leave an extra, empty
# figure in the cell output and its figsize would not apply to the pair plot.
# NOTE(review): hue='Meta_score' gives one colour level per distinct score;
# confirm this is intended rather than a binned or categorical column.
sns.pairplot(df, hue='Meta_score', diag_kind='kde')
# The pair plot's figure is the current figure here, so the title lands on it.
plt.suptitle('Pair Plot of Movies Dataset by Meta Score', y=1.00)
plt.show()

In [None]:
# Columns to plot, one histogram each.
# NOTE(review): assumes every listed column is already numeric in df; nothing
# in the visible cells converts dtypes, so confirm against the loaded data.
numerical_cols = ['Meta_score', 'Gross', 'IMDB_Rating', 'Released_Year', 'No_of_Votes', 'Runtime']

# Two plots per row, with as many rows as the column list needs, so editing
# numerical_cols cannot overflow a fixed grid (six columns still give 3 x 2).
n_grid_cols = 2
n_grid_rows = max(1, -(-len(numerical_cols) // n_grid_cols))  # ceiling division

plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols):
    # Subplot positions are 1-based, hence i + 1.
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')

# Space the panels so titles and axis labels do not overlap.
plt.tight_layout()
plt.show()