# EXAMEN PARCIAL CC442 - SOLUCIÓN COMPLETA
## Minería de Datos (8vo Ciclo)
### Problemas a resolver:
1. Automobile Accidents (Naive Bayes)
2. Toyota Corolla Prices (Regresión Lineal Múltiple)
3. Spam Detection (LDA)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, mean_squared_error, r2_score
import warnings
# Install a blanket "ignore" filter so the notebook output stays compact.
# NOTE: this hides every warning category, including library deprecation
# and numerical warnings, so remove it while debugging.
warnings.simplefilter("ignore")
print("Importaciones completadas")

---
# PROBLEMA 1: AUTOMOBILE ACCIDENTS
## Clasificación con Naive Bayes

In [None]:
# Load the accident records and derive the binary target INJURY:
# "yes" when MAX_SEV_IR equals 1 or 2, "no" for any other value.
accidents_df = pd.read_csv("AccidentsFull.csv")
print(f"Dataset: {accidents_df.shape}")

injury_flag = accidents_df["MAX_SEV_IR"].isin([1, 2])
accidents_df["INJURY"] = injury_flag.map({True: "yes", False: "no"})

# Class balance of the derived target.
print(accidents_df["INJURY"].value_counts())

In [None]:
# Naive Bayes on the first 12 accident records only.
# The model is fitted and evaluated on the same 12 rows, so the output
# illustrates the mechanics and is not an estimate of real accuracy.
sample_12 = accidents_df[["INJURY", "WEATHER_R", "TRAF_CON_R"]].head(12)

# `columns=` forces one-hot encoding of both predictors whatever their dtype.
# Without it, get_dummies only encodes object/string/category columns, so
# predictors stored as integer codes would reach MultinomialNB as counts.
X_12 = pd.get_dummies(
    sample_12[["WEATHER_R", "TRAF_CON_R"]],
    columns=["WEATHER_R", "TRAF_CON_R"],
)
y_12 = sample_12["INJURY"]

nb = MultinomialNB(alpha=0.01)
nb.fit(X_12, y_12)
proba = nb.predict_proba(X_12)
pred = nb.predict(X_12)

# predict_proba columns follow nb.classes_. Look each class up by label so
# the printout is correct, and does not fail with an IndexError, when only
# one of the two classes appears in these 12 rows.
proba_by_class = {label: proba[:, col] for col, label in enumerate(nb.classes_)}
absent_class = np.zeros(len(pred))
p_no = proba_by_class.get("no", absent_class)
p_yes = proba_by_class.get("yes", absent_class)

print("Predicciones con Naive Bayes (12 registros):")
for i, (prob_no, prob_yes, predicted, actual) in enumerate(zip(p_no, p_yes, pred, y_12)):
    print(f"{i}: P(no)={prob_no:.4f}, P(yes)={prob_yes:.4f}, Pred={predicted}, Real={actual}")

In [None]:
# Naive Bayes on the full accident data with a 60/40 train/validation split.

# `columns=` forces one-hot encoding of both predictors whatever their dtype.
# Without it, get_dummies only encodes object/string/category columns, so
# predictors stored as integer codes would reach MultinomialNB as counts.
# Encoding before the split keeps the dummy columns identical in both parts.
X_full = pd.get_dummies(
    accidents_df[["WEATHER_R", "TRAF_CON_R"]],
    columns=["WEATHER_R", "TRAF_CON_R"],
)
y_full = accidents_df["INJURY"]

# Stratify on the target so both partitions keep the same yes/no ratio;
# the fixed seed makes the partition reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.4, random_state=1, stratify=y_full
)

print(f"Entrenamiento: {len(X_train)}")
print(f"Validación: {len(X_val)}")

nb_full = MultinomialNB(alpha=0.01)
nb_full.fit(X_train, y_train)

y_pred = nb_full.predict(X_val)
# Explicit label order: rows/columns of the matrix are ["no", "yes"].
cm = confusion_matrix(y_val, y_pred, labels=["no", "yes"])

print("\nMatriz de Confusión:")
print(cm)
print(f"\nAccuracy: {accuracy_score(y_val, y_pred):.4f}")

---
# PROBLEMA 2: TOYOTA COROLLA PRICES
## Regresión Lineal Múltiple

In [None]:
# Load the Toyota Corolla listings and build the regression inputs.
toyota_df = pd.read_csv("ToyotaCorolla.csv")
print(f"Dataset: {toyota_df.shape}")

predictores = [
    "Age_08_04",
    "KM",
    "Fuel_Type",
    "HP",
    "Automatic",
    "Doors",
    "Quarterly_Tax",
    "Mfr_Guarantee",
    "Guarantee_Period",
    "Airco",
    "Automatic_airco",
    "CD_Player",
    "Powered_Windows",
    "Sport_Model",
    "Tow_Bar",
]

# Keep the target plus the chosen predictors and discard incomplete rows.
toyota_clean = toyota_df.loc[:, ["Price", *predictores]].dropna()
print(f"Datos limpios: {toyota_clean.shape}")

# One-hot encode Fuel_Type; drop_first removes one level so the dummy
# columns are not redundant with the intercept.
toyota_encoded = pd.get_dummies(toyota_clean, columns=["Fuel_Type"], drop_first=True)
y_toyota = toyota_encoded["Price"]
X_toyota = toyota_encoded.drop(columns="Price")

print(f"Variables: {X_toyota.shape}")

In [None]:
# 70/30 train/validation split with a fixed seed, then an ordinary
# least-squares fit on the training part.
X_train_t, X_val_t, y_train_t, y_val_t = train_test_split(
    X_toyota,
    y_toyota,
    test_size=0.3,
    random_state=1,
)

lr = LinearRegression().fit(X_train_t, y_train_t)

y_pred_train = lr.predict(X_train_t)
y_pred_val = lr.predict(X_val_t)

# R² is computed from the stored predictions; the validation RMSE is in
# the same units as Price.
r2_train = r2_score(y_train_t, y_pred_train)
r2_val = r2_score(y_val_t, y_pred_val)
mse_val = mean_squared_error(y_val_t, y_pred_val)

print(f"R² Train: {r2_train:.4f}")
print(f"R² Val: {r2_val:.4f}")
print(f"RMSE Val: {np.sqrt(mse_val):,.2f}")

In [None]:
# Rank the fitted coefficients by absolute size, largest first.
# NOTE: the predictors are not standardised, so each coefficient is
# expressed in the units of its own column.
coef_df = pd.DataFrame({"Predictor": X_toyota.columns, "Coef": lr.coef_})
coef_df = coef_df.sort_values(by="Coef", key=lambda col: col.abs(), ascending=False)

print("Top 5 predictores:")
print(coef_df.iloc[:5])

---
# PROBLEMA 3: SPAM DETECTION
## Linear Discriminant Analysis

In [None]:
# Load the spam data and rank predictors by how far their class averages
# differ between spam (Spam == 1) and non-spam (Spam == 0) rows.
spam_df = pd.read_csv("spambase.csv")
print(f"Dataset: {spam_df.shape}")

y_spam = spam_df["Spam"]
X_spam = spam_df.drop(columns="Spam")

# Per-predictor mean within each class.
spam_class = X_spam.loc[y_spam == 1].mean()
nonspam_class = X_spam.loc[y_spam == 0].mean()

# Absolute gap between the two class means, largest first.
diff = (spam_class - nonspam_class).abs().sort_values(ascending=False)

print("Top 11 predictores que diferencian spam vs no-spam:")
top_11 = diff.iloc[:11]
for rank, (name, gap) in enumerate(top_11.items(), start=1):
    print(f"{rank:2d}. {name:20s}: {gap:.6f}")

In [None]:
# Fit LDA using only the 11 predictors with the largest class-mean gap
# and evaluate it on a stratified 30% validation split.
top_11_preds = diff.index[:11].tolist()
X_spam_selected = X_spam.loc[:, top_11_preds]

X_train_spam, X_val_spam, y_train_spam, y_val_spam = train_test_split(
    X_spam_selected,
    y_spam,
    test_size=0.3,
    random_state=1,
    stratify=y_spam,
)

lda = LinearDiscriminantAnalysis().fit(X_train_spam, y_train_spam)
y_pred_spam = lda.predict(X_val_spam)

# Rows are the true classes and columns the predicted ones, in sorted
# label order.
cm_spam = confusion_matrix(y_val_spam, y_pred_spam)
acc_spam = accuracy_score(y_val_spam, y_pred_spam)
report_spam = classification_report(
    y_val_spam, y_pred_spam, target_names=["No-Spam", "Spam"]
)

print("Matriz de Confusión LDA:")
print(cm_spam)
print(f"\nAccuracy: {acc_spam:.4f}")
print(report_spam)

---
## RESUMEN FINAL
Examen Parcial CC442 - Todos los problemas resueltos