# Trabajo 2 - IA

**TODO:** escribir aquí el resumen del trabajo.


In [1]:
import kagglehub
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, precision_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Fetch only the CSV file from the Kaggle dataset; the value returned by
# kagglehub is handed straight to pandas as the file location.
path = kagglehub.dataset_download(
    "therohithanand/used-car-price-prediction",
    path="used_car_price_dataset_extended.csv",
)

# Load the CSV and make sure the frame carries a fresh positional index.
df = pd.read_csv(path).reset_index(drop=True)

### Limpiando el DataFrame

In [3]:
# Drop the 'color' and 'service_history' columns.
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
df = df.drop(columns=['color', 'service_history'], errors='ignore')
df.head()

Unnamed: 0,make_year,mileage_kmpl,engine_cc,fuel_type,owner_count,price_usd,brand,transmission,accidents_reported,insurance_valid
0,2001,8.17,4000,Petrol,4,8587.64,Chevrolet,Manual,0,No
1,2014,17.59,1500,Petrol,4,5943.5,Honda,Manual,0,Yes
2,2023,18.09,2500,Diesel,5,9273.58,BMW,Automatic,1,Yes
3,2009,11.28,800,Petrol,1,6836.24,Hyundai,Manual,0,Yes
4,2005,12.23,1000,Petrol,2,4625.79,Nissan,Automatic,0,Yes


In [4]:
# Rank the numeric features by the absolute value of their correlation with
# the target. The target's own entry is removed by label rather than by
# position ([1:]), so the result does not depend on 'price_usd' happening to
# sort first (e.g. when another column is perfectly correlated with it).
target_corr = df.corr(numeric_only=True)['price_usd'].abs().drop('price_usd')
cols_priority = target_corr.sort_values(ascending=False).index
print(cols_priority.values)

['engine_cc' 'make_year' 'owner_count' 'mileage_kmpl' 'accidents_reported']


In [5]:
# Label map for numeric codes 1-5. It is only defined here; nothing in this
# cell applies it to a column.
categories = {1: "Very Low",2: "Low",3: "Moderate",4: "High", 5: "Very High"}

# Store every text (object) column with the pandas 'category' dtype.
categorical_cols = df.select_dtypes(include='object').columns

df[categorical_cols] = df[categorical_cols].astype('category')


# Discretise the target into 4 classes delimited by its quartiles.
quartiles_GPA = df['price_usd'].quantile([0.25, 0.5, 0.75])

# The lowest edge is 0 for ordinary (positive) prices, exactly as before, but
# it is lowered to the data minimum if a negative value is present so that
# such rows land in the first class instead of silently becoming NaN.
lowest_edge = min(0, df['price_usd'].min())
bins = [lowest_edge, quartiles_GPA[0.25], quartiles_GPA[0.5], quartiles_GPA[0.75], df['price_usd'].max()]
labels = ['Very Low', 'Low', 'Moderate', 'High']

# include_lowest=True makes the first interval closed on the left, so a value
# equal to the lowest edge is still assigned to 'Very Low'.
df['price_usd'] = pd.cut(df['price_usd'], bins=bins, labels=labels, include_lowest=True)

# Move the target to the first column for convenience.
price_serie = df['price_usd']
df = df.drop(columns=['price_usd'])
df.insert(0, 'price_usd', price_serie)

df.head()


Unnamed: 0,price_usd,make_year,mileage_kmpl,engine_cc,fuel_type,owner_count,brand,transmission,accidents_reported,insurance_valid
0,Moderate,2001,8.17,4000,Petrol,4,Chevrolet,Manual,0,No
1,Low,2014,17.59,1500,Petrol,4,Honda,Manual,0,Yes
2,High,2023,18.09,2500,Diesel,5,BMW,Automatic,1,Yes
3,Low,2009,11.28,800,Petrol,1,Hyundai,Manual,0,Yes
4,Very Low,2005,12.23,1000,Petrol,2,Nissan,Automatic,0,Yes


In [6]:
def create_df_variants(df: pd.DataFrame, outliers: bool = False, balanced: bool = True) -> pd.DataFrame:
    """
    Build a variant of ``df`` with a controlled amount of outliers.

    A row counts as an outlier when any of its ``float64`` columns falls
    outside the fences ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` of that column.

    Parameters:
    - df: input DataFrame. It is copied; the caller's frame is not modified.
    - outliers: if True, existing outlier rows are replicated (with a small
      amount of noise) until the number of outlier rows reaches 5% of the
      original row count; if False, every outlier row is removed.
    - balanced: kept for interface compatibility. NOTE: this function does
      not use it; no class balancing is performed here.

    Returns:
    - The transformed DataFrame.
    """
    def iqr_bounds(series: pd.Series):
        """Return the (lower, upper) 1.5 * IQR fences of ``series``."""
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        return q1 - 1.5 * iqr, q3 + 1.5 * iqr

    def delete_outliers(df: pd.DataFrame) -> pd.DataFrame:
        """
        Remove the rows whose value in any float64 column lies outside the
        fences. Columns are processed one after another, so the fences of a
        column are computed on the rows that survived the previous columns.
        """
        df_clean = df.copy()
        float_cols = df_clean.select_dtypes('float64').columns

        for col in float_cols:
            lower_bound, upper_bound = iqr_bounds(df_clean[col])
            df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

        return df_clean

    def get_5outliers(df: pd.DataFrame, random_state: int = 666) -> pd.DataFrame:
        """
        Detect the outlier rows of ``df`` and append noisy replicas of them
        until the number of outlier rows equals 5% of the original row count.

        Parameters:
        - df: input DataFrame.
        - random_state: seed used both for sampling the rows to replicate and
          for the noise added to them.

        Returns:
        - DataFrame (with a fresh 0..n-1 index) containing the original rows
          followed by the synthetic ones. When the frame already holds at
          least 5% outliers, or holds none to replicate, the original rows
          are returned without additions.
        """
        df_copy = df.copy()
        float_cols = df_copy.select_dtypes(include='float64').columns

        outlier_mask = pd.Series(False, index=df_copy.index)
        for col in float_cols:
            lower, upper = iqr_bounds(df_copy[col])
            outlier_mask |= (df_copy[col] < lower) | (df_copy[col] > upper)

        outliers = df_copy[outlier_mask]

        target_outliers = int(np.floor(0.05 * len(df_copy)))
        needed = target_outliers - len(outliers)

        # Nothing to add (already >= 5%) or nothing to replicate: sampling
        # would raise ValueError in both situations, so return early.
        if needed <= 0 or outliers.empty:
            return df_copy.reset_index(drop=True)

        synth = outliers.sample(n=needed, replace=True, random_state=random_state)

        # A single generator is shared by all columns. Re-seeding it per
        # column would give every column the same (perfectly correlated)
        # sequence of draws.
        rng = np.random.RandomState(random_state)
        for col in float_cols:
            # Noise scale: 1% of the column's standard deviation.
            sigma = df_copy[col].std() * 0.01
            synth[col] = synth[col] + rng.normal(0, sigma, size=needed)

        return pd.concat([df_copy, synth], ignore_index=True)

    df_transformed = df.copy()

    # Either top up to 5% outliers or remove all of them.
    df_transformed = get_5outliers(df_transformed) if outliers else delete_outliers(df_transformed)

    return df_transformed

def divide_and_normalize(df: pd.DataFrame, normalize: bool = False):
    """
    Split a DataFrame into train and test partitions (80% / 20%).

    The 'price_usd' column is the target; every other column is a feature.
    Features are one-hot encoded with ``pd.get_dummies(drop_first=True)``
    before the split.

    Parameters:
    - df: input DataFrame containing a 'price_usd' column.
    - normalize: if True, the features are standardised with a
      StandardScaler fitted on the training partition only.

    Returns:
    - X_train, X_test, y_train, y_test
    """
    target = df['price_usd']
    features = pd.get_dummies(df.drop(columns='price_usd'), drop_first=True)

    # Fixed seed so the partition is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    if not normalize:
        return X_train, X_test, y_train, y_test

    # The scaler learns mean/variance from the training rows and the same
    # transformation is then applied to the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test



## KNN

In [None]:


# Target and feature matrix.
y = df['price_usd']
X = df.drop(columns='price_usd')

# Hold out 20% of the rows for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column groups by dtype; the target is removed from the categorical group.
num_features = df.select_dtypes(include='number').columns
cat_features = df.select_dtypes(include='category').columns.drop('price_usd')

# Numeric columns are standardised, categorical columns are one-hot encoded.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(drop='first'), cat_features),
])

# Preprocessing and classifier live in one pipeline so the grid search
# treats them as a single estimator.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('knn', KNeighborsClassifier()),
])

# Candidate values for the number of neighbours: 2..49.
param_grid = {'knn__n_neighbors': list(range(2, 50))}

# 10-fold cross-validated search scored by accuracy.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='accuracy', cv=10)
grid.fit(X_train, y_train)

print("Mejores parámetros encontrados:", grid.best_params_)
print(f"Mejor score CV: {grid.best_score_:.3f}")

# Evaluate the selected model on the held-out rows.
y_pred = grid.predict(X_test)
print("\nReporte en test set:\n", classification_report(y_test, y_pred))


Mejores parámetros encontrados: {'knn__n_neighbors': 29}
Mejor score CV: 0.635

Reporte en test set:
               precision    recall  f1-score   support

        High       0.84      0.76      0.80       489
         Low       0.51      0.54      0.52       512
    Moderate       0.49      0.48      0.48       474
    Very Low       0.77      0.80      0.78       525

    accuracy                           0.65      2000
   macro avg       0.65      0.65      0.65      2000
weighted avg       0.65      0.65      0.65      2000

Con outliers
