# Trabajo 2 - IA

Resumen aquí


In [36]:
import kagglehub
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [37]:

# Fetch only the extended CSV file from the Kaggle dataset; the returned value
# is passed straight to pandas below as the location of the file.
path = kagglehub.dataset_download(
    "therohithanand/used-car-price-prediction",
    path="used_car_price_dataset_extended.csv",
)

# Load the CSV and make sure the frame carries a fresh positional index.
df = pd.read_csv(path).reset_index(drop=True)

### Limpiando el Dataframe

In [38]:
# Discard the 'color' and 'service_history' columns.
# errors='ignore' keeps the cell re-runnable: on a second execution the columns
# are already gone and the drop becomes a no-op instead of raising KeyError.
df = df.drop(columns=['color', 'service_history'], errors='ignore')
df.head()

Unnamed: 0,make_year,mileage_kmpl,engine_cc,fuel_type,owner_count,price_usd,brand,transmission,accidents_reported,insurance_valid
0,2001,8.17,4000,Petrol,4,8587.64,Chevrolet,Manual,0,No
1,2014,17.59,1500,Petrol,4,5943.5,Honda,Manual,0,Yes
2,2023,18.09,2500,Diesel,5,9273.58,BMW,Automatic,1,Yes
3,2009,11.28,800,Petrol,1,6836.24,Hyundai,Manual,0,Yes
4,2005,12.23,1000,Petrol,2,4625.79,Nissan,Automatic,0,Yes


In [39]:
# Rank the numeric features by the absolute value of their correlation with the
# target, strongest first. The target itself is removed by label rather than by
# position, so a feature that is perfectly correlated with the price (|r| == 1)
# can never push 'price_usd' out of the first slot and get dropped in its place.
cols_priority = (
    df.corr(numeric_only=True)['price_usd']
    .abs()
    .drop(labels='price_usd')
    .sort_values(ascending=False)
    .index
)
print(cols_priority)

Index(['engine_cc', 'make_year', 'owner_count', 'mileage_kmpl',
       'accidents_reported'],
      dtype='object')


In [40]:
# Labels for the categorical variables that are encoded as numbers (1..5).
# NOTE(review): `categories` is only defined here; nothing in this cell applies it.
categories = {1: "Very Low",2: "Low",3: "Moderate",4: "High", 5: "Very High"}

# Store every object-typed column as a pandas categorical instead of generic objects.
# This runs before the target is binned, so only the original text columns are selected.
categorical_cols = df.select_dtypes(include='object').columns
df[categorical_cols] = df[categorical_cols].astype('category')


# Discretise the target into 4 classes using its quartiles as cut points.
labels = ['Very Low', 'Low', 'Moderate', 'High']

# Guard: once 'price_usd' has been binned it is categorical and quantile() would
# fail, so the binning is skipped when the cell is executed again.
if pd.api.types.is_numeric_dtype(df['price_usd']):
    quartiles_GPA = df['price_usd'].quantile([0.25, 0.5, 0.75])

    # The lowest edge stays at 0 unless the data goes below it, so that no price
    # can fall outside the bins and silently turn into NaN.
    bins = [
        min(0, df['price_usd'].min()),
        quartiles_GPA[0.25],
        quartiles_GPA[0.5],
        quartiles_GPA[0.75],
        df['price_usd'].max(),
    ]

    df['price_usd'] = pd.cut(df['price_usd'], bins=bins, labels=labels, include_lowest=True)

# Move the dependent variable to the first column for convenience.
price_serie = df['price_usd']
df = df.drop(columns=['price_usd'])
df.insert(0, 'price_usd', price_serie)

df.head()

Unnamed: 0,price_usd,make_year,mileage_kmpl,engine_cc,fuel_type,owner_count,brand,transmission,accidents_reported,insurance_valid
0,Moderate,2001,8.17,4000,Petrol,4,Chevrolet,Manual,0,No
1,Low,2014,17.59,1500,Petrol,4,Honda,Manual,0,Yes
2,High,2023,18.09,2500,Diesel,5,BMW,Automatic,1,Yes
3,Low,2009,11.28,800,Petrol,1,Hyundai,Manual,0,Yes
4,Very Low,2005,12.23,1000,Petrol,2,Nissan,Automatic,0,Yes


In [41]:
def create_df_variants(
    df: pd.DataFrame,
    outliers: bool = False,
    normalize: bool = False,
    balanced: bool = True,
    outlier_fraction: float = 0.05,
    random_state: int = 666,
) -> pd.DataFrame:
    """Build a variant of ``df`` with controlled outlier content and optional scaling.

    Outliers are detected column by column with the 1.5 * IQR rule. Only columns
    of dtype ``float64`` are inspected and rescaled; every other column is
    carried through unchanged.

    Args:
        df: Source frame. It is copied and never modified.
        outliers: When False (default), rows holding an outlier are removed.
            When True, every outlier row is kept together with a random sample
            of regular rows sized so that outliers make up ``outlier_fraction``
            of the result. If there are too few regular rows, all of them are
            kept and the outlier share ends up higher. If the data has no
            outliers at all, the whole frame is returned.
        normalize: When True, the float64 columns are rescaled with
            ``MinMaxScaler`` after the outlier step.
        balanced: Accepted for backward compatibility; it currently has no
            effect on the result.
        outlier_fraction: Target share of outlier rows when ``outliers`` is
            True. Must satisfy ``0 < outlier_fraction <= 1``.
        random_state: Seed used when sampling the regular rows.

    Returns:
        A new DataFrame holding the selected rows (original index labels kept).

    Raises:
        ValueError: If ``outlier_fraction`` is outside ``(0, 1]``.
    """
    IQR_FACTOR = 1.5  # whisker length of the IQR rule

    if not 0 < outlier_fraction <= 1:
        raise ValueError("outlier_fraction must be in the interval (0, 1]")

    # Lower and upper limits of the IQR rule for a single column.
    def iqr_bounds(values: pd.Series):
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        return q1 - IQR_FACTOR * spread, q3 + IQR_FACTOR * spread

    # Drops every row where a float64 value lies outside its column's limits.
    # Columns are filtered one after another, so the limits of a later column
    # are computed on the rows that survived the earlier columns.
    def delete_outliers(df: pd.DataFrame) -> pd.DataFrame:
        df_clean = df.copy()

        for col in df_clean.select_dtypes('float64').columns:
            lower_bound, upper_bound = iqr_bounds(df_clean[col])
            df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

        return df_clean

    # Returns all outlier rows plus a sample of regular rows, sized so that the
    # outliers represent `outlier_fraction` of the result (5% by default).
    def get_5outliers(df: pd.DataFrame) -> pd.DataFrame:
        df_copy = df.copy()

        outlier_mask = pd.Series(False, index=df_copy.index)
        for col in df_copy.select_dtypes(include='float64').columns:
            lower, upper = iqr_bounds(df_copy[col])
            outlier_mask |= (df_copy[col] < lower) | (df_copy[col] > upper)

        outlier_rows = df_copy[outlier_mask]
        if outlier_rows.empty:
            # Nothing to build the ratio around: keep the data instead of
            # returning an empty frame.
            return df_copy

        regular_rows = df_copy[~outlier_mask]

        # round() instead of int(): 0.95 / 0.05 evaluates to 18.999999999999996
        # in floating point, and truncating it would lose one regular row per
        # outlier.
        wanted = int(round(len(outlier_rows) * (1 - outlier_fraction) / outlier_fraction))
        sample_size = min(len(regular_rows), wanted)
        sample = regular_rows.sample(n=sample_size, random_state=random_state)

        return pd.concat([sample, outlier_rows])

    # Rescales the float64 columns with MinMaxScaler.
    def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
        df_norm = df.copy()
        float_cols = df_norm.select_dtypes(include='float64').columns

        # The scaler cannot be fitted on zero rows or zero columns.
        if df_norm.empty or len(float_cols) == 0:
            return df_norm

        df_norm[float_cols] = MinMaxScaler().fit_transform(df_norm[float_cols])
        return df_norm

    # Either keep the configured share of outliers or remove them all.
    df_transformed = get_5outliers(df) if outliers else delete_outliers(df)

    if normalize:
        df_transformed = normalize_df(df_transformed)

    return df_transformed



