In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [11]:
df = pd.read_csv("/Users/lucas/Documents/Lucas /projetos/cars/data/raw.csv")

In [12]:
#Ajustando nome das colunas com snake_case e renomeando:
df.columns = df.columns.str.lower().str.replace(" ", "_")
df.rename(columns = {'year' : 'car_year', 'make' : 'manufacturer', 'trim' : 'version', 'vin' : 'chassis_number', 'mmr' : 'valuation', 'sellingprice' : 'selling_price', 'saledate' : 'sale_date'}, inplace = True)

In [13]:
def lower_transform(df:pd.DataFrame)->pd.DataFrame:
    for coluna in df.columns:
        if df[coluna].dtype == 'object':
            df[coluna] = df[coluna].str.lower()
    return df

In [14]:
def clean_df(df:pd.DataFrame)->pd.DataFrame:
    df.drop(df[df['transmission'] == 'sedan'].index, axis='index', inplace=True)
 
    df.loc[df['state'].apply(lambda x: len(x) > 3), 'state'] = 'other'
    
    df['interior'].replace('—', 'other', inplace=True)
    
    df.sort_values(by=['model', 'version'], inplace=True)
    
    df['transmission'].fillna(method='ffill', inplace=True)
    df['body'].fillna(method='ffill', inplace=True)
    
    df.dropna(axis=0, inplace=True)
    
    df['color'] = df['color'].apply(lambda x: x if isinstance(x, str) else 'other')
    
    df['color'].replace('—', 'other', inplace=True)
    
    meses_dict = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
        'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
        'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
    }
    df['week_day'] = df['sale_date'].str.slice(0, 3)
    df['month'] = df['sale_date'].str.slice(4, 7).map(meses_dict)
    df['day'] = df['sale_date'].str.slice(8, 10).astype(int)
    df['year'] = df['sale_date'].str.slice(11, 15).astype(int)
    
    df.drop('sale_date', axis=1, inplace=True)
    return df

In [15]:
def normalize_data(df:pd.DataFrame)->pd.DataFrame:
    """
    Normaliza os dados em um DataFrame usando a técnica de normalização min-max.
    
    Parâmetros:
        df (DataFrame): DataFrame contendo os dados a serem normalizados.
        
    Retorna:
        DataFrame: DataFrame com os dados normalizados.
    """
  
    df_normalized = df.copy()
    

    for column_name in df.columns:
       
        if df[column_name].dtype in ['int64', 'float64']:
            
            min_value = df[column_name].min()
            max_value = df[column_name].max()
            
            
            df_normalized[column_name] = (df[column_name] - min_value) / (max_value - min_value)
    
    return df_normalized


In [16]:
def label_encode_categoricals(df:pd.DataFrame)->pd.DataFrame:
    """
    Aplica a codificação de rótulos para colunas categóricas em um DataFrame.

    Parâmetros:
        df: DataFrame
            O DataFrame contendo as colunas categóricas a serem codificadas.

    Retorna:
        df_encoded: DataFrame
            O DataFrame com as colunas categóricas codificadas.
    """
    df_encoded = df.copy()
    label_encoder = LabelEncoder()
    for col in df_encoded.columns:
        if df_encoded[col].dtype == 'object':
            df_encoded[col] = label_encoder.fit_transform(df_encoded[col])
    return df_encoded

df_encoded = label_encode_categoricals(df)



In [17]:
def create_abt(df:pd.DataFrame):
    abt = lower_transform(df)
    abt = clean_df(df=abt)
    abt = normalize_data(df=abt)
    abt = label_encode_categoricals(df=abt)
    abt.to_csv('../data/abt.csv', index=False)


In [18]:
create_abt(df=df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['interior'].replace('—', 'other', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['transmission'].fillna(method='ffill', inplace=True)
  df['transmission'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will ne