## EDA documento .JSON

Luis Angel Garcia (2230177)

## Librerías

In [None]:
import pandas as pd
import logging
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import jsonlines
import pandas as pd
import ast 
from IPython.display import display
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Add ../config_documents to the module search path so imports can find it.
# NOTE(review): the path is relative, so it presumably depends on the
# notebook's working directory — confirm.
sys.path.append("../config_documents")

## Configuración del logging

In [None]:
# Detach every handler currently attached to the root logger. basicConfig()
# is a no-op while the root logger still has handlers, so this is what lets
# the configuration below take effect when the cell is executed again.
root_logger = logging.root
for stale_handler in list(root_logger.handlers):
    root_logger.removeHandler(stale_handler)

# Send INFO-and-above records to stdout with a timestamped format.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    handlers=[stdout_handler],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

## Cargar archivo

In [None]:
# Read every record of the JSON Lines dump into memory.
with jsonlines.open("../source/MLA_100k.jsonlines") as reader:
    data = list(reader)
df = pd.DataFrame(data)

# Persist the raw table as CSV and load it back from that file, so the rest
# of the notebook works on the CSV representation of the data.
df.to_csv("../source/productos.csv", index=False)

df = pd.read_csv("../source/productos.csv")


In [None]:
# DataFrame.info() prints the summary itself and returns None, so it is
# called directly; wrapping it in display() only added a stray "None" output.
df.info()

In [None]:
# Columns whose cells hold nested structures (dicts / lists) stored as text.
columnas_analizadas = [
    'seller_address', 'shipping', 'geolocation',
    'non_mercado_pago_payment_methods', 'pictures',
    'attributes', 'descriptions'
]


def _parse_literal(value):
    """Turn the text form of a nested cell back into a Python object.

    Values that are already a dict or a list are returned untouched, so this
    cell can be executed again on an already-parsed DataFrame without
    crashing. Null cells become None.
    """
    if isinstance(value, (dict, list)):
        return value
    return ast.literal_eval(value) if pd.notnull(value) else None


def _dig(record, *keys):
    """Follow ``keys`` through nested dicts.

    Returns None as soon as a level is missing, is None, or is not a dict,
    instead of raising AttributeError on an explicit null in the data.
    """
    current = record
    for key in keys:
        if not isinstance(current, dict):
            return None
        current = current.get(key)
    return current


def _list_length(value):
    """Return the number of items in ``value`` when it is a list, otherwise 0."""
    return len(value) if isinstance(value, list) else 0


for col in columnas_analizadas:
    df[col] = df[col].apply(_parse_literal)

# New column -> (source column, key path inside the nested dict).
# Insertion order is the order in which the columns are added to df.
campos_anidados = {
    'latitude': ('seller_address', 'latitude'),
    'longitude': ('seller_address', 'longitude'),
    'country': ('seller_address', 'country', 'name'),
    'state': ('seller_address', 'state', 'name'),
    'city': ('seller_address', 'city', 'name'),
    'neighborhood': ('seller_address', 'search_location', 'neighborhood', 'name'),
    'free_shipping': ('shipping', 'free_shipping'),
    'shipping_mode': ('shipping', 'mode'),
    'local_pick_up': ('shipping', 'local_pick_up'),
    'geo_lat': ('geolocation', 'latitude'),
    'geo_lon': ('geolocation', 'longitude'),
}
for destino, (origen, *ruta) in campos_anidados.items():
    df[destino] = df[origen].apply(_dig, args=tuple(ruta))

# Simple size-based features; anything that is not a list / str counts as 0.
df['num_pictures'] = df['pictures'].apply(_list_length)
df['num_non_mp_methods'] = df['non_mercado_pago_payment_methods'].apply(_list_length)
df['title_length'] = df['title'].apply(lambda x: len(x) if isinstance(x, str) else 0)

df['price_diff'] = df['price'] - df['base_price']

# Columns kept in the flat dataset written to disk (geo_lat / geo_lon are
# computed above but are not part of this selection).
columnas_finales = [
    'id', 'title', 'condition', 'price', 'base_price', 'price_diff',
    'sold_quantity', 'available_quantity', 'accepts_mercadopago',
    'free_shipping', 'shipping_mode', 'local_pick_up',
    'latitude', 'longitude', 'country', 'state', 'city', 'neighborhood',
    'num_pictures', 'num_non_mp_methods', 'title_length'
]

df_final = df[columnas_finales].copy()

df_final.to_csv("../source/products_dataset.csv", index=False)

## Análisis básico

In [None]:
# Lift pandas' display limits so no column or row is elided in the output.
for opcion in ('display.max_columns', 'display.width', 'display.max_rows'):
    pd.set_option(opcion, None)

# display() on a str renders its repr (quotes and an escaped newline), so the
# caption is printed and only the DataFrame goes through display().
print('Primeras filas del dataset:\n')
display(df_final.head())

# NOTE(review): the summaries below inspect the full `df`, not `df_final` —
# confirm that is intended.
print(df.dtypes)
for columna in ('condition', 'shipping_mode'):
    print(df[columna].unique())
print(df.isnull().sum())

## Gráficas

In [None]:
# Bar chart with the number of rows per value of the 'condition' column.
ax = sns.countplot(data=df, x='condition')
ax.set_title('Distribución de la variable objetivo: Condition')
plt.show()

In [None]:
# Pairwise correlation between the float / int columns of df.
numerical_columns = df.select_dtypes(include=[float, int]).columns
correlation_matrix = df.loc[:, numerical_columns].corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
)
ax.set_title('Matriz de Correlación de Variables Numéricas')
plt.show()

In [None]:
precio = df['price']

# Box plot of the price column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=precio, ax=ax)
ax.set_title('Boxplot de Price')
plt.show()

# Price of each row plotted against its position in the index.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=df.index, y=precio, alpha=0.6, ax=ax)
ax.set_title('Dispersión de Price')
plt.show()

In [None]:
# Scatter plot comparing 'price' (x axis) with 'base_price' (y axis).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=df['price'], y=df['base_price'], ax=ax)
ax.set_title('Relación entre Price y Base Price')
ax.set_xlabel('Price')
ax.set_ylabel('Base Price')
plt.show()

## Limpieza y transformación

In [None]:
def clean_columns(df_final):
    """Return a new DataFrame without the id, title, pick-up and coordinate columns.

    Columns from the list that are absent from ``df_final`` are skipped
    (``errors='ignore'``). The input frame itself is left unmodified.
    """
    droppable = ('id', 'title', 'local_pick_up', 'latitude', 'longitude')
    return df_final.drop(columns=list(droppable), errors='ignore')

def encode_condition(df_cleaned):
    """Encode the ``condition`` column as integer labels.

    The column is overwritten on ``df_cleaned`` itself (the frame is modified
    in place) and the same frame is returned. Numeric input (1.0 / 0.0) is
    first translated to the 'new' / 'used' labels, then every label is
    converted to an integer code with ``LabelEncoder``.

    NOTE(review): LabelEncoder numbers classes in sorted order, so this
    presumably yields new=0 and used=1 — confirm before interpreting the codes.
    """
    # Only genuinely numeric flags need translating back to labels. The former
    # "dtype != 'object'" check was also true for pandas' dedicated string
    # dtypes, which sent text labels through the mapping and turned them all
    # into NaN.
    if pd.api.types.is_numeric_dtype(df_cleaned['condition']):
        df_cleaned['condition'] = df_cleaned['condition'].map({1.0: 'new', 0.0: 'used'})

    le = LabelEncoder()
    df_cleaned['condition'] = le.fit_transform(df_cleaned['condition'])

    df_cleaned['condition'] = df_cleaned['condition'].astype('int')

    print(f"Valores únicos en 'condition' después de la codificación: {df_cleaned['condition'].unique()}")
    print(f"Tipo de 'condition' después de la conversión: {df_cleaned['condition'].dtype}")

    return df_cleaned

def encode_categorical_columns(df_cleaned):
    """Replace a fixed set of categorical columns with integer category codes.

    Each listed column is cast to pandas' ``category`` dtype and overwritten
    with its ``cat.codes`` (pandas assigns -1 to missing values). The frame is
    modified in place and returned. All listed columns must be present.
    """
    targets = (
        'shipping_mode', 'accepts_mercadopago', 'free_shipping',
        'country', 'state', 'city', 'neighborhood',
    )
    for name in targets:
        as_category = df_cleaned[name].astype('category')
        df_cleaned[name] = as_category.cat.codes
    return df_cleaned

def scale_features(df_cleaned, numerical_columns):
    """Standardise the given columns of ``df_cleaned`` with ``StandardScaler``.

    Args:
        df_cleaned: DataFrame whose columns are overwritten in place.
        numerical_columns: Labels of the columns to scale (list or Index).

    Returns:
        The same ``df_cleaned`` object, with the selected columns replaced by
        their scaled values. When no columns are selected it is returned
        unchanged.
    """
    # StandardScaler rejects an input with zero features, so an empty
    # selection is treated as "nothing to do" rather than an error.
    if len(numerical_columns) == 0:
        return df_cleaned

    scaler = StandardScaler()
    df_cleaned[numerical_columns] = scaler.fit_transform(df_cleaned[numerical_columns])
    return df_cleaned

def preprocess_data(df_final):
    """Run the cleaning / encoding / scaling pipeline and split features from target.

    Steps: drop helper columns, encode ``condition`` as integer labels, encode
    the categorical columns as integer codes, standardise the numeric feature
    columns, and write the result to '../source/dataset_limpio.csv'.

    Args:
        df_final: Flat products DataFrame; it must contain a 'condition' column.

    Returns:
        Tuple ``(X, y)``: ``X`` is the processed frame without 'condition',
        ``y`` is the integer-encoded 'condition' column.
    """
    df_cleaned = clean_columns(df_final)

    df_cleaned = encode_condition(df_cleaned)

    df_cleaned = encode_categorical_columns(df_cleaned)

    # Scale the features only. 'condition' is the target: standardising it
    # turns the integer labels into non-integer z-scores, and the integer cast
    # further down would then truncate them, which can corrupt the labels
    # depending on the class balance.
    numerical_columns = df_cleaned.select_dtypes(include=np.number).columns
    feature_columns = numerical_columns.drop('condition', errors='ignore')

    if len(feature_columns) > 0:
        df_cleaned = scale_features(df_cleaned, feature_columns)

    # Kept as a safeguard so the target is always stored with an integer dtype.
    df_cleaned['condition'] = df_cleaned['condition'].astype(int)

    X = df_cleaned.drop(columns=['condition'])
    y = df_cleaned['condition']

    df_cleaned.to_csv('../source/dataset_limpio.csv', index=False)

    print(f"Tipo de 'condition' después de guardar como CSV: {df_cleaned['condition'].dtype}")

    return X, y



# Run the preprocessing pipeline on df_final; X receives the feature columns
# and y the encoded 'condition' column returned by preprocess_data.
X, y = preprocess_data(df_final)


