## **Librerías**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Show every column when a DataFrame is rendered. None is the documented
# "no limit" value; False was only accepted because bool is an int (read as 0).
pd.options.display.max_columns = None

In [3]:
from datetime import datetime, timedelta
import winsound as cst

In [4]:
import sys
# Add the working directory and its parent to the module search path
# (both are relative to the process's current working directory).
sys.path.extend(['./', '../'])

In [None]:
import xml.etree.ElementTree as ET
import hashlib

In [97]:
import warnings
warnings.filterwarnings('ignore')

## **Variables Globales**

In [5]:
# Take a single timestamp so that `dia` and `mes` always describe the same
# instant (two separate now() calls could straddle midnight / month end).
_now = datetime.now()
dia = int(_now.strftime('%Y%m%d'))  # current date as integer YYYYMMDD
mes = int(_now.strftime('%Y%m'))    # current month as integer YYYYMM

## **Datos**

In [7]:
# Relative location of the XML feed that is parsed in the next cell
path = "../Data/Raw/feed.xml"

In [8]:
# Load the XML file and keep a handle on its root element
tree = ET.parse(source=path)
root = tree.getroot()

In [11]:
# Child tags read from every <listing>, in the order they become record keys
_LISTING_TAGS = (
    "state",
    "city",
    "colony",
    "street",
    "external_num",
    "code",
    "type",
    "purpose",
    "price",
    "mail_contact",
    "phone_contact",
)

# One dict per <listing>; every value is the raw element text except
# "price", which is converted to a number.
# NOTE(review): a listing that lacks one of these tags raises AttributeError
# here (find() returns None) — confirm the feed always provides all of them.
data = []
for listing in root.findall("listing"):
    data.append({
        tag: float(listing.find(tag).text) if tag == "price" else listing.find(tag).text
        for tag in _LISTING_TAGS
    })

In [12]:
# Build the raw table: one row per parsed listing, one column per tag
df_raw = pd.DataFrame(data=data)

In [128]:
# Write df_raw to disk as a pipe-delimited UTF-8 CSV without the index column.
# NOTE(review): this cell's execution count (128) is later than the cleaning
# cells further down, so the saved file may have been written after the
# columns were upper-cased and PROPERTIE_ID was added; a fresh top-to-bottom
# run would export the raw lower-case columns instead — confirm which is wanted.
df_raw.to_csv(
    '../Data/Processed/Habi_BBDD.csv',
    sep='|',
    encoding='utf-8',
    index=False,
)

## **Funciones**

In [65]:
def generate_numeric_id(
    df: pd.DataFrame,
    dimensions: list,
    id_column: str = "unique_id",
    n_digits: int = 10,
) -> pd.DataFrame:
    """
    Add a deterministic numeric ID column derived from several columns.

    For every row, the values of ``dimensions`` are converted with ``str()``
    (exactly as they appear in the row handed over by
    ``DataFrame.apply(axis=1)``), joined with ``"_"`` and hashed with SHA-256.
    The first ``n_digits`` hexadecimal digits of the digest are read as an
    integer and reduced modulo ``10**n_digits``.

    The same inputs always produce the same ID, but because the digest is
    truncated, two different rows CAN receive the same ID. Check
    ``df[id_column].is_unique`` when uniqueness matters.

    The column is added to ``df`` in place and the same object is returned.

    :param df: pandas DataFrame holding the data.
    :param dimensions: Names of the columns combined to build the ID.
    :param id_column: Name of the column that receives the ID.
    :param n_digits: Maximum number of decimal digits of the ID. Must be
        between 1 and 18 so that every value fits in int64. The default (10)
        reproduces the IDs generated before this parameter existed.
    :return: ``df`` itself, with ``id_column`` added.
    :raises ValueError: If ``dimensions`` is empty, if any dimension is not a
        column of ``df``, or if ``n_digits`` is outside 1..18.
    """
    # With no dimensions every row would hash the empty string and share one ID.
    if not dimensions:
        raise ValueError("Se requiere al menos una dimensión para generar el ID")
    if not all(dim in df.columns for dim in dimensions):
        raise ValueError("Algunas dimensiones no existen en el DataFrame")
    if not 1 <= n_digits <= 18:
        raise ValueError("n_digits debe estar entre 1 y 18")

    modulus = 10 ** n_digits

    def hash_function(row):
        combined = "_".join(str(row[dim]) for dim in dimensions)  # concatenate values
        hash_hex = hashlib.sha256(combined.encode()).hexdigest()  # hex digest
        return int(hash_hex[:n_digits], 16) % modulus  # truncate to n_digits decimals

    if len(df) == 0:
        # A row-wise apply over an empty frame cannot infer an integer dtype,
        # so create the empty int64 column explicitly.
        df[id_column] = pd.Series(index=df.index, dtype="int64")
    else:
        df[id_column] = df.apply(hash_function, axis=1)
    return df

## **Limpieza**

In [77]:
# Normalise every column label of df_raw to upper case
df_raw.columns = [label.upper() for label in df_raw.columns]

In [78]:
# Attach a hash-based numeric identifier to every listing.
# NOTE(review): EXTERNAL_NUM, TYPE and PURPOSE are not part of the hash input,
# so listings that differ only in those fields get the same ID — confirm.
df_raw = generate_numeric_id(
    df=df_raw,
    dimensions=['STATE', 'CITY', 'COLONY', 'STREET', 'CODE', 'PRICE', 'PHONE_CONTACT'],
    id_column='PROPERTIE_ID',
)

### **Users dataframe**

In [81]:
# Take an independent copy of the contact columns. Without .copy() the
# in-place rename and column assignments that follow act on a slice of
# df_raw and pandas emits SettingWithCopyWarning.
# NOTE(review): this keeps one row per listing (no de-duplication by user) —
# confirm that is intended for a users table.
df_users = df_raw[['CODE', 'MAIL_CONTACT', 'PHONE_CONTACT']].copy()

In [82]:
# Give the contact columns user-oriented names. The result is reassigned
# rather than renamed in place: rename() returns a new frame, which avoids
# the SettingWithCopyWarning and keeps the cell safe to re-run.
df_users = df_users.rename(columns={
    'CODE': 'USER_ID',
    'MAIL_CONTACT': 'USER_MAIL',
    'PHONE_CONTACT': 'USER_PHONE',
})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_users.rename(columns= {


In [92]:
# Peek at one randomly chosen row (no seed is set, so it differs between runs)
df_users.sample(n=1)

Unnamed: 0,USER_ID,USER_MAIL,USER_PHONE
22541,49323,user-4932@mail.com,51243480


In [98]:
# Cast the identifier and phone columns to 64-bit integers in one vectorised
# step. astype() returns a new frame, so no value is assigned onto a slice,
# and the explicit 'int64' keeps the width independent of the platform.
df_users = df_users.astype({'USER_ID': 'int64', 'USER_PHONE': 'int64'})

In [99]:
# Summarise column dtypes and non-null counts of the users table
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   USER_ID     30000 non-null  int64 
 1   USER_MAIL   30000 non-null  object
 2   USER_PHONE  30000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 703.3+ KB


### **Properties dataframe**

In [115]:
# Take an independent copy of the property columns. Without .copy() the
# later in-place operations act on a slice of df_raw (SettingWithCopyWarning)
# and may not behave as expected.
df_properties = df_raw[[
    'PROPERTIE_ID', 'STATE', 'CITY', 'COLONY', 'STREET', 'EXTERNAL_NUM', 'CODE', 'TYPE', 'PURPOSE', 'PRICE'
]].copy()

In [116]:
# Rename CODE to USER_ID so it matches the users table. The result is
# reassigned instead of using inplace=True, which avoids mutating a slice
# and keeps the cell safe to re-run.
df_properties = df_properties.rename(columns={
    'CODE': 'USER_ID'
})

In [117]:
# Peek at one randomly chosen row (no seed is set, so it differs between runs)
df_properties.sample(n=1)

Unnamed: 0,PROPERTIE_ID,STATE,CITY,COLONY,STREET,EXTERNAL_NUM,USER_ID,TYPE,PURPOSE,PRICE
7072,9960694218,DF / CDMX,Miguel Hidalgo,Veronica Anzures,av ejercito nacional mexicano,,58167,Casa,Venta,4602000.0


In [121]:
# Summarise column dtypes and non-null counts of the properties table.
# NOTE(review): this cell's execution count (121) is higher than the fillna
# (119) and integer-cast (120) cells that follow it in the file, so the saved
# output shows the frame AFTER those steps; a top-to-bottom run would show
# the state before them — consider moving this cell below them.
df_properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   PROPERTIE_ID  30000 non-null  int64 
 1   STATE         30000 non-null  object
 2   CITY          30000 non-null  object
 3   COLONY        30000 non-null  object
 4   STREET        30000 non-null  object
 5   EXTERNAL_NUM  30000 non-null  object
 6   USER_ID       30000 non-null  int64 
 7   TYPE          30000 non-null  object
 8   PURPOSE       30000 non-null  object
 9   PRICE         30000 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 2.3+ MB


In [119]:
# Replace missing street / external-number values with a placeholder.
# Calling fillna(..., inplace=True) on a selected column is a chained
# in-place operation: recent pandas versions warn about it and, with
# Copy-on-Write enabled, it leaves df_properties unchanged. Filling through
# the frame and reassigning works in every version.
df_properties = df_properties.fillna({'STREET': '<--->', 'EXTERNAL_NUM': '<--->'})

In [120]:
# Cast the user identifier and the price to 64-bit integers in one vectorised
# step (any fractional part of PRICE is truncated toward zero, as int() did).
# astype() returns a new frame, so no value is assigned onto a slice.
df_properties = df_properties.astype({'USER_ID': 'int64', 'PRICE': 'int64'})