## **Librerías**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Show every column when a DataFrame is rendered. ``None`` is the documented
# "no limit" value; ``False`` is silently treated as 0, which instead switches
# pandas to terminal-width auto-detection.
pd.set_option("display.max_columns", None)

In [3]:
from datetime import datetime, timedelta
import winsound as cst

In [4]:
import sys
# Make the notebook's own folder and its parent importable, in that order.
sys.path.extend(['./', '../'])

In [5]:
import xml.etree.ElementTree as ET

In [6]:
import warnings
warnings.filterwarnings('ignore')

In [13]:
from Utils.functions import generate_numeric_id

## **Datos**

In [8]:
# Location of the raw XML feed, relative to the notebook's folder.
path = "../Data/Raw/feed.xml"

In [9]:
# Load the XML file and keep a handle on its root element, whose <listing>
# children are read in the next cell.
tree = ET.parse(source=path)
root = tree.getroot()

In [10]:
# Child tags read from every <listing>; the order here is the column order of
# the table built from ``data``.
listing_fields = (
    "state",
    "city",
    "colony",
    "street",
    "external_num",
    "code",
    "type",
    "purpose",
    "price",
    "mail_contact",
    "phone_contact",
)


def _listing_to_record(listing):
    """Flatten one ``<listing>`` element into a dict keyed by ``listing_fields``.

    A child tag that is absent or has no text yields ``None`` instead of
    raising ``AttributeError`` (which ``listing.find(tag).text`` does when the
    tag is missing). ``price`` is converted to ``float`` when present.
    """
    # findtext() returns "" for an empty element and None for a missing one;
    # ``or None`` collapses both to None, matching what ``.text`` reports for
    # an empty element.
    record = {tag: (listing.findtext(tag) or None) for tag in listing_fields}
    if record["price"] is not None:
        record["price"] = float(record["price"])  # Convert to a number
    return record


data = [_listing_to_record(listing) for listing in root.findall("listing")]

In [11]:
# One row per listing record, one column per record key.
df_raw = pd.DataFrame(data=data)

In [12]:
# Persist the untouched feed as a pipe-delimited UTF-8 file, without the index.
df_raw.to_csv(
    path_or_buf='../Data/Processed/Habi_BBDD.csv',
    sep='|',
    encoding='utf-8',
    index=False,
)

## **Limpieza**

In [14]:
# Upper-case every column name; the cells below refer to columns in capitals.
df_raw.columns = df_raw.columns.str.upper()

In [15]:
# Columns passed to generate_numeric_id, followed by the name of the ID column
# that later cells select from the result.
# NOTE(review): how the helper derives the ID from these columns is not visible
# in this notebook — confirm in Utils.functions.
id_source_columns = ['STATE', 'CITY', 'COLONY', 'STREET', 'CODE', 'PRICE', 'PHONE_CONTACT']
df_raw = generate_numeric_id(df_raw, id_source_columns, 'PROPERTIE_ID')

### **Users dataframe**

In [16]:
# Contact columns for the users table, one row per row of df_raw.
# .copy() makes df_users an independent frame, so the renames and dtype casts
# applied to it below are not operations on a selection of df_raw.
# NOTE(review): rows are not de-duplicated, so a CODE may appear more than
# once — confirm that is intended for a users table.
df_users = df_raw[['CODE', 'MAIL_CONTACT', 'PHONE_CONTACT']].copy()

In [17]:
# Give the contact columns their user-table names. Rebinding the result
# (instead of inplace=True) avoids mutating a selection of df_raw and keeps
# the cell safe to re-run.
df_users = df_users.rename(columns={
    'CODE': 'USER_ID',
    'MAIL_CONTACT': 'USER_MAIL',
    'PHONE_CONTACT': 'USER_PHONE',
})

In [18]:
# Peek at one randomly chosen row of the users table.
df_users.sample(n=1)

Unnamed: 0,USER_ID,USER_MAIL,USER_PHONE
23715,2007,user-200@mail.com,29834512


In [19]:
# Cast the identifier and phone columns to 64-bit integers in a single
# vectorised call. The width is spelled out so the result does not depend on
# the platform's default integer size, and a new frame is bound rather than
# assigning columns into the existing one.
# NOTE(review): an integer phone column drops leading zeros and '+' prefixes —
# confirm that is acceptable.
df_users = df_users.astype({'USER_ID': 'int64', 'USER_PHONE': 'int64'})

In [23]:
# Check column dtypes and non-null counts of the users table.
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   USER_ID     30000 non-null  int64 
 1   USER_MAIL   30000 non-null  object
 2   USER_PHONE  30000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 703.3+ KB


In [24]:
# Write the users table as a pipe-delimited file, without the index column.
df_users.to_csv(
    '../Data/Final/Users_df.csv',
    sep='|',
    index=False,
)

### **Properties dataframe**

In [25]:
# Property attributes plus the owner code, one row per row of df_raw.
# .copy() makes df_properties an independent frame, so the rename, null
# filling and dtype casts applied to it below are not operations on a
# selection of df_raw.
df_properties = df_raw[[
    'PROPERTIE_ID', 'STATE', 'CITY', 'COLONY', 'STREET', 'EXTERNAL_NUM', 'CODE', 'TYPE', 'PURPOSE', 'PRICE'
]].copy()

In [26]:
# CODE is exported as USER_ID, the same name the users table gives it.
# Rebinding the result (instead of inplace=True) avoids mutating a selection
# of df_raw and keeps the cell safe to re-run.
df_properties = df_properties.rename(columns={'CODE': 'USER_ID'})

In [27]:
# Peek at one randomly chosen row of the properties table.
df_properties.sample(n=1)

Unnamed: 0,PROPERTIE_ID,STATE,CITY,COLONY,STREET,EXTERNAL_NUM,USER_ID,TYPE,PURPOSE,PRICE
9545,2360000994,DF / CDMX,Cuauhtémoc,Roma Norte,,,11533,Casa,Venta,7590777.0


In [28]:
# Check column dtypes and non-null counts of the properties table.
df_properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PROPERTIE_ID  30000 non-null  int64  
 1   STATE         30000 non-null  object 
 2   CITY          30000 non-null  object 
 3   COLONY        30000 non-null  object 
 4   STREET        26661 non-null  object 
 5   EXTERNAL_NUM  9344 non-null   object 
 6   USER_ID       30000 non-null  object 
 7   TYPE          30000 non-null  object 
 8   PURPOSE       30000 non-null  object 
 9   PRICE         30000 non-null  float64
dtypes: float64(1), int64(1), object(8)
memory usage: 2.3+ MB


In [29]:
# Replace missing street / external-number values with the '<--->' marker.
# The fill is applied to the frame and the result rebound: calling
# fillna(inplace=True) on df_properties[col] is chained assignment, which
# under pandas Copy-on-Write updates a temporary and leaves the frame
# unchanged.
df_properties = df_properties.fillna({'STREET': '<--->', 'EXTERNAL_NUM': '<--->'})

In [30]:
# Cast the owner code and the price to 64-bit integers in a single vectorised
# call (the float price is truncated toward zero, as int() does). The width is
# spelled out so the result does not depend on the platform's default integer
# size, and a new frame is bound rather than assigning columns into the
# existing one.
df_properties = df_properties.astype({'USER_ID': 'int64', 'PRICE': 'int64'})

In [31]:
# Write the properties table as a pipe-delimited file, without the index column.
df_properties.to_csv(
    '../Data/Final/Properties_df.csv',
    sep='|',
    index=False,
)