In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


In [2]:
# Read the parquet file
df_train_d = pd.read_parquet("./Datasets/train.parquet")
df_test_d = pd.read_parquet("./Datasets/test.parquet")

In [3]:
df_train = df_train_d
df_test = df_test_d

In [4]:
# Crear la columna category_price en ambos conjuntos de datos
df_train["category_price"] = ""
df_train.loc[df_train["price"] <= 999, "category_price"] = "low"
df_train.loc[(df_train["price"] > 999) & (df_train["price"] <= 1999), "category_price"] = "medium"
df_train.loc[df_train["price"] > 1999, "category_price"] = "high"

# Dividir los datos en variables predictivas y objetivo
X_train = df_train.drop(["price","category_price"], axis=1)
y_train = df_train["category_price"]


In [5]:
df_train["category_price"] = df_train["category_price"].replace({"low": 0, "medium": 1, "high": 2})

df_train["type"] = df_train["type"].factorize()[0]
df_train["parking_options"] = df_train["parking_options"].factorize()[0]
df_train["laundry_options"] = df_train["laundry_options"].factorize()[0]
df_train["region"] = df_train["region"].factorize()[0]
df_train["state"] = df_train["state"].factorize()[0]


df_test["type"] = df_test["type"].factorize()[0]
df_test["parking_options"] = df_test["parking_options"].factorize()[0]
df_test["laundry_options"] = df_test["laundry_options"].factorize()[0]
df_test["region"] = df_test["region"].factorize()[0]
df_test["state"] = df_test["state"].factorize()[0]

In [6]:
# Revisar los tipos de datos y cuantas filas y columnas hay
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346479 entries, 0 to 346478
Data columns (total 23 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       346479 non-null  int64  
 1   url                      346479 non-null  object 
 2   region                   346479 non-null  int64  
 3   region_url               346479 non-null  object 
 4   price                    346479 non-null  int64  
 5   type                     346479 non-null  int64  
 6   sqfeet                   346479 non-null  int64  
 7   beds                     346479 non-null  int64  
 8   baths                    346479 non-null  float64
 9   cats_allowed             346479 non-null  int64  
 10  dogs_allowed             346479 non-null  int64  
 11  smoking_allowed          346479 non-null  int64  
 12  wheelchair_access        346479 non-null  int64  
 13  electric_vehicle_charge  346479 non-null  int64  
 14  come

In [7]:
# Estadísticas descriptivas del dataset
df_train.describe()

Unnamed: 0,id,region,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,lat,long,state,category_price
count,346479.0,346479.0,346479.0,346479.0,346479.0,346479.0,346479.0,346479.0,346479.0,346479.0,346479.0,346479.0,346479.0,346479.0,346479.0,344757.0,344757.0,346479.0,346479.0
mean,7040988000.0,122.798424,9664.42,1.257736,1066.326,1.903189,1.480339,0.726803,0.707861,0.731738,0.08197,0.012792,0.048141,0.615677,0.928258,37.234363,-92.705415,19.376037,0.611425
std,8802214.0,83.170694,4703573.0,1.266559,20185.37,3.162685,0.608246,0.445602,0.454746,0.443055,0.27432,0.112374,0.214065,1.276577,1.665073,5.550956,16.551071,12.496871,0.622726
min,7003808000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-43.5333,-163.894,0.0,0.0
25%,7035988000.0,53.0,805.0,1.0,750.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,33.4531,-100.784,8.0,0.0
50%,7043325000.0,110.0,1036.0,1.0,949.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,37.6501,-87.7108,18.0,1.0
75%,7048431000.0,179.0,1395.0,1.0,1150.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,41.1379,-81.1746,29.0,1.0
max,7051292000.0,403.0,2768307000.0,11.0,8388607.0,1100.0,75.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,6.0,102.036,172.633,50.0,2.0


In [8]:
df_train.drop(["url"], axis=1, inplace=True)
df_train.drop(["region_url"], axis=1, inplace=True)
df_train.drop(["image_url"], axis=1, inplace=True)
df_train.drop(["lat"], axis=1, inplace=True)
df_train.drop(["long"], axis=1, inplace=True)
df_train.drop(["description"], axis=1, inplace=True)

In [9]:
df_test.drop(["url"], axis=1, inplace=True)
df_test.drop(["region_url"], axis=1, inplace=True)
df_test.drop(["image_url"], axis=1, inplace=True)
df_test.drop(["lat"], axis=1, inplace=True)
df_test.drop(["long"], axis=1, inplace=True)
df_test.drop(["description"], axis=1, inplace=True)

In [10]:
df_train[df_train.duplicated()]

Unnamed: 0,id,region,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,state,category_price


In [11]:
missing_values_count = df_train.isnull().sum()
print(missing_values_count)


id                         0
region                     0
price                      0
type                       0
sqfeet                     0
beds                       0
baths                      0
cats_allowed               0
dogs_allowed               0
smoking_allowed            0
wheelchair_access          0
electric_vehicle_charge    0
comes_furnished            0
laundry_options            0
parking_options            0
state                      0
category_price             0
dtype: int64


In [12]:
print('Porcentaje de valores faltantes de la variable laundry_options:', 
(df_train.laundry_options.isnull().sum()/len(df_train)))

Porcentaje de valores faltantes de la variable laundry_options: 0.0


In [13]:
print('Porcentaje de valores faltantes de la variable parking_options:', 
(df_train.parking_options.isnull().sum()/len(df_train)))

Porcentaje de valores faltantes de la variable parking_options: 0.0


In [14]:
from sklearn.model_selection import train_test_split

X = df_train.drop(columns=["category_price", "price"])
y = df_train["category_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [15]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [16]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)


0.9075704225352113


In [17]:
category_dict = {0: 'low', 1: 'medium', 2: 'high'}

predictions = pd.DataFrame(y_pred, columns=['pred'])
predictions['pred'] = predictions['pred'].map(category_dict)
predictions.to_csv("GyGuillermo.csv", index=False)

In [None]:
df_train.head(3)

In [None]:
df_test.head(3)

In [39]:
print(df_train.shape,df_test.shape)


(346479, 17) (38498, 15)


In [22]:
# Entrenar el modelo de clasificación con aprendizaje supervisado
clf = LogisticRegression(solver='lbfgs')
clf.fit(X_train, y_train)

# Hacer predicciones con el modelo entrenado
y_pred = clf.predict(X_test)

# calcular la precisión
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(acc * 100))



Accuracy: 46.43%


In [None]:
# Eliminar las características con valores nulos
#X_test = X_test.dropna(axis=1)

# Entrenar el modelo de clustering no supervisado
kmeans = KMeans(n_clusters=3)
kmeans.fit(X_test)


In [None]:
#Evaluar el rendimiento del modelo de clustering utilizando la métrica Silhouette Score
silhouette = silhouette_score(X_test, kmeans.labels_)
print("Silhouette Score: {:.2f}".format(silhouette))



In [None]:
#Generar un archivo .csv con las predicciones del modelo de clasificación supervisado
predictions = pd.DataFrame(y_pred, columns=['pred'])
predictions.to_csv("supervised_predictions.csv", index=False)

#Generar un archivo .csv con las predicciones del modelo de clustering no supervisado
# 
cluster_predictions = pd.DataFrame(kmeans.labels_, columns=['pred'])
cluster_predictions.to_csv("unsupervised_predictions.csv", index=False)