In [1]:
# Se cargan librerias
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Se leen los archivos parquet y se asignan a un dataframe
df_train = pd.read_parquet("./Datasets/train.parquet")
df_test = pd.read_parquet("./Datasets/test.parquet")

In [3]:
# Visualizacion general del dataframe
df_train.head(3)

Unnamed: 0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,...,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,image_url,description,lat,long,state
0,7048013474,https://boise.craigslist.org/apa/d/very-nice-b...,boise,https://boise.craigslist.org,1350,house,1200,2,2.0,1,...,0,0,0,w/d in unit,detached garage,https://images.craigslist.org/00B0B_cPiJMEheZe...,Super cute row house in the Boise bench area. ...,43.5851,-116.225,id
1,7043931179,https://cosprings.craigslist.org/apa/d/colorad...,colorado springs,https://cosprings.craigslist.org,1115,apartment,694,1,1.0,1,...,0,0,0,w/d in unit,carport,https://images.craigslist.org/00R0R_5XAoSKvfrz...,Windtree Apartment Homes currently has a spaci...,38.9137,-104.78,co
2,7048254516,https://norfolk.craigslist.org/apa/d/virginia-...,norfolk / hampton roads,https://norfolk.craigslist.org,1129,apartment,900,2,2.0,0,...,0,0,0,w/d hookups,off-street parking,https://images.craigslist.org/00f0f_3ZbTFrsHpZ...,Call Today! show contact info Indian Lakes ...,36.7922,-76.1643,va


In [4]:
# Crear la columna category_price en ambos conjuntos de datos
df_train["category_price"] = 0
df_train.loc[df_train["price"] <= 999, "category_price"] = 1
df_train.loc[(df_train["price"] > 999) & (df_train["price"] <= 1999), "category_price"] = 0
df_train.loc[df_train["price"] > 1999, "category_price"] = 0

# Dividir los datos en variables predictivas y objetivo
X_train = df_train.drop(["price","category_price"], axis=1)
y_train = df_train["category_price"]


In [15]:
# Se buscan duplicados
df_train[df_train.duplicated()]

Unnamed: 0,id,region,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,state,category_price


In [5]:
# Se borran columnas que un primer analisis muy general no afectarian el resultado
df_train.drop(["url"], axis=1, inplace=True)
df_train.drop(["region_url"], axis=1, inplace=True)
df_train.drop(["image_url"], axis=1, inplace=True)
df_train.drop(["lat"], axis=1, inplace=True)
df_train.drop(["long"], axis=1, inplace=True)
df_train.drop(["description"], axis=1, inplace=True)

In [6]:
# Se convierte en numericas columnas categorias
df_train["type"] = df_train["type"].factorize()[0]
df_train["parking_options"] = df_train["parking_options"].factorize()[0]
df_train["laundry_options"] = df_train["laundry_options"].factorize()[0]
df_train["region"] = df_train["region"].factorize()[0]
df_train["state"] = df_train["state"].factorize()[0]

In [7]:
df_train.head(2)

Unnamed: 0,id,region,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,state,category_price
0,7048013474,0,1350,0,1200,2,2.0,1,1,1,0,0,0,0,0,0,0
1,7043931179,1,1115,1,694,1,1.0,1,1,1,0,0,0,0,1,1,0


In [9]:
# Se buscan valores nulos
missing_values_count = df_train.isnull().sum()
print(missing_values_count)


id                         0
region                     0
price                      0
type                       0
sqfeet                     0
beds                       0
baths                      0
cats_allowed               0
dogs_allowed               0
smoking_allowed            0
wheelchair_access          0
electric_vehicle_charge    0
comes_furnished            0
laundry_options            0
parking_options            0
state                      0
category_price             0
dtype: int64


In [10]:
# Dividir los datos en variables predictivas y objetivo

X = df_train.drop(columns=["category_price", "price"])
y = df_train["category_price"]

# Dividir los datos de entrenamiento en un conjunto de entrenamiento y uno de validación

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [11]:
# Entrenar el modelo de clasificación con aprendizaje supervisado en el conjunto de entrenamiento
model = RandomForestClassifier()
model.fit(X_train, y_train)
# Hacer predicciones con el modelo entrenado en el conjunto de validación
y_pred = model.predict(X_test)

In [12]:
# Calcular la precisión en el conjunto de validación
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)


0.9342241976448857


In [13]:
# Guardar las predicciones en un archivo formato csv
predictions = pd.DataFrame(y_pred, columns=['pred'])
predictions.to_csv("GyGuillermo.csv", index=False)

Pruebas con otro modelos

In [14]:
# Entrenar el modelo de clasificación con aprendizaje supervisado
clf = LogisticRegression(solver='lbfgs')
clf.fit(X_train, y_train)

# Hacer predicciones con el modelo entrenado
y_pred = clf.predict(X_test)

# calcular la precisión
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(acc * 100))



Accuracy: 53.71%
