In [150]:
from IPython.display import Image
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris, load_boston
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import KFold, RepeatedKFold, LeaveOneOut, LeavePOut, ShuffleSplit, StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import learning_curve, validation_curve
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
sns.set(style="ticks")

In [151]:
data = pd.read_csv("data/houses_to_rent_v2.csv")
data

Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa (R$),rent amount (R$),property tax (R$),fire insurance (R$),total (R$)
0,São Paulo,70,2,1,1,7,acept,furnished,2065,3300,211,42,5618
1,São Paulo,320,4,4,0,20,acept,not furnished,1200,4960,1750,63,7973
2,Porto Alegre,80,1,1,1,6,acept,not furnished,1000,2800,0,41,3841
3,Porto Alegre,51,2,1,0,2,acept,not furnished,270,1112,22,17,1421
4,São Paulo,25,1,1,0,1,not acept,not furnished,0,800,25,11,836
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10687,Porto Alegre,63,2,1,1,5,not acept,furnished,402,1478,24,22,1926
10688,São Paulo,285,4,4,4,17,acept,not furnished,3100,15000,973,191,19260
10689,Rio de Janeiro,70,3,3,0,8,not acept,furnished,980,6000,332,78,7390
10690,Rio de Janeiro,120,2,2,2,8,acept,furnished,1585,12000,279,155,14020


In [152]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10692 entries, 0 to 10691
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   city                 10692 non-null  object
 1   area                 10692 non-null  int64 
 2   rooms                10692 non-null  int64 
 3   bathroom             10692 non-null  int64 
 4   parking spaces       10692 non-null  int64 
 5   floor                10692 non-null  object
 6   animal               10692 non-null  object
 7   furniture            10692 non-null  object
 8   hoa (R$)             10692 non-null  int64 
 9   rent amount (R$)     10692 non-null  int64 
 10  property tax (R$)    10692 non-null  int64 
 11  fire insurance (R$)  10692 non-null  int64 
 12  total (R$)           10692 non-null  int64 
dtypes: int64(9), object(4)
memory usage: 1.1+ MB


In [153]:
data.floor = data.floor.replace('-', '0')
data.floor

0         7
1        20
2         6
3         2
4         1
         ..
10687     5
10688    17
10689     8
10690     8
10691     0
Name: floor, Length: 10692, dtype: object

In [154]:
data.loc[data.animal != 'acept', 'animal'] = 0
data.loc[data.animal == 'acept', 'animal'] = 1
data.animal

0        1
1        1
2        1
3        1
4        0
        ..
10687    0
10688    1
10689    0
10690    1
10691    1
Name: animal, Length: 10692, dtype: object

In [155]:
data.loc[data.furniture != 'furnished', 'furniture'] = 0
data.loc[data.furniture == 'furnished', 'furniture'] = 1
data.furniture

0        1
1        0
2        0
3        0
4        0
        ..
10687    1
10688    0
10689    1
10690    1
10691    0
Name: furniture, Length: 10692, dtype: object

In [156]:
data.isnull().sum()

city                   0
area                   0
rooms                  0
bathroom               0
parking spaces         0
floor                  0
animal                 0
furniture              0
hoa (R$)               0
rent amount (R$)       0
property tax (R$)      0
fire insurance (R$)    0
total (R$)             0
dtype: int64

In [157]:
data.shape

(10692, 13)

In [158]:
target = data.iloc[:, -1]
new_data = data.iloc[:, 1: -1]

new_data.shape, target.shape

((10692, 11), (10692,))

In [159]:
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(
    new_data, target, test_size=0.5, random_state=1
)

In [137]:
data_X_train.shape, data_X_test.shape, data_y_train.shape, data_y_test.shape

((5346, 11), (5346, 11), (5346,), (5346,))

In [160]:
cl1_1 = KNeighborsClassifier(n_neighbors=50)
cl1_1.fit(data_X_train, data_y_train)
target1_0 = cl1_1.predict(data_X_train)
target1_1 = cl1_1.predict(data_X_test)
accuracy_score(data_y_train, target1_0), accuracy_score(data_y_test, target1_1)

(0.04526748971193416, 0.021885521885521887)

In [147]:
cl1_1 = KNeighborsClassifier(n_neighbors=2)
cl1_1.fit(data_X_train, data_y_train)
target1_1 = cl1_1.predict(data_X_test)
len(target1_1), target1_1

(5346, array([1937, 9467, 5724, ..., 8917, 3046, 2526]))

In [148]:
cl1_2 = KNeighborsClassifier(n_neighbors=10)
cl1_2.fit(data_X_train, data_y_train)
target1_2 = cl1_2.predict(data_X_test)
len(target1_2), target1_2

(5346, array([1867, 8913, 5564, ..., 8483, 3046, 2505]))

In [149]:
accuracy_score(data_y_test, target1_1), accuracy_score(data_y_test, target1_2)

(0.05723905723905724, 0.03198653198653199)