In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv("teste_indicium_precificacao.csv")

In [None]:
df.head()

# Análise exploratória dos dados

O dataframe possui algumas linhas em branco que serão tratadas mais à frente

In [None]:
df.info()

In [None]:
df.describe()

### Vê-se que disponibilidade_365 e calculado_host_listings_count possuem maior correlação com o valor

In [None]:
df.corr()['price'].sort_values()

In [None]:
grouped = df[['bairro_group','price','reviews_por_mes']].groupby('bairro_group').mean().sort_values('price',ascending= False )
grouped.head()

In [None]:
bairro = df[['bairro','price','reviews_por_mes']].groupby('bairro').mean().sort_values('price',ascending= False )
bairro.head()

In [None]:
bairro = df[['bairro','price','reviews_por_mes']].groupby('bairro').mean().sort_values('reviews_por_mes',ascending= False )
bairro.tail()

In [None]:
sns.pairplot(bairro)

In [None]:
bairro.info()

In [None]:
bairro['valor'] = bairro['price']*bairro['reviews_por_mes']

## Supondo que uma pessoa esteja pensando em investir em um apartamento para alugar na plataforma, onde seria mais indicada a compra?

Avaliando o preço por noite e considerando que o numero de reviewsd condiz com o número de noites alugadas podemos supor que
Prince's Bay é local mais lucrativo para aluguéis.

In [None]:
bairro = bairro.groupby('bairro').mean().sort_values('valor',ascending= False )
bairro.head()

## O número mínimo de noites e a disponibilidade ao longo do ano interferem no preço?
Mínimo de noites e disponibilidade interferem menos de 10% no preço final:

In [None]:
df[['price','disponibilidade_365', 'minimo_noites']].corr()

Tratando dados faltantes

In [None]:
def percent_missing(df):
    percent_nan = 100 * df.isnull().sum() / len(df)
    percent_nan = percent_nan[percent_nan > 0].sort_values()

    return percent_nan

In [None]:
percent_nan = percent_missing(df)
percent_nan

In [None]:
sns.barplot(x=percent_nan.index,y=percent_nan)
plt.xticks(rotation=90)
plt.show()

In [None]:
df[df['nome'].isnull()]

In [None]:
df = df.dropna(axis=0,subset=['nome','host_name'])

In [None]:
percent_nan = percent_missing(df)
percent_nan

In [None]:
df[df['ultima_review'].isnull()]

In [None]:
df['ultima_review'] = df['ultima_review'].fillna("1970-01-01")
df['reviews_por_mes'] = df['reviews_por_mes'].fillna(0)

In [None]:
percent_nan = percent_missing(df)
percent_nan

In [None]:
df[df['price']==0].info()

Excluindo entradas onde o valor é zero

In [None]:
df = df[df['price'] != 0]

In [None]:
df[df['price']==0].info()

## Existe algum padrão no texto do nome do local para lugares de mais alto valor?

In [None]:
df_texto = df[['nome','price']].sort_values('price',ascending= False )
df_texto

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

In [None]:
df_texto['nome'] = df_texto['nome'].apply(preprocess_text)
df_texto['nome']=df_texto['nome'].str.lower()

In [None]:
pip install wordcloud

## No geral temos as seguintes palavras com destaque:

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


text = ' '.join(df_texto['nome'])

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
df_texto_top500 = df_texto.head(500)

## Foi selecionado os 500 locais de maior valor, assim temos as seguintes palavras com maior destaque:

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

text = ' '.join(df_texto_top500['nome'])

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Explique como você faria a previsão do preço a partir dos dados. Quais variáveis e/ou suas transformações você utilizou e por quê? Qual tipo de problema estamos resolvendo (regressão, classificação)? Qual modelo melhor se aproxima dos dados e quais seus prós e contras? Qual medida de performance do modelo foi escolhida e por quê?

Nas partes acima ja foi feita a correção de dados faltantes, abaixo os dados serão manipulados para o treinamento do modelo de regressão que possibilitará fazer uma previsão dos preços de acordo com as caracteristicas. 

## Ajustando valores numéricos e de texto

In [None]:
df['ultima_review'] = pd.to_datetime(df['ultima_review']).astype(np.int64)

In [85]:
object_df = df.select_dtypes(include='object')

In [86]:
object_df.columns

Index(['nome', 'host_name', 'bairro_group', 'bairro', 'room_type'], dtype='object')

In [87]:
object_df = object_df.drop( ['nome','host_name'],axis = 1)

In [88]:
numeric_df = df.select_dtypes(exclude='object')

In [89]:
df_objects_dummies = pd.get_dummies(object_df,drop_first=True)

In [90]:
final_df = pd.concat([numeric_df,df_objects_dummies], axis=1)

In [91]:
final_df.head()

Unnamed: 0,id,host_id,latitude,longitude,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,...,bairro_Williamsbridge,bairro_Williamsburg,bairro_Willowbrook,bairro_Windsor Terrace,bairro_Woodhaven,bairro_Woodlawn,bairro_Woodrow,bairro_Woodside,room_type_Private room,room_type_Shared room
0,2595,2845,40.75362,-73.98377,225,1,45,1558396800000000000,0.38,2,...,0,0,0,0,0,0,0,0,0,0
1,3647,4632,40.80902,-73.9419,150,3,0,0,0.0,1,...,0,0,0,0,0,0,0,0,1,0
2,3831,4869,40.68514,-73.95976,89,1,270,1562284800000000000,4.64,1,...,0,0,0,0,0,0,0,0,0,0
3,5022,7192,40.79851,-73.94399,80,10,9,1542585600000000000,0.1,1,...,0,0,0,0,0,0,0,0,0,0
4,5099,7322,40.74767,-73.975,200,3,74,1561161600000000000,0.59,1,...,0,0,0,0,0,0,0,0,0,0


In [92]:
corr = final_df.corr()['ultima_review'].abs().sort_values()

In [93]:
corr.tail(20)

bairro_Tribeca                   0.026207
bairro_Battery Park City         0.026222
latitude                         0.030187
bairro_Financial District        0.033215
bairro_Murray Hill               0.037862
bairro_Bedford-Stuyvesant        0.037937
disponibilidade_365              0.045402
bairro_group_Brooklyn            0.048107
longitude                        0.048253
bairro_Theater District          0.053175
bairro_group_Manhattan           0.061260
bairro_Midtown                   0.070048
host_id                          0.072675
price                            0.085691
minimo_noites                    0.114278
calculado_host_listings_count    0.116241
id                               0.146588
numero_de_reviews                0.279234
reviews_por_mes                  0.368100
ultima_review                    1.000000
Name: ultima_review, dtype: float64

## O dataframe preparado para as predições:

In [94]:
final_df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,...,bairro_Williamsbridge,bairro_Williamsburg,bairro_Willowbrook,bairro_Windsor Terrace,bairro_Woodhaven,bairro_Woodlawn,bairro_Woodrow,bairro_Woodside,room_type_Private room,room_type_Shared room
count,48846.0,48846.0,48846.0,48846.0,48846.0,48846.0,48846.0,48846.0,48846.0,48846.0,...,48846.0,48846.0,48846.0,48846.0,48846.0,48846.0,48846.0,48846.0,48846.0,48846.0
mean,19023390.0,67635100.0,40.728947,-73.952176,152.774782,7.012488,23.270913,1.222505e+18,1.091032,7.14904,...,0.000819,0.08017,2e-05,0.003214,0.001802,0.000225,2e-05,0.004811,0.456496,0.023687
std,10983880.0,78629800.0,0.054529,0.046162,240.250956,20.021549,44.551056,6.22491e+17,1.597211,32.968608,...,0.028605,0.271559,0.004525,0.056603,0.042407,0.015005,0.004525,0.069195,0.498109,0.152073
min,2595.0,2438.0,40.49979,-74.24442,10.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9475226.0,7816403.0,40.69009,-73.98308,69.0,1.0,1.0,1.45895e+18,0.04,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,19684220.0,30791330.0,40.72308,-73.95569,106.0,3.0,5.0,1.546474e+18,0.37,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,29158400.0,107434400.0,40.76311,-73.936293,175.0,5.0,24.0,1.560902e+18,1.58,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,1.562544e+18,58.5,327.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Predições

Podemos eliminar a coluna 'ultima review' pela correlaçao com preço ser baixa e com pouca interpretabilidade pelo modelo. Colunas como 'nome' e 'host' também não acrescentam na capacidade de previsão do modelo

In [95]:
X = final_df.drop('price',axis=1)
X = final_df.drop(['ultima_review'],axis=1)
y = final_df.price

In [96]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [97]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_Scaled = scaler.fit(X_train)
X_train_Scaled = scaler.transform(X_train)
X_test_Scaled = scaler.transform(X_test)

### A seguir são testados modelos variados para verificar sua performance inicial. Utiliza-se o MAE pela fácil interpretabilidade do seu resultado em relação à preços e pela robustez em relação à outliers, como imóveis com valores muito altos ou baixos.

In [98]:
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.tree import DecisionTreeRegressor

from sklearn.svm import SVR
from sklearn.ensemble import HistGradientBoostingRegressor,GradientBoostingRegressor,AdaBoostRegressor,RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
import warnings
warnings.filterwarnings('ignore')

models = []
models.append(('RF', RandomForestRegressor()))
models.append(('Ada', AdaBoostRegressor(DecisionTreeRegressor())))
models.append(('SVR', SVR()))
models.append(('HXG', HistGradientBoostingRegressor()))
models.append(('XG', GradientBoostingRegressor()))
models.append(('MLP', MLPRegressor(max_iter=1000)))
models.append(('KNN', KNeighborsRegressor(n_neighbors=1)))

num_folds=5
results = []
names = []
kfold = KFold(n_splits=num_folds,random_state=0,shuffle=True)

for name, model in models:

    cv_results = cross_val_score(model, X_train_Scaled, y_train, cv=kfold, scoring='neg_mean_absolute_error')
    results.append(cv_results)
    names.append(name)
    msg = "%s accuracy:  %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

RF accuracy:  -0.297029 (0.172031)
Ada accuracy:  -0.187790 (0.129547)
SVR accuracy:  -54.786999 (1.759687)
HXG accuracy:  -7.656749 (2.002133)
XG accuracy:  -0.961291 (0.104296)
MLP accuracy:  -2.272451 (0.531302)
KNN accuracy:  -36.281570 (0.569616)


### Adaboost foi o modelo que previu com menor MAE, assim seguiremos com a sua implementação.

In [99]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv("teste_indicium_precificacao.csv")

df['ultima_review'] = pd.to_datetime(df['ultima_review'])

df = df.dropna(axis=0, subset=['nome', 'host_name'])
df['ultima_review'] = df['ultima_review'].fillna(pd.to_datetime("1970-01-01"))
df['reviews_por_mes'] = df['reviews_por_mes'].fillna(0)
df = df[df['price'] != 0]

object_df = df.select_dtypes(include='object').drop(['nome', 'host_name'], axis=1)
df_objects_dummies = pd.get_dummies(object_df, drop_first=True)

numeric_df = df.select_dtypes(exclude='object')
final_df = pd.concat([numeric_df, df_objects_dummies], axis=1)

X = final_df.drop(['price', 'ultima_review'], axis=1)
y = final_df['price']

model = AdaBoostRegressor(DecisionTreeRegressor())
model.fit(X, y)

joblib.dump(model, 'model.pkl')

# ajustando os dados do apartamento a ser previsto:
sample = {
    'id': 2595,
    'host_id': 2845,
    'bairro_group': 'Manhattan',
    'bairro': 'Midtown',
    'latitude': 40.75362,
    'longitude': -73.98377,
    'room_type': 'Entire home/apt',
    'minimo_noites': 1,
    'numero_de_reviews': 45,
    'ultima_review': pd.to_datetime('2019-05-21'),
    'reviews_por_mes': 0.38,
    'calculado_host_listings_count': 2,
    'disponibilidade_365': 355
}

sample_df = pd.DataFrame([sample])

sample_df_objects_dummies = pd.get_dummies(sample_df, drop_first=True)

expected_columns = set(X.columns)
current_columns = set(sample_df_objects_dummies.columns)
missing_columns = list(expected_columns - current_columns)
for col in missing_columns:
    sample_df_objects_dummies[col] = 0

sample_df_objects_dummies = sample_df_objects_dummies[X.columns]

## Qual seria a sua sugestão de preço?

In [100]:
predicted_price = model.predict(sample_df_objects_dummies)

print("Predicted price:", predicted_price[0])

Predicted price: 225.0
