In [25]:
### Imports
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score
from datetime import datetime

In [26]:
### Load the dataset 
df = pd.read_csv('G:\Mi unidad\Hackathon--aws\Data\df_final', compression='zip')
df.head()

Unnamed: 0,id_mutation,valeur_fonciere,surface_reelle_bati,nombre_pieces_principales,type_local,nature_mutation,nom_commune,code_commune,code_departement,id_parcelle,longitude,latitude,date_mutation
0,2019-1137255,310000.0,101.0,4.0,Maison,Vente,Gretz-Armainvilliers,77215,77,772150000B0473,2.740574,48.739669,2019-01-03
1,2019-1137256,150000.0,72.0,3.0,Maison,Vente,Saint-Fargeau-Ponthierry,77407,77,77407000AD0345,2.539824,48.563586,2019-01-03
2,2019-1137256,150000.0,72.0,3.0,Maison,Vente,Saint-Fargeau-Ponthierry,77407,77,77407000AD0345,2.539824,48.563586,2019-01-03
3,2019-1137257,250000.0,44.0,0.0,Local industriel. commercial ou assimilé,Vente,Melun,77288,77,77288000AV0266,2.656724,48.534489,2019-01-09
4,2019-1137258,155000.0,51.0,2.0,Appartement,Vente,Pontault-Combault,77373,77,77373000AD0157,2.61684,48.805639,2019-01-04


In [27]:
### Preprocessing -> Steps before training the ML model -> Get Dummies

df_dummies1 = pd.concat([df , df['type_local'].str.get_dummies()], 
          axis = 1)

df_final = pd.concat([df_dummies1 , df_dummies1['nature_mutation'].str.get_dummies()], 
          axis = 1)
df_final

Unnamed: 0,id_mutation,valeur_fonciere,surface_reelle_bati,nombre_pieces_principales,type_local,nature_mutation,nom_commune,code_commune,code_departement,id_parcelle,...,date_mutation,Appartement,Local industriel. commercial ou assimilé,Maison,Adjudication,Echange,Expropriation,Vente,Vente en l'état futur d'achèvement,Vente terrain à bâtir
0,2019-1137255,310000.0,101.0,4.0,Maison,Vente,Gretz-Armainvilliers,77215,77,772150000B0473,...,2019-01-03,0,0,1,0,0,0,1,0,0
1,2019-1137256,150000.0,72.0,3.0,Maison,Vente,Saint-Fargeau-Ponthierry,77407,77,77407000AD0345,...,2019-01-03,0,0,1,0,0,0,1,0,0
2,2019-1137256,150000.0,72.0,3.0,Maison,Vente,Saint-Fargeau-Ponthierry,77407,77,77407000AD0345,...,2019-01-03,0,0,1,0,0,0,1,0,0
3,2019-1137257,250000.0,44.0,0.0,Local industriel. commercial ou assimilé,Vente,Melun,77288,77,77288000AV0266,...,2019-01-09,0,1,0,0,0,0,1,0,0
4,2019-1137258,155000.0,51.0,2.0,Appartement,Vente,Pontault-Combault,77373,77,77373000AD0157,...,2019-01-04,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
706657,2022-537268,1905000.0,160.0,6.0,Appartement,Adjudication,Paris 15e Arrondissement,75115,75,75115000CW0050,...,2022-04-14,1,0,0,1,0,0,0,0,0
706658,2022-537269,270000.0,51.0,0.0,Local industriel. commercial ou assimilé,Vente,Paris 11e Arrondissement,75111,75,75111000CN0034,...,2022-06-10,0,1,0,0,0,0,1,0,0
706659,2022-537270,1326666.6,281.0,0.0,Local industriel. commercial ou assimilé,Vente,Paris 14e Arrondissement,75114,75,75114000CQ0137,...,2022-06-16,0,1,0,0,0,0,1,0,0
706660,2022-537271,423000.0,43.0,2.0,Appartement,Vente,Paris 15e Arrondissement,75115,75,75115000AC0055,...,2022-06-21,1,0,0,0,0,0,1,0,0


In [31]:
### Nearest Neighbor - ML model
start_time = datetime.now()

# Drop NaN
dft = df_final.dropna()

# Characteristics for the model 
characteristics = ['valeur_fonciere','surface_reelle_bati','nombre_pieces_principales','code_commune','code_departement','longitude','latitude',
'Appartement','Local industriel. commercial ou assimilé','Maison','Adjudication','Echange','Expropriation','Vente',"Vente en l'état futur d'achèvement",
'Vente terrain à bâtir']

# Split the data into Train and test set
df_train, df_test = train_test_split(dft, test_size=0.2, random_state=7)

# Select the model and prepare Grid Search
model = NearestNeighbors()

parameters = {'n_neighbors':[3,4,5], 
                'algorithm':['ball_tree', 'kd_tree', 'brute'], 
                'metric': ['cityblock','euclidean','manhattan']
}

# GridSearch and fit in train
grid_search = GridSearchCV(model, parameters, cv=5, scoring = 'r2')
grid_search.fit(df_train[characteristics])

params_to_use = grid_search.best_params_
estimator = grid_search.best_estimator_

end_time = datetime.now()

#Check results
print(f'NaN values dropped: ', df_final.shape[0]- dft.shape[0])
print(f'Best score: ', grid_search.best_score_)
print(f'Best params: ', grid_search.best_params_)
print(f'Best estimator: ', grid_search.best_estimator_)
print('\nBest params: ', params_to_use)
print(f'\nTotalTime: ',(end_time - start_time))


KeyboardInterrupt: 

In [None]:

### Nearest Neighbor - ML model - silhouette_score
start_time = datetime.now()

# Drop NaN
dft =  df_final.dropna()

# Characteristics for the model 
characteristics = ['valeur_fonciere','surface_reelle_bati','nombre_pieces_principales','code_commune','code_departement','longitude','latitude',
'Appartement','Local industriel. commercial ou assimilé','Maison','Adjudication','Echange','Expropriation','Vente',"Vente en l'état futur d'achèvement",
'Vente terrain à bâtir']

# Split the data into Train and test set
df_train, df_test = train_test_split(dft, test_size=0.2, random_state=7)

# Select the model and prepare Grid Search
model = NearestNeighbors()

parameters = {'n_neighbors':[3,4,5], 
                'algorithm':['ball_tree', 'kd_tree', 'brute'], 
                'metric': ['cityblock','euclidean','manhattan']
}

# GridSearch and fit in train
grid_search = GridSearchCV(model, parameters, cv=5, scoring = silhouette_score)
grid_search.fit(df_train[characteristics])

params_to_use = grid_search.best_params_
estimator = grid_search.best_estimator_

end_time = datetime.now()

#Check results
print(f'NaN values dropped: ', df_final.shape[0]- dft.shape[0])
print(f'Best score: ', grid_search.best_score_)
print(f'Best params: ', grid_search.best_params_)
print(f'Best estimator: ', grid_search.best_estimator_)
print('\nBest params: ', params_to_use)
print(f'\nTotalTime: ',(end_time - start_time))

Traceback (most recent call last):
  File "c:\Users\frans\anaconda3\envs\Data\lib\site-packages\sklearn\model_selection\_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
  File "c:\Users\frans\anaconda3\envs\Data\lib\site-packages\sklearn\metrics\cluster\_unsupervised.py", line 117, in silhouette_score
    return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
  File "c:\Users\frans\anaconda3\envs\Data\lib\site-packages\sklearn\metrics\cluster\_unsupervised.py", line 212, in silhouette_samples
    X, labels = check_X_y(X, labels, accept_sparse=["csc", "csr"])
  File "c:\Users\frans\anaconda3\envs\Data\lib\site-packages\sklearn\utils\validation.py", line 1074, in check_X_y
    X = check_array(
  File "c:\Users\frans\anaconda3\envs\Data\lib\site-packages\sklearn\utils\validation.py", line 871, in check_array
    raise ValueError(
ValueError: Expected 2D array, got scalar array instead:
array=NearestNeighbors(algorithm='ball_tree', metric='cityblock

KeyboardInterrupt: 

In [None]:
params_to_use

{'algorithm': 'ball_tree', 'metric': 'cityblock', 'n_neighbors': 2}

In [None]:
# ball_tree 'cityblock', 2

def final_recommendation(x):
    d, i = 

In [None]:
# def recommend_neighborhoods(current_neighborhood):
#     distances, indices = grid_search.best_estimator_.kneighbors(df_train[df_train['Neighborhood'] == current_neighborhood][features])
#     similar_neighborhoods = df_train.iloc[indices.flatten()]['Neighborhood']
#     similar_neighborhoods = similar_neighborhoods[similar_neighborhoods != current_neighborhood].head(3)
#     return similar_neighborhoods