In [1]:
### Imports
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LinearRegression
from datetime import datetime
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer
from sklearn.neighbors import KNeighborsRegressor

In [4]:
### Load the dataset 
df = pd.read_csv('G:\Mi unidad\Hackathon--aws\Data\df_final.csv', compression='zip')
df.head()

df_dummies2 = pd.read_csv('G:\Mi unidad\Hackathon--aws\df_dummies2.csv', compression='zip')

In [5]:
### Preprocessing -> Steps before training the ML model -> Get Dummies
df_dummies1 = pd.concat([df , df['type_local'].str.get_dummies()], 
          axis = 1)
df_final = pd.concat([df_dummies1 , df_dummies1['nature_mutation'].str.get_dummies()], 
          axis = 1)
df_final.head(4)

Unnamed: 0,id_mutation,valeur_fonciere,surface_reelle_bati,nombre_pieces_principales,type_local,nature_mutation,nom_commune,code_commune,code_departement,id_parcelle,...,date_mutation,Appartement,Local industriel. commercial ou assimilé,Maison,Adjudication,Echange,Expropriation,Vente,Vente en l'état futur d'achèvement,Vente terrain à bâtir
0,2019-1137255,310000.0,101.0,4.0,Maison,Vente,Gretz-Armainvilliers,77215,77,772150000B0473,...,2019-01-03,0,0,1,0,0,0,1,0,0
1,2019-1137256,150000.0,72.0,3.0,Maison,Vente,Saint-Fargeau-Ponthierry,77407,77,77407000AD0345,...,2019-01-03,0,0,1,0,0,0,1,0,0
2,2019-1137256,150000.0,72.0,3.0,Maison,Vente,Saint-Fargeau-Ponthierry,77407,77,77407000AD0345,...,2019-01-03,0,0,1,0,0,0,1,0,0
3,2019-1137257,250000.0,44.0,0.0,Local industriel. commercial ou assimilé,Vente,Melun,77288,77,77288000AV0266,...,2019-01-09,0,1,0,0,0,0,1,0,0


In [18]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 824305 entries, 0 to 824304
Data columns (total 22 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   id_mutation                               824305 non-null  object 
 1   valeur_fonciere                           817036 non-null  float64
 2   surface_reelle_bati                       824305 non-null  float64
 3   nombre_pieces_principales                 824305 non-null  float64
 4   type_local                                824305 non-null  object 
 5   nature_mutation                           824305 non-null  object 
 6   nom_commune                               824305 non-null  object 
 7   code_commune                              824305 non-null  int64  
 8   code_departement                          824305 non-null  int64  
 9   id_parcelle                               824305 non-null  object 
 10  longitude           

# Neighborhood

In [4]:
### Nearest Neighbor - ML model - silhouette_score
start_time = datetime.now()

# Drop NaN
dft =  df_final.dropna()

# Characteristics for the model 
characteristics = ['valeur_fonciere','surface_reelle_bati','nombre_pieces_principales','longitude','latitude',
'Appartement','Local industriel. commercial ou assimilé','Maison','Adjudication','Echange','Expropriation','Vente',"Vente en l'état futur d'achèvement",
'Vente terrain à bâtir']

# Split the data into Train and test set
df_train, df_test = train_test_split(dft, test_size=0.2, random_state=7)

# Select the model and prepare Grid Search
model = NearestNeighbors()

parameters = {'n_neighbors':[3,4,5], 
                'algorithm':['ball_tree', 'kd_tree', 'brute'], 
                'metric': ['cityblock','euclidean','manhattan']
}

# GridSearch and fit in train
grid_search = GridSearchCV(model, parameters, cv=5, scoring = silhouette_score)
grid_search.fit(df_train[characteristics])

params_to_use = grid_search.best_params_
estimator = grid_search.best_estimator_

end_time = datetime.now()

#Check results
print(f'NaN values dropped: ', df_final.shape[0]- dft.shape[0])
print(f'Best score: ', grid_search.best_score_)
print(f'Best params: ', grid_search.best_params_)
print(f'Best estimator: ', grid_search.best_estimator_)
print('\nBest params: ', params_to_use)
print(f'\nTotalTime: ',(end_time - start_time))

Traceback (most recent call last):
  File "c:\Users\frans\anaconda3\envs\Data\lib\site-packages\sklearn\model_selection\_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
  File "c:\Users\frans\anaconda3\envs\Data\lib\site-packages\sklearn\metrics\cluster\_unsupervised.py", line 117, in silhouette_score
    return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
  File "c:\Users\frans\anaconda3\envs\Data\lib\site-packages\sklearn\metrics\cluster\_unsupervised.py", line 212, in silhouette_samples
    X, labels = check_X_y(X, labels, accept_sparse=["csc", "csr"])
  File "c:\Users\frans\anaconda3\envs\Data\lib\site-packages\sklearn\utils\validation.py", line 1074, in check_X_y
    X = check_array(
  File "c:\Users\frans\anaconda3\envs\Data\lib\site-packages\sklearn\utils\validation.py", line 871, in check_array
    raise ValueError(
ValueError: Expected 2D array, got scalar array instead:
array=NearestNeighbors(algorithm='ball_tree', metric='cityblock

NaN values dropped:  15229
Best score:  nan
Best params:  {'algorithm': 'ball_tree', 'metric': 'cityblock', 'n_neighbors': 3}
Best estimator:  NearestNeighbors(algorithm='ball_tree', metric='cityblock', n_neighbors=3)

Best params:  {'algorithm': 'ball_tree', 'metric': 'cityblock', 'n_neighbors': 3}

TotalTime:  0:08:01.417516


In [8]:
def recommend_neighborhoods(current_neighborhood):
    distances, index1 = estimator.kneighbors(df_train[df_train['nom_commune'] == current_neighborhood][characteristics])
    similar_neighborhoods = df_train.iloc[index1.flat]['nom_commune']
    similar_neighborhoods = similar_neighborhoods[similar_neighborhoods != current_neighborhood].head(3)
    return similar_neighborhoods.values[0], similar_neighborhoods.values[1], similar_neighborhoods.values[2]
recommend_neighborhoods('Paris 14e Arrondissement')

('Montreuil', 'Paris 1er Arrondissement', 'Paris 15e Arrondissement')

In [12]:
### Main Function
# Drop NaN
dft =  df_final.dropna()

# Characteristics for the model 
characteristics = ['valeur_fonciere','surface_reelle_bati','nombre_pieces_principales','longitude','latitude',
'Appartement','Local industriel. commercial ou assimilé','Maison','Adjudication','Echange','Expropriation','Vente',"Vente en l'état futur d'achèvement",
'Vente terrain à bâtir']

# Split the data into Train and test set
df_train, df_test = train_test_split(dft, test_size=0.2, random_state=7)

# Select the model and prepare Grid Search
parameters = {'n_neighbors':3, 
                'algorithm':'ball_tree', 
                'metric': 'cityblock'
}
model = NearestNeighbors(n_neighbors = 3, algorithm = 'ball_tree', metric ='cityblock'). fit(df_train[characteristics])


def recommend_neighborhoods(current_neighborhood):
    distance, index1 = model.kneighbors(df_train[df_train['nom_commune'] == current_neighborhood][characteristics])
    similar_neighborhoods = df_train.iloc[index1.flat]['nom_commune']
    similar_neighborhoods = similar_neighborhoods[similar_neighborhoods != current_neighborhood].head(3)
    return similar_neighborhoods.values[0], similar_neighborhoods.values[1], similar_neighborhoods.values[2]
recommend_neighborhoods('Paris 14e Arrondissement')

('Montreuil', 'Paris 1er Arrondissement', 'Paris 15e Arrondissement')

# Price

In [4]:
dft =  df_final.dropna()

In [24]:
# Define target
dft1 = dft[dft['nom_commune']=='Montreuil']

X = dft1[['surface_reelle_bati','nombre_pieces_principales','Maison','Appartement']]
y = dft1['valeur_fonciere']

# Train test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Model 
model01 = LinearRegression().fit(X_train,y_train)

print('mean square error: ',mean_squared_error(y_test, model01.predict(X_test)))
print('r2: ',r2_score(y_test, model01.predict(X_test)))

def prediction_price(sqm, rooms,house,apt): 
    user_input = {'surface_reelle_bati':[sqm],
                'nombre_pieces_principales': [rooms],
                'Maison':[house],
                'Appartement':[apt]}
    user_input_df = pd.DataFrame(user_input)
    price_pred = model01.predict(user_input_df)
    return price_pred

prediction_price(50,2,1,0)


-0.0786671722192962
mean square error:  142808171768392.62
r2:  -0.0786671722192962


array([-232721.93186432])

In [10]:
### Preprocesing price
df_dummies2 =  df_final[(df_final['nombre_pieces_principales'] > 1) & (df_final['nombre_pieces_principales'] < 7)]
df_dummies2 =  df_final[(df_final['surface_reelle_bati'] > 15.5) & (df_final['surface_reelle_bati'] < 162.5)]

df_dummies2 =  df_final[(df_final['price_m2'] > 261.02) & (df_final['surface_reelle_bati'] < 12437.39)] 
df_dummies2['code_commune_fact']  = pd. factorize(df_dummies2['code_commune'])[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dummies2['code_commune_fact']  = pd. factorize(df_dummies2['code_commune'] )[0]


In [2]:
### Load the dataset 
df = pd.read_csv('G:\Mi unidad\Hackathon--aws\Data\df_final.csv', compression='zip')
df_dummies2 = pd.read_csv('G:\Mi unidad\Hackathon--aws\df_dummies2.csv', compression='zip')

### Preprocessing of df for Neighborhood
df_dummies1 = pd.concat([df , df['type_local'].str.get_dummies()], 
          axis = 1)
df_final = pd.concat([df_dummies1 , df_dummies1['nature_mutation'].str.get_dummies()], 
          axis = 1)
df_final['price_m2'] = df_final['valeur_fonciere']/df_final['surface_reelle_bati']

In [11]:
### Machine Learning 

###NEIGHBORS and PRICE 
## Neighbors
# Drop NaN
dft =  df_final.dropna()
# Characteristics for the model 
characteristics = ['valeur_fonciere','surface_reelle_bati','nombre_pieces_principales','longitude','latitude',
'Appartement','Local industriel. commercial ou assimilé','Maison','Adjudication','Echange','Expropriation','Vente',"Vente en l'état futur d'achèvement",
'Vente terrain à bâtir','price_m2']
# Split the data into Train and test set
df_train, df_test = train_test_split(dft, test_size=0.2, random_state=7)
# Select the model and prepare Grid Search
parameters = {'n_neighbors':3, 
                'algorithm':'ball_tree', 
                'metric': 'cityblock'
}
model_N = NearestNeighbors(n_neighbors = 3, algorithm = 'ball_tree', metric ='cityblock'). fit(df_train[characteristics])

# Function
def recommend_neighborhoods(current_neighborhood):
    distance, index1 = model_N.kneighbors(df_train[df_train['nom_commune'] == current_neighborhood][characteristics])
    similar_neighborhoods = df_train.iloc[index1.flat]['nom_commune']
    similar_neighborhoods = similar_neighborhoods[similar_neighborhoods != current_neighborhood].head(2)
    return similar_neighborhoods

district = recommend_neighborhoods('Paris 1er Arrondissement')
print(district.values)

##Price 

#Splits 
xcolumns = ['price_m2', 'surface_reelle_bati', 'nombre_pieces_principales', 'Appartement',
'Local industriel. commercial ou assimilé', 'Maison', 'Adjudication',
'Echange', 'Expropriation', 'Vente','Vente en l\'état futur d\'achèvement', 'Vente terrain à bâtir','code_commune_fact']

#District 1
dft0 = df_dummies2[df_dummies2['nom_commune']== district.values[0]]
X0 = dft0[xcolumns]
y0 = dft0['valeur_fonciere']
X_train0, X_test0, y_train0, y_test0 = train_test_split(X0, y0, test_size=0.2, random_state=7)

#District 2
dft01 = df_dummies2[df_dummies2['nom_commune']== district.values[1]]
X1 = dft01[xcolumns]
y1 = dft01['valeur_fonciere']
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=7)


def predict_house_price0(sqm, rooms, model, X_train0):
    model.fit(X_train0, y_train0)
    data = {'surface_reelle_bati': sqm, 'nombre_pieces_principales': rooms}
    df = pd.DataFrame(data, index=[0])

    df = pd.get_dummies(df)
    missing_cols = set(X_train0.columns) - set(df.columns)
    
    for c in missing_cols:
        df[c] = X_train0[c].median()
    df = df[X_train0.columns]
   
    prediction = model.predict(df)
    return prediction[0]

def predict_house_price1(sqm, rooms, model, X_train1):
    model.fit(X_train1, y_train1)
    data = {'surface_reelle_bati': sqm, 'nombre_pieces_principales': rooms}
    df = pd.DataFrame(data, index=[0])

    df = pd.get_dummies(df)
    missing_cols = set(X_train1.columns) - set(df.columns)
    
    for c in missing_cols:
        df[c] = X_train1[c].median()
    df = df[X_train1.columns]
   
    prediction = model.predict(df)
    return prediction[0]

model_lr = LinearRegression()
price0 = predict_house_price0(50, 3, model_lr, X_train0)
price1 = predict_house_price1(50, 3, model_lr, X_train1)

print(f'For {district.values[0]} the price is:', round(price0,2))
print(f'For {district.values[1]} the price is:', round(price1,2))


['Paris 18e Arrondissement' 'Paris 16e Arrondissement']
For Paris 18e Arrondissement the price is: 539941.62
For Paris 16e Arrondissement the price is: 537359.85
