In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import itertools
import seaborn as sns

In [None]:
# Read the pre-cleaned dataset from disk and preview its first rows.
DATA_PATH = 'data/data_clean.parquet'
df = pd.read_parquet(DATA_PATH)
df.head()

In [None]:
# Remove the identifier, street-address, property-type and land-surface columns;
# none of them is used by the cells below.
unused_columns = [
    'id_mutation',
    'adresse_nom_voie',
    'adresse_numero',
    'type_local',
    'surface_terrain',
]
df = df.drop(columns=unused_columns)

In [None]:
# Turn the sale date into a numeric feature: days elapsed since the earliest
# date present in the dataset.
# pd.to_datetime replaces .astype('datetime64'): pandas >= 2.0 rejects the
# unit-less 'datetime64' dtype with a TypeError, while to_datetime works on
# every pandas version and also accepts string / date-object columns.
sale_dates = pd.to_datetime(df['date_mutation'])
df['date_mutation'] = (sale_dates - sale_dates.min()) / np.timedelta64(1, 'D')
df.info()

In [None]:
# Keep only the rows that have a postal code, then report how many missing
# values remain in each column.
df = df.dropna(subset=['code_postal'])
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, quantiles, ...) of the filtered frame.
df.describe()

In [None]:
# Distribution of valeur_fonciere.
# sns.displot is a figure-level function: it always creates its own figure, so
# a preceding plt.figure(figsize=...) only produced an extra blank figure and
# its size was ignored. height/aspect request the intended 8x4 inch figure.
sns.displot(df['valeur_fonciere'], height=4, aspect=2)

In [None]:
# Distribution of prix_m2.
# sns.displot is a figure-level function: it always creates its own figure, so
# a preceding plt.figure(figsize=...) only produced an extra blank figure and
# its size was ignored. height/aspect request the intended 8x4 inch figure.
sns.displot(df['prix_m2'], height=4, aspect=2)

In [None]:
# Geographic scatter (longitude vs. latitude) coloured by valeur_fonciere.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x='longitude', y='latitude', hue='valeur_fonciere', ax=ax)

In [None]:
# Geographic scatter (longitude vs. latitude) coloured by prix_m2.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x='longitude', y='latitude', hue='prix_m2', ax=ax)

In [None]:
# Pairwise correlations between the numeric columns.
# Numeric columns are selected explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on string values.
# NOTE(review): adresse_code_voie / code_postal are treated as categorical
# further down; if they are stored as strings a bare df.corr() fails on
# pandas >= 2.0 — confirm their dtypes with df.info().
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
# Numeric columns are selected explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on string values.
plt.figure(figsize=(10, 8))
numeric_corr = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_corr, annot=True)

In [None]:
# Numerical view used to predict valeur_fonciere: the alternative target
# (prix_m2) and the two categorical columns are removed; missing values -> 0.
# (To model prix_m2 instead, drop 'valeur_fonciere' here rather than 'prix_m2'.)
train_valeur_fonciere_numerical = (
    df.drop(columns=['prix_m2', 'adresse_code_voie', 'code_postal'])
      .fillna(0)
)

# Categorical view: everything that is left once the numeric columns are
# removed; missing values are replaced by the literal label 'NONE'.
numeric_columns = [
    'date_mutation',
    'valeur_fonciere',
    'surface_carrez',
    'nombre_lots',
    'surface_reelle_bati',
    'nombre_pieces_principales',
    'longitude',
    'latitude',
    'prix_m2',
]
train_categoric = df.drop(columns=numeric_columns).fillna('NONE')

In [None]:
from sklearn.ensemble import IsolationForest
# Fit an isolation forest on the numerical frame (valeur_fonciere included)
# and record the positions of the rows it labels as inliers (prediction == 1).
clf = IsolationForest(max_samples=10000, random_state=17)
clf.fit(train_valeur_fonciere_numerical)
inlier_flags = clf.predict(train_valeur_fonciere_numerical)
y_no_outlier = pd.DataFrame({'Top': inlier_flags})
index_no_outliers = np.flatnonzero(y_no_outlier['Top'].to_numpy() == 1)

In [None]:
# Keep only the inlier rows in both frames and renumber them from 0 so that
# the two frames share the same index.
# NOTE: not idempotent — re-running this cell applies index_no_outliers again
# to the already filtered frames.
train_valeur_fonciere_numerical = (
    train_valeur_fonciere_numerical.iloc[index_no_outliers].reset_index(drop=True)
)
train_categoric = train_categoric.iloc[index_no_outliers].reset_index(drop=True)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every numerical feature (all columns except the target) to [0, 1].
columns_list = [
    column for column in train_valeur_fonciere_numerical.columns
    if column != 'valeur_fonciere'
]
# A plain ndarray is passed to the scaler: np.matrix is deprecated in NumPy and
# scikit-learn >= 1.2 rejects it with a TypeError.
columns_to_normalize = train_valeur_fonciere_numerical[columns_list].to_numpy()
normalizer = MinMaxScaler()
normalizer.fit(columns_to_normalize)
# Build the scaled frame on the source index so the column assignment below
# aligns row-for-row even if the index is not a fresh 0..n-1 range.
normalized_columns = pd.DataFrame(
    normalizer.transform(columns_to_normalize),
    columns=columns_list,
    index=train_valeur_fonciere_numerical.index,
)
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed later, so test-set min/max values influence the scaling.
train_valeur_fonciere_numerical[columns_list] = normalized_columns
train_valeur_fonciere_numerical.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the two categorical columns with one encoder per column:
# le1 is fitted on adresse_code_voie, le2 on code_postal.
le1 = LabelEncoder()
le2 = LabelEncoder()
train_categoric['adresse_code_voie'] = le1.fit_transform(train_categoric['adresse_code_voie'])
train_categoric['code_postal'] = le2.fit_transform(train_categoric['code_postal'])
train_categoric.head()

In [None]:
# Put the scaled numerical columns and the encoded categorical columns side by
# side, matching rows on their index.
train = pd.merge(
    train_valeur_fonciere_numerical,
    train_categoric,
    how='inner',
    left_index=True,
    right_index=True,
)

In [None]:
# Column dtypes and non-null counts of the merged training frame.
train.info()

In [None]:
# Summary statistics of the merged training frame.
train.describe()

In [None]:
# Preview the first rows of the merged training frame.
train.head()

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import Callback

# Hold out 30% of the rows for evaluation; the seed is fixed so the split is
# reproducible.
target_column = 'valeur_fonciere'
X_train, X_test, Y_train, Y_test = train_test_split(
    train.drop(columns=target_column),
    train[target_column],
    test_size=0.3,
    random_state=17,
)

# Stop training once val_loss has not improved for 10 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
class PrintDot(Callback):
    """Minimal training progress indicator: one dot per finished epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print a dot for the finished epoch, starting a new line every 100 epochs.

        Args:
            epoch: Zero-based index of the epoch that just ended.
            logs: Metrics passed by Keras; unused. It defaults to ``None`` so
                the override matches the base ``Callback.on_epoch_end``
                signature and can also be called without it.
        """
        if epoch % 100 == 0:
            print('')
        print('.', end='')

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dropout

# Small baseline network: two narrow ReLU layers, each followed by dropout,
# and a single linear output unit for the regression target.
model1 = Sequential([
    Dense(8, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='relu'),
    Dropout(0.2),
    Dense(1),
])

# Mean squared error loss, Adam optimizer with its default settings.
model1.compile(optimizer='adam', loss='mse')

In [None]:
# Train model1 silently (verbose=0); progress is shown by PrintDot and training
# ends early when early_stop fires.
# NOTE(review): the held-out test split is also the validation set that drives
# early stopping.
history1 = model1.fit(
    x=X_train,
    y=Y_train.values,
    validation_data=(X_test, Y_test.values),
    batch_size=128,
    epochs=400,
    verbose=0,
    callbacks=[early_stop, PrintDot()],
)

In [None]:
# Predict on the held-out split, then plot the per-epoch metrics recorded
# during training.
predictions1 = model1.predict(X_test)
losses1 = pd.DataFrame.from_dict(history1.history)
losses1.plot()

In [None]:
# Predicted vs. real price for model1, with the y = x reference diagonal.
# The ggplot style is activated BEFORE the figure is created: a matplotlib
# style only affects artists created afterwards, so calling plt.style.use()
# after plt.subplots() left this figure in the previously active style.
plt.style.use('ggplot')
matplotlib.rc('xtick', labelsize=30)
matplotlib.rc('ytick', labelsize=30)

fig, ax = plt.subplots(figsize=(15, 12))
ax.plot(predictions1, Y_test.values, 'ro')
ax.set_xlabel('Predictions', fontsize=30)
ax.set_ylabel('Real Price', fontsize=30)
ax.set_title('Predictions VS Real Price')
# Diagonal spanning the observed range of real prices (perfect predictions
# would fall on this line).
price_range = [Y_test.values.min(), Y_test.values.max()]
ax.plot(price_range, price_range, 'k--', lw=4)
plt.show()

In [None]:
# Layer-by-layer overview (output shapes, parameter counts) of the fitted model1.
model1.summary()

In [None]:
# Wider network than model1: 64 -> 32 -> 16 hidden units with dropout after the
# first two layers.
model2 = Sequential()

model2.add(Dense(64, activation='relu'))
model2.add(Dropout(0.5))

model2.add(Dense(32, activation='relu'))
model2.add(Dropout(0.2))

# ReLU keeps this layer a real hidden layer; without an activation it would
# collapse into the linear output layer that follows.
model2.add(Dense(16, activation='relu'))

# Single linear output unit. The network previously ended on the 16-unit layer,
# so it produced 16 values per sample for a scalar target (the MSE loss
# broadcast the target across them and predict() returned an (n, 16) array).
model2.add(Dense(1))

model2.compile(optimizer='adam', loss='mse')

In [None]:
# Same early-stopping rule as before, but verbose=1 so the stopping epoch is
# reported.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)

# Train model2 silently (verbose=0); progress is shown by PrintDot.
# NOTE(review): the held-out test split is also the validation set that drives
# early stopping.
history2 = model2.fit(
    x=X_train,
    y=Y_train.values,
    validation_data=(X_test, Y_test.values),
    batch_size=128,
    verbose=0,
    epochs=400,
    callbacks=[early_stop, PrintDot()],
)

In [None]:
# Predict on the held-out split, then plot the per-epoch metrics recorded
# during training.
predictions2 = model2.predict(X_test)
losses2 = pd.DataFrame.from_dict(history2.history)
losses2.plot()

In [None]:
# Predicted vs. real price for model2, with the y = x reference diagonal.
# The ggplot style is activated BEFORE the figure is created: a matplotlib
# style only affects artists created afterwards, so calling plt.style.use()
# after plt.subplots() does not restyle the figure that already exists.
plt.style.use('ggplot')
matplotlib.rc('xtick', labelsize=30)
matplotlib.rc('ytick', labelsize=30)

fig, ax = plt.subplots(figsize=(15, 12))
ax.plot(predictions2, Y_test.values, 'ro')
ax.set_xlabel('Predictions', fontsize=30)
ax.set_ylabel('Real Price', fontsize=30)
ax.set_title('Predictions VS Real Price')
# Diagonal spanning the observed range of real prices (perfect predictions
# would fall on this line).
price_range = [Y_test.values.min(), Y_test.values.max()]
ax.plot(price_range, price_range, 'k--', lw=4)
plt.show()

In [None]:
# Persist both fitted networks under the model/ directory.
# NOTE(review): these paths have no file extension; newer Keras releases may
# require '.keras' or '.h5' — confirm against the installed version.
for fitted_model, save_path in ((model1, 'model/model_bad'), (model2, 'model/model_good')):
    fitted_model.save(save_path)