# Immothep

### La société Immothep est une agence immobilière spécialisée dans le vente de biens de particuliers.

Possédant déjà un site internet, elle souhaite pouvoir intégrer à celui-ci, un module d'estimation. Elle possède les ressources nécessaires pour réaliser le code dit "front", ainsi que les ressources graphiques.

Elle ne possède cependant pas les compétences nécessaires pour la réalisation de l'API qui va permettre d'exposer ce nouveau service.

La société vous sollicite donc pour réaliser la partie API en utilisant les données Open Data des Demandes de Valeurs Foncières (DVF) sur l'année 2019.


### 0. Initialisation

In [153]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from pandas import Series
from sklearn.preprocessing import LabelEncoder
import numpy as np
import requests


In [154]:
cwd = os.path.join(os.getcwd(), '..')

DATA_IN_FOLDER = os.path.join(cwd, 'data', 'in')
DATA_CURATED_FOLDER = os.path.join(cwd, 'data', 'curated')
DATA_OUT_FOLDER = os.path.join(cwd, 'data', 'out')
DATA_REPORTS_FOLDER = os.path.join(cwd, 'data', 'reports')
DATASOURCE_PATH = r'https://www.data.gouv.fr/fr/datasets/r/3004168d-bec4-44d9-a781-ef16f41856a2'
FILE_NAME = 'valeursfoncieres-2019.txt'

### 1. Load dataset 

In [155]:
global_datasource_src = os.path.join(DATASOURCE_PATH)
global_datasource_dest = os.path.join(DATA_IN_FOLDER, FILE_NAME)

if os.path.exists(global_datasource_dest):
    print('Data already downloaded')
else :
    print('Download %s' % global_datasource_src)
    with open(global_datasource_dest, "wb") as f:
        response = requests.get(global_datasource_src, stream=True)
        f.write(response.content)
        print('Data downloaded')


Data already downloaded


In [156]:
#load data
global_data = pd.read_csv(global_datasource_dest, encoding='utf-8', sep='|', decimal=',')

### 2. Exploratory Data Analysis

In [157]:
#basic stats about data
print(global_data.describe().transpose())
# Check the number of data points in the data set
print(f'Nb records {len(global_data)}')
# Check the number of features in the data set
print(f'Nb columns {len(global_data.columns)}')
# Check the data types
print(f'Data types {global_data.dtypes.unique()}')

#count empty columns
tmp_data = global_data.dropna(axis = 1, how ='all') 
print(f'Nb empty columns {len(global_data.columns) - len(tmp_data.columns)}')
del(tmp_data)

#deep analysis
profil = ProfileReport(global_data)
#set True to force update of file
if os.path.exists(os.path.join( DATA_REPORTS_FOLDER ,'rapport_initial.html') or False): 
    print('Initial report already generated at ' + os.path.join( DATA_REPORTS_FOLDER ,'rapport_initial.html'))    
else :
    profil.to_file(output_file= os.path.join( DATA_REPORTS_FOLDER ,'rapport_initial.html'))

del(profil)

                                count           mean           std      min  \
Code service CH                   0.0            NaN           NaN      NaN   
Reference document                0.0            NaN           NaN      NaN   
1 Articles CGI                    0.0            NaN           NaN      NaN   
2 Articles CGI                    0.0            NaN           NaN      NaN   
3 Articles CGI                    0.0            NaN           NaN      NaN   
4 Articles CGI                    0.0            NaN           NaN      NaN   
5 Articles CGI                    0.0            NaN           NaN      NaN   
No disposition              2535791.0       1.199535  7.401537e+00     1.00   
Valeur fonciere             2506530.0  971597.040251  7.856914e+06     0.01   
No voie                     1510153.0     727.027501  2.076794e+03     1.00   
Code postal                 2507468.0   51976.369473  2.730261e+04  1000.00   
Code commune                2535791.0     208.996930

### 3. Global cleanup of the dataset

In [158]:
#clean up data - remove empty columns
cleared_data = global_data.dropna(axis = 1, how ='all') 
print(f'New number of column {len(cleared_data.columns)}/{len(global_data.columns)}')

#remove duplicates rows
cleared_data.drop_duplicates(inplace=True)
print(f'New number of row {len(cleared_data)}/{len(global_data)}')


New number of column 35/43
New number of row 2439667/2535791


#### > Create property referential (house, appartment, ground, ...)

In [159]:
#create property referential
global_property_type = cleared_data[['Code type local','Type local']]
global_property_type.drop_duplicates(inplace=True)
global_property_type.dropna(inplace=True)

#add 'Autre' property type
global_property_type.loc[len(global_property_type)] = [len(global_property_type)+1,'Autre']

global_property_type.sort_values(by=['Code type local'], inplace=True)
global_property_type.set_index('Code type local')
global_property_type['Code type local'] = global_property_type['Code type local'].astype(int)

global_property_type.to_csv(os.path.join(DATA_OUT_FOLDER, 'property_type_referential.csv'), index=False)


In [160]:
# drop sells with more than one lot
cleared_data.drop(cleared_data.loc[cleared_data["Nombre de lots"]> 1].index, inplace=True)
cleared_data.groupby("Nombre de lots")[['Valeur fonciere']].count().sort_values("Nombre de lots")

Unnamed: 0_level_0,Valeur fonciere
Nombre de lots,Unnamed: 1_level_1
0,1636095
1,615030


In [161]:
#drop unused columns
cleared_data.drop(columns=['No disposition','Date mutation','No voie','B/T/Q','Type de voie','Code voie','Voie','Prefixe de section','Section','No plan','No Volume','1er lot','Surface Carrez du 1er lot','2eme lot','Surface Carrez du 2eme lot','3eme lot','Surface Carrez du 3eme lot','4eme lot','Surface Carrez du 4eme lot','5eme lot','Surface Carrez du 5eme lot','Nombre de lots','Commune', 'Type local'], inplace=True, errors='ignore')

#drop rows with empty sales costs
cleared_data.dropna(subset = ['Valeur fonciere'], inplace = True) 

print(f'New number of column {len(cleared_data.columns)}/{len(global_data.columns)}')
print(f'New number of records {len(cleared_data)}/{len(global_data)}')

New number of column 11/43
New number of records 2251125/2535791


#### > from additionnal referential, create our own to get gps coordinates and insee code

In [162]:
if os.path.exists(os.path.join(DATA_OUT_FOLDER, 'coord_gps_referential.csv')):
 print('gps coordinates referential already exists at '+ os.path.join(DATA_IN_FOLDER, 'correspondance-code-insee-code-postal.csv'))
else :
    #load insee referential
    cp = pd.read_csv(os.path.join(DATA_IN_FOLDER, 'correspondance-code-insee-code-postal.csv'), encoding='utf-8', sep=';', usecols=['Code Commune', 'Code Département', 'Code Postal', 'geo_point_2d', 'Code INSEE'])

    # Create two lists for the loop results to be placed
    lat = []
    lon = []

    # For each row in a varible,
    for row in cp['geo_point_2d']:
        lat.append(row.split(',')[0])
        lon.append(row.split(',')[1])
    
    # Create two new columns from lat and lon
    cp['Latitude'] = lat
    cp['Longitude'] = lon

    #split multiple cp in one row to n rows
    df = cp['Code Postal'].str.split('/').apply(Series, 1).stack()
    df.index = df.index.droplevel(-1) # to line up with df's index
    df.name = 'Code Postal'
    del(cp['Code Postal'])
    cp = cp.join(df)
    
    cp.to_csv(os.path.join(DATA_OUT_FOLDER, 'coord_gps_referential.csv'), sep=';', index=False, columns= ['Code INSEE', 'Code Postal', 'Code Commune', 'Code Département', 'Latitude', 'Longitude'])
    
    del(lat, lon)    
    del(cp)
    del(df)


gps coordinates referential already exists at c:\prairie\projet8-v2\Immothep\src\..\data\in\correspondance-code-insee-code-postal.csv


#### > Fill-up missing postal code then add gps coordinates to sells

In [163]:
cp = pd.read_csv(os.path.join(DATA_OUT_FOLDER, 'coord_gps_referential.csv'), encoding='utf-8', sep=';')
cp['Code Commune'] = cp['Code Commune'].astype(int)
cp['Code Département'] = cp['Code Département'].astype(str)
cleared_data['Code commune'] = cleared_data['Code commune'].astype(int)
cleared_data['Code departement'] = cleared_data['Code departement'].astype(str)

cleared_data = pd.merge(cleared_data, cp, how='inner', left_on=['Code commune','Code departement'], right_on = ['Code Commune','Code Département'])

cleared_data.drop(columns=['Code INSEE', 'Code postal', 'Code departement', 'Code commune','Code Commune', 'Code Département', 'Code Postal'], inplace=True)


In [164]:
#Clear data without transaction type
counter = len(cleared_data)
cleared_data.drop(cleared_data[(pd.isna(cleared_data['Code type local'])) & (pd.isna(cleared_data['Nature culture']))].index, inplace=True)
print(f'Cleaned rows : {counter - len(cleared_data)}')

#set 'Autre' value to empty 'Code type local'
other_type = int(global_property_type[global_property_type['Type local'] == 'Autre']['Code type local'])

cleared_data['Code type local'] = cleared_data['Code type local'].fillna(other_type)
print(cleared_data['Code type local'])

cleared_data.drop(columns=['Nature culture', 'Nature culture speciale'], inplace=True, errors='ignore')

Cleaned rows : 228953
0          1.0
1          1.0
2          1.0
3          5.0
4          5.0
          ... 
2407036    4.0
2407037    4.0
2407038    4.0
2407039    4.0
2407040    2.0
Name: Code type local, Length: 2178088, dtype: float64


In [165]:
#complete missing data with 0
cleared_data[['Surface reelle bati','Nombre pieces principales', 'Surface terrain']] = cleared_data[['Surface reelle bati','Nombre pieces principales', 'Surface terrain']].fillna(0)


In [166]:

#force data type
cleared_data['Code type local'] = cleared_data['Code type local'].astype(int)
cleared_data['Surface reelle bati'] = cleared_data['Surface reelle bati'].astype(int)
cleared_data['Nombre pieces principales'] = cleared_data['Nombre pieces principales'].astype(int)
cleared_data['Surface terrain'] = cleared_data['Surface terrain'].astype(int)
cleared_data['Latitude'] = cleared_data['Latitude'].astype(float)
cleared_data['Longitude'] = cleared_data['Longitude'].astype(float)

#reorder columns
cleared_data = cleared_data[['Nature mutation', 'Code type local', 'Valeur fonciere', 'Latitude', 'Longitude', 'Surface reelle bati', 'Nombre pieces principales', 'Surface terrain']]
 
#save data by property type
for type in global_property_type.values:
    property_by_type = cleared_data[cleared_data['Code type local'] == type[0]]
    print(type[1] + ' - ' + str(len(property_by_type)))
    #print(f'nb {global_property_type['type local']} is {len(property_by_type)}')
    filename = str.format(f'{type[1]}_valeursfoncieres.csv')
    property_by_type.to_csv(os.path.join(DATA_CURATED_FOLDER, filename), sep=';', index=False )
    del(property_by_type)


Maison - 513516
Appartement - 306925
Dépendance - 346028
Local industriel. commercial ou assimilé - 90393
Autre - 921226


### 5. Global cleanup - post analysis

In [167]:
#deep analysis
profil = ProfileReport(cleared_data)
#set True to force update of file
if os.path.exists(os.path.join( DATA_REPORTS_FOLDER ,'rapport_final.html') or False):
    print('Final report already generated at ' + os.path.join( DATA_REPORTS_FOLDER ,'rapport_final.html'))   
else :
    profil.to_file(output_file= os.path.join( DATA_REPORTS_FOLDER ,'rapport_final.html'))

del(profil)

Final report already generated at c:\prairie\projet8-v2\Immothep\src\..\data\reports\rapport_final.html
