Referential analysis

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder
#import IPython


In [2]:
DATA_IN_FOLDER = 'C:/prairie/projet8/data/in/'
DATA_CLEANED_FOLDER = 'C:/prairie/projet8/data/cleaned/'
DATA_OUT_FOLDER = 'C:/prairie/projet8/data/out/'

In [3]:
#load data
global_data = pd.read_csv(os.path.join(DATA_IN_FOLDER, 'valeursfoncieres-2019.txt'), encoding='utf-8', sep='|', decimal=',')


In [4]:
#basic stats about data
print(global_data.describe().transpose())
# Check the number of data points in the data set
print(f'Nb records {len(global_data)}')
# Check the number of features in the data set
print(f'Nb columns {len(global_data.columns)}')
# Check the data types
print(f'Data types {global_data.dtypes.unique()}')

#count empty columns
tmp_data = global_data.dropna(axis = 1, how ='all') 
print(f'Nb empty columns {len(global_data.columns) - len(tmp_data.columns)}')
del(tmp_data)

#deep analysis
profil = ProfileReport(global_data)
#profil.to_file(output_file='rapport.html')
del(profil)

                                count           mean           std      min  \
Code service CH                   0.0            NaN           NaN      NaN   
Reference document                0.0            NaN           NaN      NaN   
1 Articles CGI                    0.0            NaN           NaN      NaN   
2 Articles CGI                    0.0            NaN           NaN      NaN   
3 Articles CGI                    0.0            NaN           NaN      NaN   
4 Articles CGI                    0.0            NaN           NaN      NaN   
5 Articles CGI                    0.0            NaN           NaN      NaN   
No disposition              2535791.0       1.199535  7.401537e+00     1.00   
Valeur fonciere             2506530.0  971597.040251  7.856914e+06     0.01   
No voie                     1510153.0     727.027501  2.076794e+03     1.00   
Code postal                 2507468.0   51976.369473  2.730261e+04  1000.00   
Code commune                2535791.0     208.996930

In [5]:
#clean up data - remove empty columns
cleared_data = global_data.dropna(axis = 1, how ='all') 
print(f'New number of column {len(cleared_data.columns)}/{len(global_data.columns)}')

#remove duplicates rows
cleared_data.drop_duplicates(inplace=True)
print(f'New number of row {len(cleared_data)}/{len(global_data)}')


New number of column 35/43
New number of row 2439667/2535791


In [6]:
#create property referential
global_property_type = cleared_data[['Code type local','Type local']]
global_property_type.drop_duplicates(inplace=True)
global_property_type.dropna(inplace=True)

#add 'Autre' property type
global_property_type.loc[len(global_property_type)] = [len(global_property_type)+1,'Autre']

global_property_type.sort_values(by=['Code type local'], inplace=True)
global_property_type.set_index('Code type local')
global_property_type.to_csv(os.path.join(DATA_OUT_FOLDER, 'property_type_referential.csv'), index=False)
#cleared_data.drop(columns='Type local', inplace=True, errors='ignore')

In [7]:
#drop unused columns
cleared_data.drop(columns=['No disposition','Date mutation','No voie','B/T/Q','Type de voie','Code voie','Voie','Prefixe de section','Section','No plan','No Volume','1er lot','Surface Carrez du 1er lot','2eme lot','Surface Carrez du 2eme lot','3eme lot','Surface Carrez du 3eme lot','4eme lot','Surface Carrez du 4eme lot','5eme lot','Surface Carrez du 5eme lot','Nombre de lots','Commune', 'Type local'], inplace=True, errors='ignore')
#,'Code Canton','Code Arrondissement','Code Département','Code Région'

#drop rows with empty sales costs
cleared_data.dropna(subset = ['Valeur fonciere'], inplace = True) 

print(f'New number of column {len(cleared_data.columns)}/{len(global_data.columns)}')
print(f'New number of records {len(cleared_data)}/{len(global_data)}')

New number of column 11/43
New number of records 2412834/2535791


In [8]:
#load insee referential
cp = pd.read_csv(os.path.join(DATA_IN_FOLDER, 'correspondance-code-insee-code-postal.csv'), encoding='utf-8', sep=';', usecols=['Code Commune', 'Code Département', 'Code Postal'])

counter = cleared_data['Code postal'].isnull().sum()
print(f'Number of Postal Code with NaN value before: {counter}')

cleared_data['Code postal'] = cleared_data.apply(lambda x: cp[(cp['Code Commune']==x['Code commune']) & (cp['Code Département']==str(x['Code departement']))]['Code Postal'] if pd.isna(x['Code postal']) else x['Code postal'], axis=1)

counter = cleared_data['Code postal'].isnull().sum()
print(f'Number of Postal Code with NaN value after: {counter}')

cleared_data.drop(columns=['Code commune', 'Code departement'], inplace= True, errors='ignore')
del(cp)
del(counter)

Number of Postal Code with NaN value before: 28157
Number of Postal Code with NaN value after: 0


In [9]:
#Clear data without transaction type
counter = len(cleared_data)
cleared_data.drop(cleared_data[(pd.isna(cleared_data['Code type local'])) & (pd.isna(cleared_data['Nature culture']))].index, inplace=True)
print(f'Cleaned rows : {counter - len(cleared_data)}')

#set 'Autre' value to empty 'Code type local'
other_type = global_property_type[global_property_type['Type local'] == 'Autre']['Code type local']
#cleared_data['Code type local'] = cleared_data['Code type local'].replace(pd.NA, 0)

cleared_data['Code type local'] = cleared_data.apply(lambda x: other_type if pd.isna(x['Code type local']) else x['Code type local'], axis=1)

print(cleared_data['Code type local'])

Cleaned rows : 169043
0          2
1          2
2          3
3          1
4          1
          ..
2535785    2
2535786    2
2535788    2
2535789    4
2535790    2
Name: Code type local, Length: 2243791, dtype: object


In [10]:
'''
#encode 'Nature culture' to number and create a referential for re-use in prediction tool
global_culture_type = cleared_data['Nature culture']
global_culture_type.drop_duplicates(inplace=True)
global_culture_type.dropna(inplace=True)
global_culture_type = global_culture_type.append(pd.Series(['NONE']))
global_culture_type.reset_index(drop=True, inplace=True)
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(global_culture_type)
global_culture_type = pd.concat([global_culture_type, pd.Series(integer_encoded.reshape(-1))], axis=1)
global_culture_type = global_culture_type.rename(columns={0: "Nature culture", 1: "Nature culture encoded"})

#global_culture_type.set_index('Nature culture', inplace=True)
#global_culture_type.to_csv(os.path.join(DATA_OUT_FOLDER, 'culture_type_referential.csv'), sep=';', index=False)
#cleared_data['Nature culture'] = cleared_data['Nature culture'].replace('NaN', 'NONE')
#cleared_data['Nature culture'].fillna('NONE', inplace=True)

value = cleared_data.merge(global_culture_type, how='left', on='Nature culture')


#merge
#value = cleared_data.merge(global_culture_type, on='Nature culture', how='left')
value.to_csv(os.path.join(DATA_OUT_FOLDER, 'toto.csv'), sep=';', index=False)

print(value)
#print(cleared_data)
#cleared_data.to_csv(os.path.join(DATA_OUT_FOLDER, 'toto.csv'), sep=';', index=False)
#cleared_data['Nature culture encoded'] = cleared_data.apply(lambda x: global_culture_type[global_culture_type['Nature culture']==x['Nature culture']]['Nature culture encoded'], axis=1)
#print(cleared_data.head())
'''

cleared_data.drop(columns=['Nature culture', 'Nature culture speciale'], inplace=True, errors='ignore')


In [16]:

cleared_data[['Surface reelle bati','Nombre pieces principales', 'Surface terrain']] = cleared_data[['Surface reelle bati','Nombre pieces principales', 'Surface terrain']].fillna(0)


In [12]:
#deep analysis
profil = ProfileReport(cleared_data)
profil.to_file(output_file='rapport_final.html')
cleared_data.to_csv(os.path.join(DATA_CLEANED_FOLDER, 'cleaned_valeursfoncieres.csv'), sep=';')
del(profil)

Summarize dataset: 100%|██████████| 22/22 [01:14<00:00,  3.38s/it, Completed]
Generate report structure: 100%|██████████| 1/1 [00:08<00:00,  8.30s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.95s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 30.32it/s]
