# <center> Manipulation et traitement des données credit_immo </center>

## Importation des librairies

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Chargement des fichiers

In [2]:
# CSV
creditImmoCSV = pd.read_csv("data/credit_immo/credit_immo.csv", header=0)
type(creditImmoCSV)

pandas.core.frame.DataFrame

In [3]:
# JSON
creditImmoJSON = pd.read_json("data/credit_immo/credit_immo.json")
#creditImmoJSON

In [4]:
# XLS
creditImmoXLS = pd.read_excel("data/credit_immo/credit_immo.xls")
creditImmoXLS

Unnamed: 0,ID_NOM,Niv_Etude_Bac,age,contrat_de_travail,Salaire,dette_anterieure,etat_civile,apport,enfant_a_Charge,Solvable
0,jean,3.0,45,CDI,40000,4000,M,0.3,3.0,OUI
1,VANESSA,5.0,28,CDI,30500,1320,M,0.1,0.0,OUI
2,TARCISSE,0.0,55,CDI,28000,40000,C,0.0,0.0,NON
3,TIBAULT,4.0,23,CDD,15000,0,M,0.1,,OUI
4,GILES,0.0,33,CDD,27000,3000,C,0.1,2.0,NON
5,ETHAN,0.0,41,INTERIM,40000,1000,M,0.0,4.0,NON
6,LILIANE,-1.0,29,CDI,50000,3000,C,0.0,0.0,NON
7,GLODI,-3.0,24,INTERIM,29000,5000,C,0.0,0.0,NON
8,FLORIANE,,26,CDI,44000,0,M,0.1,1.0,OUI
9,MARIE,3.0,37,CDI,45800,2000,M,0.2,2.0,OUI


## Création de la base de donnée creditImmo

In [5]:
# Définition des noms de colonnes
caractCreditImmo = ["taux_de_ventes", "croissance_vente", "ratio_benefice", "ratio_perte"]

# Affectation des valeurs aléatoires correspondantes aux colonnes
tauxImmo1 = np.random.rand(6, 2)*100
tauxImmo2 = np.random.rand(6, 2)
tauxImmo = np.column_stack((tauxImmo1, tauxImmo2))

# Création de la base de données
creditImmo = pd.DataFrame(tauxImmo, columns = caractCreditImmo)
creditImmo

Unnamed: 0,taux_de_ventes,croissance_vente,ratio_benefice,ratio_perte
0,50.565177,95.886368,0.595175,0.205907
1,89.052717,78.627852,0.103759,0.064152
2,4.220367,27.144935,0.565081,0.951014
3,63.032417,81.795351,0.250638,0.120574
4,69.901126,38.615281,0.704059,0.706747
5,50.63685,78.932792,0.787073,0.467058


## Gestion des données numériques manquantes

In [6]:
# Insertion de nouveaux index, remplis avec des valeurs Nan
indexCreditImmo = range(10)
creditImmo = creditImmo.reindex(indexCreditImmo)
creditImmo

Unnamed: 0,taux_de_ventes,croissance_vente,ratio_benefice,ratio_perte
0,50.565177,95.886368,0.595175,0.205907
1,89.052717,78.627852,0.103759,0.064152
2,4.220367,27.144935,0.565081,0.951014
3,63.032417,81.795351,0.250638,0.120574
4,69.901126,38.615281,0.704059,0.706747
5,50.63685,78.932792,0.787073,0.467058
6,,,,
7,,,,
8,,,,
9,,,,


In [7]:
# Récupérer les indices des valeurs manquantes
creditImmo.isnull()

Unnamed: 0,taux_de_ventes,croissance_vente,ratio_benefice,ratio_perte
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,True,True,True,True
7,True,True,True,True
8,True,True,True,True
9,True,True,True,True


In [8]:
# Remplacer les valeurs Nan par 0
creditImmo.fillna(0)

Unnamed: 0,taux_de_ventes,croissance_vente,ratio_benefice,ratio_perte
0,50.565177,95.886368,0.595175,0.205907
1,89.052717,78.627852,0.103759,0.064152
2,4.220367,27.144935,0.565081,0.951014
3,63.032417,81.795351,0.250638,0.120574
4,69.901126,38.615281,0.704059,0.706747
5,50.63685,78.932792,0.787073,0.467058
6,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0


In [9]:
# Supprimer les valeurs NaN
creditImmo.dropna()

Unnamed: 0,taux_de_ventes,croissance_vente,ratio_benefice,ratio_perte
0,50.565177,95.886368,0.595175,0.205907
1,89.052717,78.627852,0.103759,0.064152
2,4.220367,27.144935,0.565081,0.951014
3,63.032417,81.795351,0.250638,0.120574
4,69.901126,38.615281,0.704059,0.706747
5,50.63685,78.932792,0.787073,0.467058


## Traitement des données

### Suppression des valeur Nan

In [10]:
# On commence par définir le type de remplacement
imp_mean = SimpleImputer(missing_values = np.nan, strategy="mean")

# On sélectionne le nom des colonnes contenant des valeurs numériques
colonnesNumeriques = creditImmoCSV.select_dtypes(exclude="object").columns

# On applique le imputer au Dataframe correspondant
imp_mean.fit(creditImmoCSV[colonnesNumeriques])

#On applique la transformation aux colonnes numériques
creditImmoCSV[colonnesNumeriques] = imp_mean.transform(creditImmoCSV[colonnesNumeriques])

creditImmoCSV

Unnamed: 0,ID_NOM,Niv_Etude_Bac,age,contrat_de_travail,Salaire,dette_anterieure,etat_civile,apport,enfant_a_Charge,Solvable
0,jean,3.0,45.0,CDI,40000.0,4000.0,M,0.3,3.0,OUI
1,VANESSA,5.0,28.0,CDI,30500.0,1320.0,M,0.1,0.0,OUI
2,TARCISSE,0.0,55.0,CDI,28000.0,40000.0,C,0.0,0.0,NON
3,TIBAULT,4.0,23.0,CDD,15000.0,0.0,M,0.1,1.333333,OUI
4,GILES,0.0,33.0,CDD,27000.0,3000.0,C,0.1,2.0,NON
5,ETHAN,0.0,41.0,INTERIM,40000.0,1000.0,M,0.0,4.0,NON
6,LILIANE,-1.0,29.0,CDI,50000.0,3000.0,C,0.0,0.0,NON
7,GLODI,-3.0,24.0,INTERIM,29000.0,5000.0,C,0.0,0.0,NON
8,FLORIANE,1.444444,26.0,CDI,44000.0,0.0,M,0.1,1.0,OUI
9,MARIE,3.0,37.0,CDI,45800.0,2000.0,M,0.2,2.0,OUI


### Encodage des valeurs catégorielles en valeurs numériques

In [11]:
creditImmoCSV = creditImmoCSV.apply(LabelEncoder().fit_transform)
creditImmoCSV

Unnamed: 0,ID_NOM,Niv_Etude_Bac,age,contrat_de_travail,Salaire,dette_anterieure,etat_civile,apport,enfant_a_Charge,Solvable
0,18,6,14,1,7,7,1,3,4,1
1,16,8,3,1,6,3,1,1,0,1
2,14,2,16,1,4,13,0,0,0,0
3,15,7,0,0,0,0,1,1,2,1
4,7,2,7,0,3,6,0,1,3,0
5,2,2,12,3,7,1,1,0,5,0
6,12,1,4,1,12,6,0,0,0,0
7,8,0,1,3,5,9,0,0,0,0
8,5,4,2,1,9,0,1,1,1,1
9,13,6,8,1,11,5,1,2,3,1


### Fractionner le jeu de données pour l’entrainement et le test

In [13]:
# On sépare les données entre les variables d'étude et la variable à évaluer
X = creditImmoCSV.iloc[:, 0:-1]
y = creditImmoCSV["Solvable"]

# On fractionne le jeu de donnée entre l'entraînement et le test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Mise à l’échelle des features

In [None]:
scaler = StandardScaler()
scaler.fit(creditImmoCSV)
scaler.transform(creditImmoCSV)
creditImmoCSV

Unnamed: 0,ID_NOM,Niv_Etude_Bac,age,contrat_de_travail,Salaire,dette_anterieure,etat_civile,apport,enfant_a_Charge,Solvable
0,18,6,14,1,7,7,1,3,4,1
1,16,8,3,1,6,3,1,1,0,1
2,14,2,16,1,4,13,0,0,0,0
3,15,7,0,0,0,0,1,1,2,1
4,7,2,7,0,3,6,0,1,3,0
5,2,2,12,3,7,1,1,0,5,0
6,12,1,4,1,12,6,0,0,0,0
7,8,0,1,3,5,9,0,0,0,0
8,5,4,2,1,9,0,1,1,1,1
9,13,6,8,1,11,5,1,2,3,1
