# Projet ML: Estimation du coût d'un bien immobilier

# Itération 1

In [None]:
# Import the libraries used by the notebook and load the Sberbank dataset.
import pandas as pd
import numpy as np

# Parse the CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='sberbank.csv')
df.head(n=5)

## Prétraitement des données

### Traitement des données manquantes

In [None]:
# Missing-value imputation, done in place on `df`:
#   * non-numeric columns -> most frequent value
#   * numeric columns     -> median
# `categoric_cols` and `numeric_cols` are left defined for the cells below.

# --- Non-numeric columns: fill nulls with the most frequent value ---------
df_categoric = df.select_dtypes(exclude='number')
categoric_cols = df_categoric.columns.values

for col in categoric_cols:
    if not df[col].isnull().any():
        continue  # only impute columns that actually have missing values
    print(col)
    # value_counts() ignores NaN and sorts by descending frequency, so the
    # first index entry is the most frequent value.
    counts = df[col].value_counts()
    if counts.empty:
        continue  # column is entirely null: there is no value to impute with
    df[col] = df[col].fillna(counts.index[0])

# --- Numeric columns: fill nulls with the column median -------------------
df_numeric = df.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns.values

for col in numeric_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Preview once, at cell level, so the notebook actually displays it.
df.head()

### Traitement des données aberrantes

In [None]:
# decrire la méthodologie adoptée pour faire le traitement
#code ici
#Utilisation de la règle interquartile pour trouver des valeurs aberrantes
df.boxplot(column='price_doc')
Q1=df.max_floor.quantile(0.25)
Q3=df.max_floor.quantile(0.75)
Q=df.max_floor.describe()
IQR=Q3-Q1
print(IQR)
print(Q1)
print(Q3)
ind=df[(df['price_doc'] < Q1-1.5*IQR )].index
ind2=df[(df['price_doc'] > Q3+1.5*IQR )].index
#df3[ind,'max_floor']=Q1
#| (df3['price_doc'] > Q3+1.5*IQR  )].index| (df3['price_doc'] > Q3+1.5*IQR  )].index
print(df.shape)
print(len(ind2))
print(len(ind))
df.loc[ind,'price_doc']=Q1
df.loc[ind2,'price_doc']=Q3

df.head()

### Traitement du problème d'incohérence des données 

In [None]:
# decrire la méthodologie adoptée pour faire le traitement
#code ici
#supression des espaces blancs au debut
df["sub_area"] = df["sub_area"].str.lstrip()
df["sub_area"]
#supression des espaces blancs à la fin
df["sub_area"] = df["sub_area"].str.rstrip()
print(df["sub_area"])
#supression des espaces entre les mots

df['sub_area'] = df['sub_area'].str.replace(' ','')
print(df["sub_area"])

#formater les dates
df['timestamp'] = pd.to_datetime(df['timestamp'],format='%Y-%m-%d')

#supression des symboles monétaires
money_chars = ["$","€","¢","£","₿"]
for column in df.columns:
    df[column]= df[column].replace('money_chars','')
    
    df.head()

### Traitement du problème de redondances des données 

In [None]:
#code ici
##df2.drop('price_doc', axis=1).drop_duplicates()

# drop rows with a lot of missing values.
ind_missing = df[df['price_doc'] > 35].index
df = df.drop(ind_missing, axis=0)
print(df.shape)

df.head()

### Transformation des données 

In [None]:
# Encode categorical columns as numbers and drop `product_type` from `df`.
from sklearn import preprocessing

# Work on an explicit copy: assigning into a slice of `df` raises
# SettingWithCopyWarning and may or may not write through to `df`.
DFCat = df[categoric_cols].copy()
DFNum = df[numeric_cols]
print(DFCat.shape)

# Label-encode `culture_objects_top_25` (classes are sorted, so no -> 0, yes -> 1).
label_encoder = preprocessing.LabelEncoder()
DFCat['culture_objects_top_25'] = label_encoder.fit_transform(DFCat['culture_objects_top_25'])

# Map every remaining pure yes/no text column to 0/1. This is done once, on
# the final frame, so the encoding is not thrown away by a later re-slice.
yes_no = {"no": 0, "yes": 1}
for col in DFCat.columns:
    if DFCat[col].dtype == object and DFCat[col].isin(list(yes_no)).all():
        DFCat[col] = DFCat[col].map(yes_no)

DFCat.head()

df = df.drop(columns='product_type')
df.head()

### Normalisation des données 

In [None]:
# Build the feature matrix / target from the numeric columns, split them into
# train and test sets, then standardise the features.
from sklearn.model_selection import train_test_split

Data2 = df[numeric_cols]
df_y = Data2[['price_doc']]             # target, kept as a one-column DataFrame
df_x = Data2.drop('price_doc', axis=1)  # every other numeric column

x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=4,
)

print(x_train.shape)
print(x_test.shape)

# The scaler statistics are learned on the training split only and then
# re-used for the test split.
scaler = preprocessing.StandardScaler()
xtrainnorm = scaler.fit_transform(x_train)
xtestnorm = scaler.transform(x_test)

# Overall mean and standard deviation of the scaled training matrix.
print(xtrainnorm.mean())
print(xtrainnorm.std())

df.head()

## Entrainement et optimisation des paramètres d'un modèle 

In [None]:
# Train a k-nearest-neighbours classifier on the price, discretised into
# 10 equal-width bins.
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.neighbors import KNeighborsClassifier

# Bin edges are learned on the training target and re-used for the test
# target. ravel() turns the (n, 1) output into the 1-D label vector that
# classifiers expect (avoids a DataConversionWarning).
kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
ytrain_trans = kbins.fit_transform(y_train).ravel()
ytest_trans = kbins.transform(y_test).ravel()

model = KNeighborsClassifier(n_neighbors=3)

# KNN is distance-based, so it is fitted and scored on the standardised
# features rather than on the raw ones.
model.fit(xtrainnorm, ytrain_trans)

print(model.score(xtrainnorm, ytrain_trans)*100,'%')
print(model.score(xtestnorm, ytest_trans)*100,'%')

## Evaluation d'un modèle

In [None]:
# Decision-tree classification, scored by repeated stratified k-fold
# cross-validation on the training split.

from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from numpy import mean
from numpy import std

clf_gini = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    min_samples_leaf=20,
    random_state=100,
)

# 5 folds repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=10)
n_scores = cross_val_score(
    estimator=clf_gini,
    X=x_train,
    y=ytrain_trans,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)

# Report the mean accuracy and its standard deviation across all folds.
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

In [None]:
# KNN classification on the price, discretised into 10 equal-width bins.
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.neighbors import KNeighborsClassifier

# Bin edges are learned on the training target and re-used for the test
# target. ravel() turns the (n, 1) output into the 1-D label vector that
# classifiers expect (avoids a DataConversionWarning).
kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
ytrain_trans = kbins.fit_transform(y_train).ravel()
ytest_trans = kbins.transform(y_test).ravel()

model = KNeighborsClassifier(n_neighbors=3)

# KNN is distance-based, so it is fitted and scored on the standardised
# features rather than on the raw ones.
model.fit(xtrainnorm, ytrain_trans)

print(model.score(xtrainnorm, ytrain_trans)*100,'%')
print(model.score(xtestnorm, ytest_trans)*100,'%')