# SOMMAIRE:
* [Importations](#import)
* [Baseline](#base)
* [Iterations](#ite)


## Importations <a class="anchor" id="import"></a>

In [2]:
# Import pandas

import pandas as pd

In [3]:
# Import numpy

import numpy as np

In [4]:
# Import sklearn

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import learning_curve

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, \
    confusion_matrix, classification_report
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, plot_roc_curve, roc_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve,plot_precision_recall_curve

In [6]:
# Load the three dataset iterations compared in this notebook, in this order:
#   immo_base             - the unchanged dataset
#   df_immobilier_no_null - nulls removed, ocean_proximity encoded as binary columns
#   df_no_outliers        - dataset without outliers
# Paths are relative to the notebook's working directory.

immo_base, df_immobilier_no_null, df_no_outliers = (
    pd.read_csv(csv_name)
    for csv_name in ('train_data.csv', 'df_immobilier_no_null.csv', 'df_no_outliers.csv')
)

# Iterations <a class="anchor" id="ite"></a>

## Iterations immo_base : dataset inchangé 
<a class="anchor" id="ite_immo_base"></a>

### Dummy Classifier sur immo_base : score = 0.44
<a class="anchor" id="base"></a>

In [23]:
# Baseline: a DummyClassifier with strategy "most_frequent", scored in the cells below.
# The target (y) is the "ocean_proximity" column; the features (X) are every other column.

X = immo_base.drop(columns="ocean_proximity")
y = immo_base["ocean_proximity"]

In [24]:
# Instantiate the baseline classifier with the "most_frequent" strategy

dummy_clf = DummyClassifier(strategy="most_frequent")

In [25]:
# Fit the baseline on the full dataset (no train/test split in this section)

dummy_clf.fit(X, y)

In [26]:
# Score the baseline on the same data it was fitted on

dummy_clf.score(X, y)

0.4428294573643411

### LinearRegression on immo_base : no score (training fails because `ocean_proximity` is a string column)

In [14]:
# Display the unchanged dataset to inspect its columns before selecting features
immo_base

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,2072,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,INLAND
1,10600,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,<1H OCEAN
2,2494,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,INLAND
3,4284,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,<1H OCEAN
4,16541,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,INLAND
...,...,...,...,...,...,...,...,...,...,...,...
16507,1099,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,INLAND
16508,18898,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,NEAR BAY
16509,11798,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,INLAND
16510,6637,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,<1H OCEAN


In [16]:
# Keep only the explanatory variables.
# Note: 'ocean_proximity' is still a string column at this stage.

features = [
    'longitude', 'latitude',
    'housing_median_age',
    'total_rooms', 'total_bedrooms',
    'population', 'households',
    'median_income',
    'ocean_proximity',
]

In [17]:
# Target Y is the median house value; X keeps only the selected feature columns.
Y = immo_base['median_house_value']
X = immo_base[features]

# Hold out 20% of the rows as the test set (fixed seed so the split is repeatable).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=5,
)

# Print the shape of each part of the split, one per line.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)), sep='\n')

(13209, 9)
(3303, 9)
(13209,)
(3303,)


In [15]:
# Model training (left disabled on purpose, see the note below)
 
#lmodellineaire = LinearRegression()
#lmodellineaire.fit(X_train, Y_train)

# Training does not work here because the "ocean_proximity" column holds strings

### Hold-out method with immo_base (DummyClassifier) : score = 0.44
<a class="anchor" id="hold"></a>

In [18]:
# Same 80/20 split (seed 5) of X / Y as in the regression section; shapes printed one per line.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=5,
)
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)), sep='\n')

(13209, 9)
(3303, 9)
(13209,)
(3303,)


In [19]:
# Hold-out evaluation of the dummy model: fit on a train split, score on a test split.
#
# X and y are rebuilt here on purpose. The X left over from the regression
# cells is immo_base[features], which still contains "ocean_proximity" (the
# classification target), and y would otherwise depend on which earlier cell
# was executed last.
y = immo_base["ocean_proximity"]
X = immo_base.drop(columns="ocean_proximity")

# Split the data 80/20, using random_state = 1 for the split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=.2,
                                                    random_state=1)

In [20]:
# Fresh baseline classifier ("most_frequent" strategy) for the hold-out evaluation

dummy_clf = DummyClassifier(strategy="most_frequent")

In [21]:
# Fit the baseline on the training split only

dummy_clf.fit(X_train, y_train)

In [22]:
# Score the baseline on the held-out test split

dummy_clf.score(X_test, y_test)

0.43808658795034816

## Iterations df_no_outliers : dataset without outliers 
<a class="anchor" id="ite_df_no_outliers"></a>

In [8]:
### Dummy Classifier on df_no_outliers : score = 0.006

In [10]:
# Display the dataset without outliers to inspect its columns
df_no_outliers

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,total_residents,households,median_income,median_house_value,ocean_proximity
0,0,2072,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,INLAND
1,1,10600,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,<1H OCEAN
2,2,2494,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,INLAND
3,3,4284,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,<1H OCEAN
4,4,16541,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,INLAND
...,...,...,...,...,...,...,...,...,...,...,...,...
14083,16507,1099,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,INLAND
14084,16508,18898,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,NEAR BAY
14085,16509,11798,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,INLAND
14086,16510,6637,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,<1H OCEAN


In [11]:
# Dummy baseline on df_no_outliers: DummyClassifier with strategy "most_frequent".
# The target (y) is "median_house_value"; the features (X) are every other column.
# NOTE(review): the immo_base baseline used "ocean_proximity" as the target, while this
# one gives a classifier what looks like a continuous value - confirm the intended
# target (or switch to a regression baseline) before comparing the two scores.

X = df_no_outliers.drop(columns="median_house_value")
y = df_no_outliers["median_house_value"]

In [12]:
# Instantiate the baseline classifier with the "most_frequent" strategy

dummy_clf = DummyClassifier(strategy="most_frequent")

In [13]:
# Fit the baseline on the full df_no_outliers data (no train/test split in this section)

dummy_clf.fit(X, y)

In [14]:
# Score the baseline on the same data it was fitted on

dummy_clf.score(X, y)

0.006459398069278819

## Iterations df_immobilier_no_null : valeurs nulles supprimées et ocean_proximity en binaire
<a class="anchor" id="ite_df_immobilier_no_null"></a>

### LinearRegression on df_immobilier_no_null : test R² = 0.65, test RMSE = 67483 (train : R² = 0.65, RMSE = 68663)

In [23]:
# Display the dataset with nulls removed and ocean_proximity split into binary columns
df_immobilier_no_null

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,total_residents,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0,2072,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,0.0,1.0,0.0,0.0,0.0
1,1,10600,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,1.0,0.0,0.0,0.0,0.0
2,2,2494,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,0.0,1.0,0.0,0.0,0.0
3,3,4284,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,1.0,0.0,0.0,0.0,0.0
4,4,16541,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16331,16507,1099,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,0.0,1.0,0.0,0.0,0.0
16332,16508,18898,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,0.0,0.0,0.0,1.0,0.0
16333,16509,11798,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,0.0,1.0,0.0,0.0,0.0
16334,16510,6637,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,1.0,0.0,0.0,0.0,0.0


In [24]:
# Keep only the explanatory variables: the numeric columns plus the
# binary ocean_proximity_* indicator columns.

features = [
    'longitude', 'latitude',
    'housing_median_age',
    'total_rooms', 'total_bedrooms',
    'total_residents', 'households',
    'median_income',
    'ocean_proximity_<1H OCEAN',
    'ocean_proximity_INLAND',
    'ocean_proximity_ISLAND',
    'ocean_proximity_NEAR BAY',
    'ocean_proximity_NEAR OCEAN',
]

In [25]:
# Target Y is the median house value; X keeps only the selected feature columns.
Y = df_immobilier_no_null['median_house_value']
X = df_immobilier_no_null[features]

# Separate training and test data: 20% of the rows are held out (fixed seed).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=5,
)

# Print the shape of each part of the split, one per line.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)), sep='\n')

(13068, 13)
(3268, 13)
(13068,)
(3268,)


In [26]:
# Create the linear regression and fit it on the training split in one step
# (fit returns the fitted estimator itself).
model = LinearRegression().fit(X_train, Y_train)

# Score the fitted model on the held-out test split and display the value.
baseline_score = model.score(X_test, Y_test)
baseline_score

0.6519937753249814

In [33]:
# Evaluate the fitted linear model on both the training split and the test split.


def report_regression_metrics(title, y_true, y_pred):
    """Print and return the RMSE and R2 of predictions against the true values.

    Parameters
    ----------
    title : str
        Heading printed above the metrics.
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(rmse, r2)``: the square root of the mean squared error, and the
        R2 score.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    print(title)
    print('--------------------------------------')
    # The value is the *root* of the mean squared error, so it is labelled RMSE.
    print("La racine de l'erreur quadratique moyenne (RMSE) est {}".format(rmse))
    print('le score R2 est {}'.format(r2))
    return rmse, r2


# Model evaluation on the training set
y_train_predict = model.predict(X_train)
rmse, r2 = report_regression_metrics(
    "La performance du modèle sur la base d'apprentissage",
    Y_train,
    y_train_predict,
)
print('\n')

# Model evaluation on the testing set (rmse / r2 end up holding the test values)
y_test_predict = model.predict(X_test)
rmse, r2 = report_regression_metrics(
    'La performance du modèle sur la base de test',
    Y_test,
    y_test_predict,
)

La performance du modèle sur la base dapprentissage
--------------------------------------
Lerreur quadratique moyenne est 68663.44392539366
le score R2 est 0.6463946596593321


La performance du modèle sur la base de test
--------------------------------------
Lerreur quadratique moyenne est 67482.94882137455
le score R2 est 0.6519937753249814
