# SOMMAIRE:
* [Importations](#import)
* [Baseline](#base)
* [Iterations](#ite)


## Importations <a class="anchor" id="import"></a>

In [2]:
# Import pandas

import pandas as pd

In [3]:
# Import sklearn

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import learning_curve

from sklearn.linear_model import LinearRegression

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, \
    confusion_matrix, classification_report
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score, plot_roc_curve, roc_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve,plot_precision_recall_curve

In [4]:
# Import iterations

immo_base = pd.read_csv('train_data.csv')
df_immobilier_no_null =  pd.read_csv('df_immobilier_no_null.csv')
df_immobilier_null_mean =  pd.read_csv('df_immobilier_null_mean.csv')
df_immobilier_null_median =  pd.read_csv('df_immobilier_null_median.csv')
df_no_outliers = pd.read_csv('df_no_outliers.csv')

## Baseline test <a class="anchor" id="base"></a>

In [None]:
#On affiche la description du jeu de données

df_immobilier.DESCR.split("\n")

In [None]:
#on utilise seulement 4 variables explicatives

X=pd.DataFrame(np.c_[df_immobilier['ocean'],df_immobilier['house'],df_immobilier['TAX'],df_immobilier['PTRATIO']], columns = ['LSTAT','RM','TAX','PTRATIO'])
Y = df_immobilier['sales']

#base d'apprentissage et base de test

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

In [None]:
#entrainement du modèle
 
lmodellineaire = LinearRegression()
lmodellineaire.fit(X_train, Y_train)

In [None]:
# Evaluation du training set, r2

y_train_predict = lmodellineaire.predict(X_train)
rmse = (np.sqrt(mean_squared_error(Y_train, y_train_predict)))
r2 = r2_score(Y_train, y_train_predict)
 
print('La performance du modèle sur la base dapprentissage')
print('--------------------------------------')
print('Lerreur quadratique moyenne est {}'.format(rmse))
print('le score R2 est {}'.format(r2))
print('\n')
 
# model evaluation for testing set

y_test_predict = lmodellineaire.predict(X_test)
rmse = (np.sqrt(mean_squared_error(Y_test, y_test_predict)))
r2 = r2_score(Y_test, y_test_predict)
 
print('La performance du modèle sur la base de test')
print('--------------------------------------')
print('Lerreur quadratique moyenne est {}'.format(rmse))
print('le score R2 est {}'.format(r2))

## Baseline <a class="anchor" id="base"></a>

In [None]:
# Let's start building a dummy model using DummyClassifier with strategy "most_frequent". And calculate the score
# To do so, use all features in X and Outcome as target (y)
# define X, y

y = diabetes_df["Ocean"]
X = diabetes_df.drop("", axis=1)

In [None]:
# instantiate Dummy classifier

dummy_clf = DummyClassifier(strategy="most_frequent")

In [None]:
# fit the modem

dummy_clf.fit(X, y)

In [None]:
# calculate the score

dummy_clf.score(X, y)

## Hold out method <a class="anchor" id="hold"></a>
split the date in 70% train and 30% test.

In [None]:
#Fit the dummy model with the train and score with the test

# Splitting the data, use random_state = 1 for the split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.3, 
                                                    random_state=1)

In [None]:
# using DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")

In [None]:
# fit the model

dummy_clf.fit(X_train, y_train)

In [None]:
# calculate the score

dummy_clf.score(X_test, y_test)

## Iterations <a class="anchor" id="ite"></a>

### Iteration sur immo_base : dataset inchangé

In [5]:
immo_base

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,2072,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,INLAND
1,10600,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,<1H OCEAN
2,2494,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,INLAND
3,4284,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,<1H OCEAN
4,16541,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,INLAND
...,...,...,...,...,...,...,...,...,...,...,...
16507,1099,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,INLAND
16508,18898,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,NEAR BAY
16509,11798,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,INLAND
16510,6637,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,<1H OCEAN


In [9]:
# on utilise seulement nos variables explicatives

features = [
 'longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'ocean_proximity']

In [10]:
# On choisi notre target Y et on place nos features sur X
X = immo_base[features]
Y = immo_base['median_house_value']

# base d'apprentissage et base de test

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(13209, 9)
(3303, 9)
(13209,)
(3303,)


In [11]:
# Entrainement du modèle
 
lmodellineaire = LinearRegression()
lmodellineaire.fit(X_train, Y_train)

# l'entrainement ne fonctionne pas car la colonne "ocean_proximity" est en string

ValueError: could not convert string to float: 'INLAND'

### Iteration sur df_immobilier_no_null : dataset sans les null

In [5]:
df_immobilier_no_null

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,total_residents,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0,2072,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,0.0,1.0,0.0,0.0,0.0
1,1,10600,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,1.0,0.0,0.0,0.0,0.0
2,2,2494,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,0.0,1.0,0.0,0.0,0.0
3,3,4284,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,1.0,0.0,0.0,0.0,0.0
4,4,16541,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16331,16507,1099,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,0.0,1.0,0.0,0.0,0.0
16332,16508,18898,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,0.0,0.0,0.0,1.0,0.0
16333,16509,11798,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,0.0,1.0,0.0,0.0,0.0
16334,16510,6637,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,1.0,0.0,0.0,0.0,0.0


In [14]:
# on utilise seulement nos variables explicatives

features = [
 'longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'total_residents',
 'households',
 'median_income',
 'ocean_proximity_<1H OCEAN',
 'ocean_proximity_INLAND',
 'ocean_proximity_ISLAND',
 'ocean_proximity_NEAR BAY',
 'ocean_proximity_NEAR OCEAN']

In [15]:
# On choisi notre target Y et on place nos features sur X
X = df_immobilier_no_null[features]
Y = df_immobilier_no_null['median_house_value']

# base d'apprentissage et base de test, separation données test et train 

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(13068, 13)
(3268, 13)
(13068,)
(3268,)


In [18]:
# Choose your model
model = LinearRegression()

# Fit the model with the train set
model.fit(X_train, Y_train)

# Evaluate the model with the test set
baseline_score = model.score(X_test, Y_test)
baseline_score

0.6519937753249814

In [16]:
#Model plus lisible mais non fonctionnel en etat

# Evaluation du training set, r2

y_train_predict = lmodellineaire.predict(X_train)
rmse = (np.sqrt(mean_squared_error(Y_train, y_train_predict)))
r2 = r2_score(Y_train, y_train_predict)
 
print('La performance du modèle sur la base dapprentissage')
print('--------------------------------------')
print('Lerreur quadratique moyenne est {}'.format(rmse))
print('le score R2 est {}'.format(r2))
print('\n')
 
# model evaluation for testing set

y_test_predict = lmodellineaire.predict(X_test)
rmse = (np.sqrt(mean_squared_error(Y_test, y_test_predict)))
r2 = r2_score(Y_test, y_test_predict)
 
print('La performance du modèle sur la base de test')
print('--------------------------------------')
print('Lerreur quadratique moyenne est {}'.format(rmse))
print('le score R2 est {}'.format(r2))

Feature names unseen at fit time:
- ocean_proximity_<1H OCEAN
- ocean_proximity_INLAND
- ocean_proximity_ISLAND
- ocean_proximity_NEAR BAY
- ocean_proximity_NEAR OCEAN
- ...
Feature names seen at fit time, yet now missing:
- ocean_proximity
- population



AttributeError: 'LinearRegression' object has no attribute 'coef_'