# Projet
Bienvenue chez EPSI DEV! Votre première mission consiste à construire un modèle
des prix immobiliers en Californie en vous basant sur les données de recensement.
Ces données comportent un certain nombre de variables quantitatives telles que la
population, le revenu médian et le prix médian des habitations pour chacun des
“Blocks” recensés en Californie. 

Ces “Blocks” constituent la plus petite subdivision pour laquelle les services américains du recensement publient des données(en
général, ils regroupent de 600 à 3000 personnes). Nous les appellerons “districts”
dans ce qui suit.

Votre modèle devra apprendre de ces données et être capable de prédire le prix
médian des habitations dans chaque district, en fonction des autres variables
connues.

Pour l’instant nous allons juste nous focaliser sur l’analyse exploratoire et le
nettoyage des données du dataset, le reste du travail sera fait par la suite lorsque
nous aurons assimiler des notions plus approfondies sur le machine learning.

# Importation des librairies

In [10]:
import pandas as pd

# Lecture du dataset

In [11]:
df = pd.read_csv("housing.csv")

# Affichage des 5 premières lignes et des 5 dernières lignes

In [12]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [13]:
from math import sqrt, cos, sin, asin

distance_array = []

for long,lat in zip(df['longitude'], df['latitude']):
    lat2 = 37.787994
    long2 = -122.407437
    distance_array.append(2 * 6371 * asin(sqrt(sin(((lat2 - lat)/2) + cos(lat) * cos(lat2) * sin((long2 - long)/2)**2)**2)))

df.drop(['longitude', 'latitude'], axis=1, inplace=True)
df.reset_index(inplace=True)

df['distance'] = distance_array

df

Unnamed: 0,index,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,distance
0,0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,488.161612
1,1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,349.038415
2,2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,307.302745
3,3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,317.448836
4,4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,317.448836
...,...,...,...,...,...,...,...,...,...,...
20635,20635,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,11771.469185
20636,20636,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,11723.575548
20637,20637,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,11094.433869
20638,20638,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,11002.664697


# Informations sur le dataset (nombre de lignes, nombre de colonnes, type de chaque colonne, nombre de valeurs non nulles)

In [14]:
# Description du jeu de données
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               20640 non-null  int64  
 1   housing_median_age  20640 non-null  float64
 2   total_rooms         20640 non-null  float64
 3   total_bedrooms      20433 non-null  float64
 4   population          20640 non-null  float64
 5   households          20640 non-null  float64
 6   median_income       20640 non-null  float64
 7   median_house_value  20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
 9   distance            20640 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 1.6+ MB


# Affichage du nombre de valeurs qualitative

In [15]:
# Afficher les modalité de la variables `ocean_proximity`
df["ocean_proximity"].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

# Analyse descriptive univariée

In [16]:
df.describe(include="all")

Unnamed: 0,index,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,distance
count,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0,20640,20640.0
unique,,,,,,,,,5,
top,,,,,,,,,<1H OCEAN,
freq,,,,,,,,,9136,
mean,10319.5,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909,,10525.970502
std,5958.399114,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874,,6421.444615
min,0.0,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0,,0.41628
25%,5159.75,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0,,3915.792398
50%,10319.5,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0,,11520.65884
75%,15479.25,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0,,16322.339072


# Bibliothèque pour la visualisation

In [17]:
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'matplotlib'

# Affichage de la localisation des maisons avec un scatter et alpha

In [None]:
plt.scatter(df["longitude"], df["latitude"], alpha=0.1)
plt.show()

# Répartition des localisations des maisons

In [None]:
plt.figure(figsize=(10, 20))
plt.pie(x=df['ocean_proximity'].value_counts(), labels=df['ocean_proximity'].value_counts().index, autopct='%0.2f%%')
plt.title("Répartition des valeurs de la variable `ocean_proximity`")
plt.savefig("repartition_ocean_proximity.png")
plt.show()

# Affichage de la répartition des prix

In [None]:
plt.hist(df["median_house_value"], bins=50)
plt.show()


# Localisation des maisons

In [18]:
import folium
import folium.plugins

cali_map = folium.Map(location=["36.778259", "-119.417931"], zoom_start=11)
marker_cluster = folium.plugins.MarkerCluster().add_to(cali_map)

for lat, lng, in zip(df["latitude"], df["longitude"]):
    folium.Marker([lat, lng]).add_to(marker_cluster)

cali_map.save("cali_map.html")
cali_map

ModuleNotFoundError: No module named 'folium'

# Affichage de la corélation entre le prix et le nombre de chambre

In [19]:
# Correlate median_house_value and total_rooms
plt.scatter(df["median_house_value"], df["total_rooms"], alpha=0.4)

NameError: name 'plt' is not defined

# Test de corrélation

In [20]:
# Correlate median_house_value and median_income
corr_matrix = df.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

ValueError: could not convert string to float: 'NEAR BAY'

In [None]:
import seaborn as sns

In [None]:
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(corr_matrix, annot=True, ax=ax, cmap='RdYlGn', center=0)

# Tests de variance ou d'anova
Quand PR(>F) est inférieur à 0.05, on peut correler les variables, sinon on ne peut pas les corréler.

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

model = ols('median_house_value ~ ocean_proximity', data=df).fit()
anova_result = sm.stats.anova_lm(model, typ=2)
anova_result

In [None]:
# =acos(sin(lat1)sin(lat2)+cos(lat1)cos(lat2)cos(lon2-lon1))6371 (6371 is Earth radius in km.)
from math import acos, sin, cos, radians, asin


def get_distance(lat, lon):
    # point centrale de la Californie
    lat_cal, lon_cal = 0.6497784708819503, -2.095224709746546
    
    # convertir les degrés en radians
    lat = radians(lat)
    lon = radians(lon)
    
    # calculer la distance
    return acos((sin(lat) * sin(lat_cal)) + (cos(lat) * cos(lat_cal) * cos(lon_cal-lon))) * 6371

In [None]:
# add new column with distance and drop lat lon

# loop over the dataframe, get the lat, lon and index
for lat, lon, index in zip(df["latitude"], df["longitude"], df.index):
    # add the distance to the dataframe at the index
    df.loc[index, "distance"] = get_distance(lat, lon)


# drop latitude and longitude
df.drop(["latitude", "longitude"], axis=1, inplace=True)

# Gestion des valeurs manquantes

In [None]:
df.isna().sum()

# Drop the rows with missing values

In [None]:
# inplace=True to modify the dataframe
df.dropna(inplace=True)

In [None]:
# Afficher le nombre de valeurs manquantes par variable après le nettoyage
df.isna().sum()

In [21]:
# Show the number of duplicated rows
df[df.duplicated()]

Unnamed: 0,index,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,distance


# Tukey diagram for all the variables

In [22]:
# iterate over columns, create boxplot
# figure is created for each column to avoid overlapping
for col, _ in df.items():
    plt.figure()
    df.boxplot([col])

NameError: name 'plt' is not defined

In [23]:
# each line is a block

import seaborn as sns

sns.boxplot( y = df["median_income"], data=df)
plt.show()

ModuleNotFoundError: No module named 'seaborn'

# Remove values between min and max

In [24]:
# max = q3 + 1.5 * IQR
# min = q1 - 1.5 * IQR

import numpy as np

# Compute quartiles and interquartile range
q1, q3 = np.percentile(df["median_income"], [25, 75])
iqr = q3 - q1

# Compute outlier boundaries
upper_bound = q3 + 1.5 * iqr
lower_bound = q1 - 1.5 * iqr

# Filter outliers
outliers = df[(df["median_income"] > upper_bound) | (df["median_income"] < lower_bound)]

# remove from df
df = df.drop(outliers.index)

In [25]:
# each line is a block

import seaborn as sns

sns.boxplot( y = df["median_income"], data=df)
plt.show()

ModuleNotFoundError: No module named 'seaborn'

In [26]:
df.to_csv("clean_dataset.csv")