# ImmoEliza Data Analysis

## Data Cleaning

### Import Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


### Setting the dataset in a variable called "data"

In [None]:
# Widen the notebook display: show up to 35 columns, and allow 120 characters
# per cell so that long values such as URLs stay fully visible and clickable.
pd.set_option("display.max_columns", 35)
pd.set_option("display.max_colwidth", 120)

# Load the dataset into `data` and report its (rows, columns).
data = pd.read_json("final_dataset.json")
data.shape

### Checking if there are any duplicates

In [None]:
# Keep only the first row found for each PropertyId (modifies `data` in place),
# then show the (rows, columns) so it can be compared with the shape above.
data.drop_duplicates(subset=["PropertyId"], keep="first", inplace=True)
data.shape



### It seems there are none, so let's move on to the next step: checking for null values

In [None]:
# Number of missing values in each column (summing the null mask down the rows).
data.isnull().sum(axis=0)

### Get rid of null values when possible

In [None]:
# Columns in which a missing value is replaced by 0.
zero_default_columns = [
    "BathroomCount",
    "Fireplace",
    "Furnished",
    "Garden",
    "GardenArea",
    "SwimmingPool",
    "Terrace",
    "ToiletCount",
]

# Build the filled version of those columns, then write it back into `data`
# in place; all other columns are left untouched.
zero_filled = data[zero_default_columns].fillna(0)
data.update(zero_filled)
data.shape

### Get rid of potential encoding errors

In [None]:
# Sale types that are removed from the dataset altogether.
annuity_sale_types = [
    "annuity_monthly_amount",
    "annuity_without_lump_sum",
    "annuity_lump_sum",
]

# A row is discarded as soon as it matches at least one of these conditions.
# Comparisons involving a missing value evaluate to False, so rows with
# missing values in the compared columns are kept.
invalid_rows = (
    (data["BathroomCount"] > data["BedroomCount"])
    | (data["ConstructionYear"] > 2033)
    | (data["GardenArea"] > data["SurfaceOfPlot"])
    | (data["PostalCode"] < 1000)
    | (data["NumberOfFacades"] > 4)
    | (data["Price"] > 15000000)
    | (data["ToiletCount"] > 58)
    | (data["ShowerCount"] > 58)
    | data["TypeOfSale"].isin(annuity_sale_types)
)

# Remove all flagged rows from `data` in place, in a single pass.
data.drop(index=data.loc[invalid_rows].index, inplace=True)

data.shape

### Super, it looks like we deleted more than 4000 erroneous rows. Great!

### Separate sale data and rent data so that each can be analysed on its own

In [None]:
# Split the cleaned data by sale type: one frame for residential sales and
# one for residential monthly rentals.
is_sale = data["TypeOfSale"] == "residential_sale"
is_rent = data["TypeOfSale"] == "residential_monthly_rent"
sale_data = data.loc[is_sale]
rent_data = data.loc[is_rent]

# Print the (rows, columns) of each subset, sales first.
for subset in (sale_data, rent_data):
    print(subset.shape)

## Data Analysis

### See how many rows and columns we have

In [None]:
# (rows, columns) of the sale subset, then of the rent subset, one per line.
print(sale_data.shape, rent_data.shape, sep="\n")

In [None]:
# Preview the first five rows of the sale subset.
sale_data.head(n=5)

In [None]:
# Preview the first five rows of the rent subset.
rent_data.head(n=5)

### See how price varies with location

#### Price/Region

In [None]:
# Mean price per region, rounded, computed separately for the sale subset
# and for the rent subset.
price_sale_region = sale_data.groupby("Region")[["Price"]].mean().round()
price_rent_region = rent_data.groupby("Region")[["Price"]].mean().round()

# Each plot.bar() call returns the Axes it drew on. Labelling through that
# Axes ties every title to its own chart, and the titles now say whether the
# chart shows sale prices or rent prices (previously both charts carried the
# same title and could not be told apart).
sale_ax = price_sale_region.plot.bar()
sale_ax.set_title("Mean sale price by region")
sale_ax.set_xlabel("Regions")
sale_ax.set_ylabel("Mean Price")

rent_ax = price_rent_region.plot.bar()
rent_ax.set_title("Mean rent price by region")
rent_ax.set_xlabel("Regions")
rent_ax.set_ylabel("Mean Price")

plt.show()

#### Price/Province

In [None]:
# Mean price per province, rounded and sorted from most to least expensive,
# computed separately for the sale subset and for the rent subset.
price_sale_province = (
    sale_data.groupby("Province")[["Price"]]
    .mean()
    .round()
    .sort_values("Price", ascending=False)
)
price_rent_province = (
    rent_data.groupby("Province")[["Price"]]
    .mean()
    .round()
    .sort_values("Price", ascending=False)
)

# Each plot.bar() call returns the Axes it drew on. Labelling through that
# Axes ties every title to its own chart, and the titles now say whether the
# chart shows sale prices or rent prices (previously both charts carried the
# same title and could not be told apart).
sale_ax = price_sale_province.plot.bar()
sale_ax.set_title("Mean sale price by province")
sale_ax.set_xlabel("Provinces")
sale_ax.set_ylabel("Mean Price")

rent_ax = price_rent_province.plot.bar()
rent_ax.set_title("Mean rent price by province")
rent_ax.set_xlabel("Provinces")
rent_ax.set_ylabel("Mean Price")

plt.show()

#### Price/District

In [None]:
# Mean price per district, rounded and sorted from most to least expensive,
# computed separately for the sale subset and for the rent subset.
price_sale_district = (
    sale_data.groupby("District")[["Price"]]
    .mean()
    .round()
    .sort_values("Price", ascending=False)
)
price_rent_district = (
    rent_data.groupby("District")[["Price"]]
    .mean()
    .round()
    .sort_values("Price", ascending=False)
)

# Each plot call returns the Axes it drew on. Labelling through that Axes
# ties every title to its own chart, and the titles now say whether the
# chart shows sale prices or rent prices (previously both charts carried the
# same title and could not be told apart).
sale_ax = price_sale_district.plot.bar()
sale_ax.set_title("Mean sale price per district")
sale_ax.set_xlabel("Districts")
sale_ax.set_ylabel("Mean Price")

# Districts are discrete categories, so the rent chart is drawn as a bar
# chart like the sale chart and the region/province charts, instead of an
# area chart.
rent_ax = price_rent_district.plot.bar()
rent_ax.set_title("Mean rent price per district")
rent_ax.set_xlabel("Districts")
rent_ax.set_ylabel("Mean Price")

plt.show()

In [None]:
# price_sale_district = sale_data.groupby("District")[["Price"]].mean().round().sort_values("Price",ascending=False)
# price_rent_district = rent_data.groupby("District")[["Price"]].mean().round().sort_values("Price",ascending=False)

In [None]:
# Preview the first five rows of the sale subset.
sale_data.head(n=5)

In [None]:
# Strip plot (catplot's default kind) of individual sale prices, one column
# of points per district.
p = sns.catplot(
    x="District",
    y="Price",
    data=sale_data,
    kind="strip",
)

# Tilt the district names on the x axis by 45 degrees.
plt.xticks(rotation=45)
plt.show()