In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors
import plotly.express as px
import seaborn as sns
from plotly.offline import init_notebook_mode,iplot
import folium
import geopandas as gpd
from folium.plugins import HeatMap
import plotly.graph_objs as go
import plotly.offline as pyo
import sklearn as metrics
from sklearn import tree, linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from matplotlib import rcParams
import scipy.stats as st
from sklearn.metrics import explained_variance_score, median_absolute_error, accuracy_score, confusion_matrix, classification_report, fbeta_score
from sklearn.model_selection import KFold, cross_val_score # Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler # Preprocessing
from sklearn.linear_model import Lasso, Ridge, ElasticNet, RANSACRegressor, SGDRegressor, HuberRegressor, BayesianRidge # Linear models
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor  # Ensemble methods
from sklearn.svm import SVR, SVC, LinearSVC  # Support Vector Regression
from sklearn.tree import DecisionTreeRegressor # Decision Tree Regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline # Streaming pipelines
from sklearn.decomposition import KernelPCA, PCA # Dimensionality reduction
from sklearn.feature_selection import SelectFromModel # Dimensionality reduction
from sklearn.model_selection import learning_curve, validation_curve, GridSearchCV # Model evaluation
from sklearn.base import clone # Clone estimator
from sklearn.metrics import mean_squared_error as MSE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from xgboost import XGBRegressor, plot_importance # XGBoost


In [None]:
## Importing the Data

data = pd.read_csv('kc_house_data.csv')

data.shape

In [None]:
data.head()

#Defining the features

Defining the features
**Id**: the unique number assigned to each house being sold.

**Date**: the date when the house was sold out.

**Price**:the price of the house.

**Bedrooms**:the number of bedrooms in the house.

**Bathrooms**:the number of bathrooms in the house.

**Sqft_living**:the Square footage of the apartments interior living space.

**Sqft_lot**:the Square footage of the land space.

**Floors**:Number of floors.

**Waterfront**:A dummy variable for whether the apartment was overlooking the waterfront or not.

**View**:An index from 0 to 4 of how good the view of the property was.

**Condition**:An index from 1 to 5 on the condition of the apartment.

**Grade**:An index from 1 to 13, where 1-3 falls short of building construction and design, 7 has an average level of construction and design, and 11-13 have a high quality level of construction and design.

**Sqft_above**: The square footage of the interior housing space that is above ground level.

**Sqft_basement**:The square footage of the interior housing space that is
below ground level.

**Yr_built**:The year the house was initially built

**Yr_renovated**:The year of the house's last renovation

**Zipcode**:What zipcode area the house is in

**Lat**:Latitude

**Long**:Longitude

**Sqft_living15**:The square footage of interior housing living space for the nearest 15 neighbors

**Sqft_lot15**:The square footage of the land lots of the nearest 15 neighbors

In [None]:
#Exploring Data

In [None]:
#Data inforamtion
data.info()

We have to change the type of the date column.
The other column types are good.

In [None]:
# Convert the date column to a real datetime type, then re-check the dtypes.
data['date'] = pd.to_datetime(data['date'])
data.info()

The date type is good now !

In [None]:
data.isnull().sum()

No null values in the Data

In [None]:
data.describe()

In [None]:
data[data["bedrooms"]==33]

We found an outlier: it is bizarre for a house to have 33 bedrooms with only 1.75 bathrooms and 1620 sqft of living space, so we have two choices:

First, we can replace the number of bedrooms with the median number of bedrooms in the data.

The second choice is to remove this row.

In [None]:
# Replace the implausible 33-bedroom entry with the median bedroom count.
# The median is not pulled upwards by the outlier itself and, cast to int, keeps
# the column integer-valued (the mean would store a fractional number of bedrooms).
data.loc[data['bedrooms']==33, 'bedrooms']= int(data['bedrooms'].median())

In [None]:
#we have to search if there is duplicated rows or not, and we look at it to decide what we will do with it

In [None]:
#Duplicated data
data_1=data[data.duplicated(subset= 'id')]

data_1

In [None]:
data

In [None]:
# index=False keeps the row index out of the file, so reading it back does not
# produce a spurious "Unnamed: 0" column.
data.to_csv("data1", index=False)

In [None]:
data1=pd.read_csv("data1")

##Data Visualization

We have to search for hidden outliers; to find them and fix them where we can,
we must create a boxplot!

In [None]:
data1.plot(kind = 'box', subplots= True , layout = (8,3), figsize = (20,20))
plt.show()

We have to delete some columns and change some data.

Delete : waterfront / view / lat / long/ zipcode /yr_build / yr_renovated; but we will not delete all of them now, because we need some of them for the analysis.

Change the max of bedrooms with the mean of the bedrooms column.

Delete some outliers of columns like : sqft_lot / sqft_lot15 / sqft_living_15 / sqft_above

Here, we have written a function to help us remove the outliers and make some changes in some columns.

In [None]:
def remove_outlier(house_df, column, whisker=1.5):
  """Return the rows of ``house_df`` whose ``column`` value lies inside the IQR fences.

  The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where Q1 and Q3
  are the 25th and 75th percentiles of ``house_df[column]``.

  Parameters
  ----------
  house_df : pandas.DataFrame
      Frame to filter. It is not modified.
  column : str
      Name of the numeric column used to detect outliers.
  whisker : float, default 1.5
      Multiplier applied to the IQR when computing the fences.

  Returns
  -------
  pandas.DataFrame
      The subset of ``house_df`` (original index preserved) within the fences.
  """
  q1 = house_df[column].quantile(0.25)
  q3 = house_df[column].quantile(0.75)

  iqr = q3 - q1
  lower = q1 - whisker * iqr
  # The upper fence is ABOVE the third quartile.
  upper = q3 + whisker * iqr

  # Filter the frame that was passed in, not a global.
  return house_df[house_df[column].between(lower, upper)]

In [None]:
lot = remove_outlier(data1, 'sqft_lot')

In [None]:
# Pass the previous result on, so this filter is meant to add to the earlier one instead of replacing it.
lot = remove_outlier(lot, 'sqft_lot15')

In [None]:
# Pass the previous result on, so this filter is meant to add to the earlier one instead of replacing it.
lot = remove_outlier(lot, 'sqft_living15')

In [None]:
# Pass the previous result on, so this filter is meant to add to the earlier one instead of replacing it.
lot = remove_outlier(lot, 'sqft_above')

We will use the boxplot again to check whether there are other outliers that we did not recognise in the first one.

In [None]:
lot.plot(kind = 'box', subplots= True , layout = (8,3), figsize = (20,20))
plt.show()

In this boxplot we can see that the outliers are still there. We can use them in our visualization analysis.

#The visualization part with all the data

In [None]:
#This is the geographic place for the houses.
m = folium.Map(location=[data1['lat'].mean(), data1['long'].mean()], zoom_start=10)
m

In [None]:
#This is the heat map in the geographic map for the houses
# Build the [lat, long] pairs in one vectorized step instead of iterating row by row.
heat_data = data1[['lat', 'long']].values.tolist()
# Add a single heat layer to the map; a second, unattached HeatMap object is not needed.
HeatMap(heat_data).add_to(m)
m

This geographic heat map shows us the location of the houses and where most of the houses in our dataset are located!

In [None]:
#we have to search if there is duplicated rows or not, and we look at it to decide what we will do with it

In [None]:
#Duplicated data
data_1=data1[data1.duplicated(subset= 'id')]
data_1

In [None]:
#This is the geo map fot the houses in data_1
# Geographic map centred on the houses in data_1 (rows whose id is duplicated).
m1 = folium.Map(location=[data_1['lat'].mean(), data_1['long'].mean()], zoom_start=10)
# Heat layer: build the [lat, long] pairs in one vectorized step and attach them once.
heat_data = data_1[['lat', 'long']].values.tolist()
HeatMap(heat_data).add_to(m1)
m1

This chart shows us the location of the houses that share a common id (i.e. the id appears more than once in the data)!

In [None]:
#this data is for the houses that has been renovated
# Columns needed to locate the renovated houses on a map.
Cols=['id','yr_renovated','lat','long']

# A yr_renovated of 0 is excluded; the remaining rows are re-indexed from 0.
renovated_mask = data1['yr_renovated'] != 0
data_2 = data1.loc[renovated_mask, Cols].reset_index(drop=True)

data_2


In [None]:
# Geographic map centred on the renovated houses.
m2 = folium.Map(location=[data_2['lat'].mean(), data_2['long'].mean()], zoom_start=10)
# Heat layer: build the [lat, long] pairs in one vectorized step and attach them once.
heat_data = data_2[['lat', 'long']].values.tolist()
HeatMap(heat_data).add_to(m2)
m2

In [None]:
#We will remove the 'id' columns but first we have to remove the duplicated id rows
data_1=data_1.drop(['id'],axis=1)
data1=data1.drop(['id'],axis=1)

This chart shows us the houses that have been renovated.

In [None]:
# Scatter of every house at its coordinates, coloured by price on a log scale.
plt.figure(figsize=(15,10))
vmin = np.min(data1.price)
vmax = np.max(data1.price)
# The colour range is narrowed to [2*min, max/3]; prices outside it are clipped to the end colours.
norm = colors.LogNorm(vmin*2,vmax/3)
plt.scatter(data1.long,data1.lat, marker='*',c=data1.price,norm=norm,cmap='jet')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('House Price by Geography')
clb = plt.colorbar()
clb.ax.set_title('Price')

In [None]:
plt.figure(figsize=(8,8))
plt.hist(data1.price,bins=100,color='b')
plt.title('Histogram of House Price')
plt.show()

In [None]:
# One bin per construction year.
hist_yr_b = [go.Histogram(x=data1.yr_built,xbins=dict(start=np.min(data1.yr_built),size=1,end=np.max(data1.yr_built)),marker=dict(color='rgb(0,102,0)'))]

histlayout2 = go.Layout(title="Built Year Counts",xaxis=dict(title="Years"),yaxis=dict(title="Built Counts"))

histfig2 = go.Figure(data=hist_yr_b,layout=histlayout2)

# Plot the figure (trace + layout) so the title and axis labels are actually shown.
iplot(histfig2)

In [None]:
# One bin per renovation year.
# NOTE(review): an earlier cell treats yr_renovated == 0 as "not renovated" (it filters on != 0),
# so the 0 bin may dominate this histogram - consider excluding it.
hist_yr_rn = [go.Histogram(x=data1.yr_renovated,xbins=dict(start=np.min(data1.yr_renovated),size=1,end=np.max(data1.yr_renovated)),marker=dict(color='rgb(0,102,0)'))]

histlayout3 = go.Layout(title="renovate Year Counts",xaxis=dict(title="Years"),yaxis=dict(title="renovate Counts"))

histfig3 = go.Figure(data=hist_yr_rn,layout=histlayout3)

# Plot the figure (trace + layout) so the title and axis labels are actually shown.
iplot(histfig3)

In [None]:
#Create Grade Frame
# Number of houses per grade, indexed by a readable "Grade <n>" label.
grade_counts = data1.grade.value_counts()
gradeframe = pd.DataFrame({
    "Grades": ["Grade " + str(grade) for grade in grade_counts.index],
    "House_Grade": grade_counts.values,
}).set_index("Grades")
gradeframe

In [None]:
p1 = [go.Pie(labels = gradeframe.index,values = gradeframe.House_Grade,hoverinfo="percent+label+value",hole=0.1,marker=dict(line=dict(color="#000000",width=2)))]

layout4 = go.Layout(title="Grade Pie Chart")

fig4 = go.Figure(data=p1,layout=layout4)

iplot(fig4)

In [None]:
# Histogram of the grade index.
hist_grade = [go.Histogram(x=data1.grade,marker=dict(color='rgb(102, 0, 102)'))]

histlayout1 = go.Layout(title="Grade Counts",xaxis=dict(title="Grades"),yaxis=dict(title="Counts"))

histfig1 = go.Figure(data=hist_grade,layout=histlayout1)

# Plot the figure (trace + layout) so the title and axis labels are actually shown.
iplot(histfig1)

In [None]:
fig_his_1 = px.histogram(data1,x='sqft_living',nbins=15)
fig_his_1

In this scatter plot we can see many things and investigate some of them; for example, we can look into two things:

First, the data point where the sqft_lot is a little more than 1.6M while the price is 700K (probably a farm with a very small house).

Second, the point where the price is the maximum while the sqft_lot is 27.6K sqft (probably the house is in a luxury area).

In [None]:
trace2 = px.scatter(data1,x='sqft_living',y='price')
trace2

We have to change the data type of the date column.

In [None]:
data1['date']= pd.to_datetime(data1['date'])

In [None]:
plt.figure(figsize=(20,7))
# numeric_only=True restricts the correlation matrix to numeric columns, keeping the
# datetime 'date' column (converted in the previous cell) out of the computation.
sns.heatmap(data1.corr(numeric_only=True) , annot = True, cmap = "Blues")

In [None]:
# Create a 3D scatterplot

In [None]:
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(data1['sqft_living'], data1['price'], data1['condition'], c=data1['condition'], cmap='viridis')
ax.set_xlabel('Sqft Living')
ax.set_ylabel('Price')
ax.set_zlabel('Condition')

# Add a color bar to the right of the plot
colorbar = plt.colorbar(scatter)
colorbar.set_label('Condition')

plt.show()

In [None]:
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(data1['sqft_living'], data1['price'], data1['floors'], c=data1['floors'], cmap='viridis')
ax.set_xlabel('Sqft Living')
ax.set_ylabel('Price')
ax.set_zlabel('Floors')

# Add a color bar to the right of the plot
colorbar = plt.colorbar(scatter)
colorbar.set_label('floors')

plt.show()

In [None]:
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(data1['sqft_living'], data1['price'], data1['grade'], c=data1['grade'], cmap='viridis')
ax.set_xlabel('Sqft Living')
ax.set_ylabel('Price')
ax.set_zlabel('Grade')

# Add a color bar to the right of the plot
colorbar = plt.colorbar(scatter)
colorbar.set_label('grade')

plt.show()

In [None]:
# Frequency of each bedroom count (kept as a variable for inspection).
bedrooms = data1.bedrooms.value_counts()


plt.figure(figsize = (12,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)


# Top-left: how many houses have each number of bedrooms.
ax1 = plt.subplot(221)
ax1 = sns.countplot(x="bedrooms", data=data1,
                    ax=ax1)
ax1.set_title("bedrooms counting", fontsize=15)
# This panel shows bedrooms, so the x label must say so.
ax1.set_xlabel("Bedrooms number")
ax1.set_ylabel("count")

# Top-right: price against bedrooms with a fitted regression line.
ax2 = plt.subplot(222)
ax2 = sns.regplot(x="bedrooms", y='price',
                  data=data1, ax=ax2, x_jitter=True)
ax2.set_title("Bedrooms distribution price", fontsize=15)
ax2.set_xlabel("Bedrooms number")
ax2.set_ylabel("Price(US)")

# Bottom: individual prices per bedroom count.
ax0 = plt.subplot(212)
ax0 = sns.stripplot(x="bedrooms", y="price",
                    data=data1)
ax0.set_title("Better understaning price", fontsize=15)
ax0.set_xlabel("Bedrooms")
ax0.set_ylabel("Price(US)")

plt.show()

In [None]:
# Round the fractional bathroom counts to whole numbers so they can be used as categories.
# Note: this overwrites data1["bathrooms"] in place.
data1["bathrooms"] = data1['bathrooms'].round(0).astype(int)

print("Freuency bathroom description:")
print(data1["bathrooms"].value_counts())

plt.figure(figsize = (12,8))
plt.subplots_adjust(hspace = 0.4, top = 0.8)

# Top-left: how many houses have each (rounded) number of bathrooms.
ax1 = plt.subplot(221)
ax1 = sns.countplot(x="bathrooms", data=data1,
                    ax=ax1)
ax1.set_xticklabels(ax1.get_xticklabels(),rotation=90)
ax1.set_title("Bathrooms counting", fontsize=15)
ax1.set_xlabel("Bathrooms number")
# "count" belongs on the y axis; setting it as the x label would overwrite the line above.
ax1.set_ylabel("count")

# Top-right: price distribution per bathroom count.
ax2 = plt.subplot(222)
ax2 = sns.boxplot(x="bathrooms", y='price',
                  data=data1, ax=ax2)
ax2.set_xticklabels(ax2.get_xticklabels(),rotation=90)
ax2.set_title("Bathrooms distribution price", fontsize=15)
ax2.set_xlabel("Bathrooms number")
ax2.set_ylabel("Price(US)")

# Bottom: individual prices per bathroom count, coloured by condition.
ax0 = plt.subplot(212)
ax0 = sns.stripplot(x="bathrooms", y="price",
                    data=data1 , alpha=0.5,
                    jitter=True, hue="condition")
ax0.set_title("Better view distribuition through price", fontsize=15)
ax0.set_xlabel("Bathroom number")
ax0.set_ylabel("Price(US)")
ax0.set_xticklabels(ax0.get_xticklabels(),rotation=90)

plt.show()

In [None]:
condition = data1['condition'].value_counts()

print("Condition counting: ")
print(condition)

fig, ax = plt.subplots(ncols=2, figsize=(14,5))
sns.countplot(x='condition', data=data1, ax=ax[0])
sns.boxplot(x='condition', y= 'price',
            data=data1, ax=ax[1])
plt.show()

In [None]:
data1

In [None]:
# index=False keeps the row index out of the file, so reading it back does not
# produce a spurious "Unnamed: 0" column.
data1.to_csv("data2", index=False)

In [None]:
# Read back the file written by the previous cell ("data2"), not the older "data1" snapshot.
data2=pd.read_csv("data2")

In [None]:
data=data2

In [None]:
data['date']= pd.to_datetime(data['date'])


In [None]:
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day

data = data.drop("date",axis=1)

##Machine Learning Part

In [None]:
#XGBoost Model

In [None]:
# Feature matrix: everything except the target, the house identifier and any saved
# row-index column ("Unnamed: ..."), none of which is a property of the house.
# Columns that are already absent are simply not listed, so this is safe either way.
non_feature_cols = [c for c in data.columns
                    if c in ("price", "id") or str(c).startswith("Unnamed")]
X = data.drop(columns=non_feature_cols).values
y = data["price"].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=3)

In [None]:
# Fit a default XGBRegressor and keep only the features whose importance is at
# least `thresh`.
thresh = 5 * 10**(-3)
model = XGBRegressor()
model.fit(X_train, y_train)
#select features using threshold
# prefit=True: reuse the already-fitted `model` instead of fitting it again.
selection = SelectFromModel(model, threshold=thresh, prefit=True)
select_X_train = selection.transform(X_train)
# eval model
# NOTE(review): the "val" and "test" arrays below are both derived from X_test, so
# they are identical; none of the select_X_* arrays is used by the visible cells
# that follow (the next model is fitted on the full X_train).
select_X_val = selection.transform(X_test)
# test
select_X_test = selection.transform(X_test)

In [None]:
# Tuned XGBoost regressor used for the predictions below.
# NOTE(review): this rebinds the name `xgb`, which the import cell binds to the
# xgboost module (`import xgboost as xgb`); after this cell `xgb` is the regressor.
xgb = XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75, colsample_bytree=1, max_depth=7)

In [None]:
xgb.fit(X_train, y_train)

In [None]:
y_hat = xgb.predict(X_test)

In [None]:
xgb.score(X_test,y_test)

In [None]:
predictions = xgb.predict(X_test)
# explained_variance_score expects (y_true, y_pred); the score is not symmetric,
# so the ground truth must come first.
print(explained_variance_score(y_test, predictions))

In [None]:
#Linear Regression Model

In [None]:
regr = linear_model.LinearRegression()
new_data = data[['sqft_living','grade', 'sqft_above', 'sqft_living15','bathrooms','view','sqft_basement','lat','waterfront','yr_built','bedrooms']]

In [None]:
# Feature matrix: everything except the target, the house identifier and any saved
# row-index column ("Unnamed: ..."), none of which is a property of the house.
# Columns that are already absent are simply not listed, so this is safe either way.
non_feature_cols = [c for c in data.columns
                    if c in ("price", "id") or str(c).startswith("Unnamed")]
X = data.drop(columns=non_feature_cols).values
y = data["price"].values

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25, random_state=1)

In [None]:
# StandardScaler is already imported in the import cell at the top of the notebook.
scaler = StandardScaler()
# Learn the mean/variance from the training data only ...
X_train_scaler = scaler.fit_transform(X_train)
# ... and apply that same scaling to the test data. Re-fitting on the test set would
# scale it with different statistics than the model was trained on.
X_test_scaler = scaler.transform(X_test)

In [None]:
lin = LinearRegression()

In [None]:
# Expand the scaled features to all polynomial terms up to degree 3.
poly = PolynomialFeatures(degree=3)
X_poly_train = poly.fit_transform(X_train_scaler)
# Reuse the transformer fitted on the training data; the test set is only transformed.
X_test_poly = poly.transform(X_test_scaler)
# The transformer must not be re-fitted on its own (already expanded) output, so the
# only remaining fit is the linear model on the expanded training features.
lin.fit(X_poly_train, y_train)

In [None]:
y_pred = lin.predict(X_test_poly)
y_pred

In [None]:
mean_absolute_error(y_test, y_pred)

In [None]:
# Mean absolute error on the training data, for comparison with the test error.
y_pred_train = lin.predict(X_poly_train)
mean_absolute_error(y_train, y_pred_train)

In [None]:
# Rebuild the degree-3 polynomial features: fit on the training data, then only
# transform the test data with the same fitted transformer.
poly = PolynomialFeatures(degree=3)
X_poly_train = poly.fit_transform(X_train_scaler)
X_test_poly = poly.transform(X_test_scaler)
# The transformer is left as fitted on X_train_scaler; re-fitting it on its own
# expanded output would make it expect the expanded feature count as input.

In [None]:
regr.fit(X_train, y_train)
print(regr.predict(X_test))

In [None]:
regr.score(X_test,y_test)

In [None]:
import math

# Root Mean Squared Error of the plain linear model on the held-out set.
residuals = regr.predict(X_test) - y_test
rmse = math.sqrt(np.mean(residuals ** 2))
print("RMSE: %.2f" % rmse)