In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import datetime as dt
# Load the King County house-sales data; the path is relative to the notebook's working directory.
df = pd.read_csv("./kc_house_data.csv")
# Preview the first rows only instead of echoing the whole frame.
df.head()

| Variable      | Description                                                                                                 |
| ------------- | ----------------------------------------------------------------------------------------------------------- |
| id            | A notation for a house                                                                                      |
| date          | Date house was sold                                                                                         |
| price         | Sale price of the house (the prediction target)                                                             |
| bedrooms      | Number of bedrooms                                                                                          |
| bathrooms     | Number of bathrooms                                                                                         |
| sqft_living   | Square footage of the home                                                                                  |
| sqft_lot      | Square footage of the lot                                                                                   |
| floors        | Total floors (levels) in house                                                                              |
| waterfront    | House which has a view to a waterfront                                                                      |
| view          | Has been viewed                                                                                             |
| condition     | How good the condition is overall                                                                           |
| grade         | Overall grade given to the housing unit, based on the King County grading system                            |
| sqft_above    | Square footage of house apart from basement                                                                 |
| sqft_basement | Square footage of the basement                                                                              |
| yr_built      | Built Year                                                                                                  |
| yr_renovated  | Year when house was renovated                                                                               |
| zipcode       | Zip code                                                                                                    |
| lat           | Latitude coordinate                                                                                         |
| long          | Longitude coordinate                                                                                        |
| sqft_living15 | Living room area in 2015 (implies some renovations); this may or may not have affected the lot size area   |
| sqft_lot15    | Lot size area in 2015 (implies some renovations)                                                            |

In [None]:
# Frequency of each renovation-year value.
# NOTE(review): the draft in the next cell treats 0 as "never renovated" — confirm against the data source.
df["yr_renovated"].value_counts()

In [None]:
# Abandoned draft (kept for reference, never executed): derive the age from the
# renovation year when there is one, otherwise from the construction year.
# NOTE(review): as written it would not run — `yr_built_or_reno` is read before it is
# assigned, `df["yr_renovated"]==0` compares a whole column (an `if` cannot evaluate that),
# and the `return` sits only inside the `else` branch.
# def to_age():
#     to_age=2023-yr_built_or_reno
#     if df["yr_renovated"]==0:
#         yr_built_or_reno=df["yr_built"]
#     else:
#         yr_built_or_reno=df["yr_renovated"]
#         return to_age


In [None]:
# df["yr_renovated"].apply(to_age)

In [None]:
# Age of each house in years, counted from its construction year up to a fixed
# reference year (renovations are not taken into account).
REFERENCE_YEAR = 2023
df["age"] = REFERENCE_YEAR - df["yr_built"]

In [None]:
# Quick look at the derived `age` column.
df["age"]

In [None]:
# How many houses were built in each year.
df["yr_built"].value_counts()

In [None]:
# Snapshot of the frame (including the derived `age`) taken before any columns are
# dropped, so id / lat / long / zipcode remain available here.
df_og=df.copy()

In [None]:
# Dtypes and non-null counts before any column is dropped or converted.
df.info()

In [None]:
# The identifier and location columns are not used in the analysis below.
# Rebinding `df` (instead of inplace=True) keeps the step an ordinary expression;
# the untouched columns are still available in the `df_og` snapshot.
df = df.drop(columns=["id", "lat", "long", "zipcode"])
# Preview only the first rows rather than echoing the whole frame.
df.head()

In [None]:
# Convert the `date` column to pandas datetimes so it can be used on a time axis
# and is excluded from the numeric-only steps below.
df["date"]=pd.to_datetime(df["date"])

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Number of fully duplicated rows. `id` was dropped above, so rows are compared
# on the remaining columns only.
df.duplicated().sum()

In [None]:
# Re-check dtypes after dropping columns and converting `date`.
df.info()

In [None]:
# For every numeric column of the uncapped data draw a density histogram (with KDE)
# next to a box plot, to spot skew and outliers before capping.
for column in df.select_dtypes(include="number").columns:
    fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(15, 7))

    # sns.distplot is deprecated; histplot with kde=True and stat="density"
    # is its documented replacement.
    sns.histplot(df[column], bins=15, kde=True, stat="density", ax=ax_dist)
    ax_dist.set_title('dist plot of ' + column, fontsize=15)

    # A single box needs a colour, not a palette (palette without hue is deprecated).
    sns.boxplot(y=df[column], color=sns.color_palette("Dark2")[0], ax=ax_box)
    ax_box.set_title('Boxplot of ' + column, fontsize=15)

    plt.show()
    # Release the figure so a long loop does not accumulate open figures.
    plt.close(fig)

In [None]:
# Work on a copy so the outlier capping below leaves `df` untouched for comparison.
df_capped=df.copy()

In [None]:
# Cap every numeric column of `df_capped` at the 1.5 * IQR fences:
# values above Q3 + 1.5*IQR become that upper limit, values below Q1 - 1.5*IQR
# become that lower limit; everything in between is kept.
# NOTE: when a column's IQR is 0 (Q1 == Q3) both limits equal that value, so the
# whole column collapses to a constant.
for column in df_capped.select_dtypes(include="number").columns:
    Q1 = df_capped[column].quantile(0.25)
    Q3 = df_capped[column].quantile(0.75)
    IQR = Q3 - Q1
    max_limit = Q3 + 1.5 * IQR
    min_limit = Q1 - 1.5 * IQR
    # Series.clip keeps the column's own index, so nothing depends on index alignment
    # with a freshly built frame. The float cast mirrors the float result the limits
    # (which are floats) produced before.
    df_capped[column] = (
        df_capped[column].astype("float64").clip(lower=min_limit, upper=max_limit)
    )

In [None]:
# Same density histogram + box plot pair as for the raw data, now on the capped
# columns, to check the effect of the capping.
for column in df_capped.select_dtypes(include="number").columns:
    fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(15, 7))

    # sns.distplot is deprecated; histplot with kde=True and stat="density"
    # is its documented replacement.
    sns.histplot(df_capped[column], bins=15, kde=True, stat="density", ax=ax_dist)
    ax_dist.set_title('dist plot of ' + column, fontsize=15)

    # A single box needs a colour, not a palette (palette without hue is deprecated).
    sns.boxplot(y=df_capped[column], color=sns.color_palette("Dark2")[0], ax=ax_box)
    ax_box.set_title('Boxplot of ' + column, fontsize=15)

    plt.show()
    # Release the figure so a long loop does not accumulate open figures.
    plt.close(fig)

In [None]:
# Summary statistics of the uncapped data (one row per numeric column).
df.describe().T

In [None]:
# Summary statistics after capping; compare min/max with the uncapped table.
df_capped.describe().T

In [None]:
# Number of sales per sale date. The frame loaded from kc_house_data.csv has no
# "X1 transaction date" column (see the data dictionary); the sale date is `date`.
df_capped["date"].value_counts()

In [None]:
# Histogram of sale dates in 12 bins; each non-empty bar is labelled with its count.
# Uses `date`, the sale-date column of this dataset.
ax1 = sns.histplot(df_capped["date"], bins=12, kde=False)
ax1.bar_label(ax1.containers[0], fmt=lambda x: f'{x:0.0f}' if x > 0 else '', label_type='edge')

In [None]:
# Largest house age after capping. The age column of this dataset is the derived `age`.
df_capped["age"].max()

In [None]:
# Histogram of house age in 4 bins; each non-empty bar is labelled with its count.
# Uses the derived `age` column of this dataset.
ax2 = sns.histplot(df_capped["age"], bins=4)
ax2.bar_label(ax2.containers[0], fmt=lambda x: f'{x:0.0f}' if x > 0 else '', label_type='edge')

In [None]:
# Histogram in 6 bins with a count label on each non-empty bar.
# NOTE(review): "X3 distance to the nearest MRT station" is not in the data dictionary
# for kc_house_data.csv and is never created in this notebook, so this lookup presumably
# raises KeyError — confirm and replace with the intended column.
ax3=sns.histplot(df_capped["X3 distance to the nearest MRT station"],bins=6)
ax3.bar_label(ax3.containers[0], fmt=lambda x: f'{x:0.0f}' if x > 0 else '', label_type='edge')

In [None]:
# Histogram in 5 bins with a count label on each non-empty bar.
# NOTE(review): "X4 number of convenience stores" is not in the data dictionary for
# kc_house_data.csv and is never created in this notebook, so this lookup presumably
# raises KeyError — confirm and replace with the intended column.
ax4=sns.histplot(df_capped["X4 number of convenience stores"],bins=5)
ax4.bar_label(ax4.containers[0], fmt=lambda x: f'{x:0.0f}' if x > 0 else '', label_type='edge')

In [None]:
# Histogram of the (capped) sale price in 5 bins; each non-empty bar is labelled with
# its count. The target column of this dataset is `price`.
ay = sns.histplot(df_capped["price"], bins=5)
ay.bar_label(ay.containers[0], fmt=lambda x: f'{x:0.0f}' if x > 0 else '', label_type='edge')

In [None]:
# Sale price over time: seaborn aggregates the prices that share a sale date and
# draws the aggregate as a line with star markers.
# Uses `date` / `price`, the sale-date and target columns of this dataset.
timeseres_x1_y = sns.lineplot(
    x=df_capped["date"],
    y=df_capped["price"],
    marker="*",
    markerfacecolor='Red',
    markersize=10,
)
timeseres_x1_y.set_title(label="Time Series between Transaction Date and House Price")
timeseres_x1_y.set_xlabel(xlabel="Transaction Date")
# `price` is the sale price of the whole house, not a per-unit-area price.
timeseres_x1_y.set_ylabel(ylabel="House Price")

In [None]:
# Treat the sale date as a categorical label from here on. The sale-date column of
# this dataset is `date`; "X1 transaction date" does not exist in this frame.
df_capped["date"] = df_capped["date"].astype("category")

In [None]:
# Dtypes and non-null counts of the capped frame.
df_capped.info()

In [None]:
# Number of sales per sale date. The column is passed as `x=` explicitly: the first
# positional argument of countplot is `data`, not the variable to count.
sns.countplot(x=df_capped["date"])

In [None]:
# Pairwise scatter plots (histograms on the diagonal) of the capped data.
# NOTE(review): the grid grows with the square of the number of columns, so this
# can be slow to draw on a wide frame.
sns.pairplot(df_capped)

In [None]:
# Print seaborn's heatmap docstring for reference (exploration aid only).
print(sns.heatmap.__doc__)

In [None]:
# Notes on the sns.heatmap parameters used below:
# annot=True                 write the value inside each cell
# cmap                       colormap name, e.g. "viridis", "coolwarm", "Blues", "RdBu"
# cbar                       draw the colorbar or not (0/False = no, 1/True = yes)
# linewidths                 width of the lines separating the cells
# xticklabels, yticklabels   labels for the columns / rows

In [None]:
# Pairwise correlation of the numeric columns of the capped data
# (numeric_only=True leaves non-numeric columns out), shown as an annotated heatmap.
# `cr` is reused by the heatmap cells below.
cr=df_capped.corr(numeric_only=True)
sns.heatmap(cr,annot=True,cmap="coolwarm")

In [None]:
# Boolean mask that hides the upper triangle and the diagonal. Building it from the
# matrix shape (not from the correlation values) guarantees that an upper-triangle
# correlation of exactly 0 is hidden as well.
matrix = np.triu(np.ones_like(cr, dtype=bool))
sns.heatmap(
    cr,
    annot=True,
    cmap="RdBu",
    vmin=-1,
    center=0,
    vmax=1,
    cbar=True,
    linewidths=2,
    square=True,
    mask=matrix,
    # Take the labels from `cr` itself: a hard-coded list must have exactly one entry
    # per column, and six "X…" names do not match this dataset's correlation matrix.
    xticklabels="auto",
    yticklabels="auto",
)
plt.title("Correlation heatmap (lower triangle)")

In [None]:
# Labels for the correlation plot below. They must come from `cr` itself: `df.columns`
# also contains the non-numeric `date`, which is not part of the correlation matrix,
# so its names would be shifted against the cells.
col = cr.columns.to_list()

In [None]:
# The same correlation matrix drawn with plain matplotlib.
fig, hm = plt.subplots()
mesh = hm.pcolormesh(cr.to_numpy(), cmap='RdBu', vmin=-1, vmax=1)

# pcolormesh draws cell (i, j) over [j, j+1] x [i, i+1], so the cell centres sit at
# k + 0.5. Ticks are fixed there before the labels are attached; the labels are read
# from `cr` so that they always match the plotted matrix.
centers = np.arange(len(cr.columns)) + 0.5
hm.set_xticks(centers)
hm.set_xticklabels(cr.columns)
hm.set_yticks(centers)
hm.set_yticklabels(cr.index)

hm.set_title("Correlation heatmap")
plt.setp(hm.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
plt.setp(hm.get_yticklabels(), rotation=0, ha="right")
fig.colorbar(mesh, ax=hm)
plt.show()


In [None]:
# Drop the house-age feature. In this dataset that column is the derived `age`
# (2023 - yr_built), which carries the same information as `yr_built`.
# The longitude column (`long`) was already removed together with id / lat / zipcode,
# and "X2 house age" / "X6 longitude" do not exist in this frame.
df_capped = df_capped.drop(columns=["age"])

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Pick the target by name. Positional slicing (`iloc[:, -1]` / `iloc[:, 1:-1]`) takes
# whichever column happens to be last as the target and, with the column order of the
# data dictionary, leaves `price` among the features.
data_y = df_capped["price"]
# Features: every remaining numeric column (this leaves the sale date out).
data_x = df_capped.drop(columns=["price"]).select_dtypes(include="number")

# 80/20 split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data_x, data_y, test_size=0.2, random_state=21, shuffle=True
)
LR = linear_model.LinearRegression()
LR.fit(x_train, y_train)

In [None]:
import statsmodels.api as sm

# `LR` was fitted in the previous cell; only the predictions are computed here.
y_pred = LR.predict(x_test)
y_train_pred = LR.predict(x_train)

# The design matrix with intercept gets its own name, so `x_train` keeps exactly the
# columns `LR` was trained on and this cell can be re-run safely.
# has_constant="add" forces the intercept column: with the default ("skip") no
# intercept is added if any feature is already constant, which can happen after
# capping a column whose IQR is 0.
x_train_const = sm.add_constant(x_train, has_constant="add")
model = sm.OLS(y_train, x_train_const).fit()
print(model.summary())

In [None]:
# The design matrix with intercept gets its own name, so `x_test` keeps exactly the
# columns `LR` was trained on and `LR.predict(x_test)` keeps working on a re-run.
# has_constant="add" forces the intercept column even if a feature is constant.
x_test_const = sm.add_constant(x_test, has_constant="add")
# NOTE(review): this fits a second, independent OLS on the test split; it does not
# evaluate `model` (fitted on the training split) on held-out data.
model1 = sm.OLS(y_test, x_test_const).fit()
print(model1.summary())

In [None]:
# One scatter plot with a fitted regression line per numeric feature, each drawn
# against the training target in the left half of a 15x7 figure.
numeric_features = data_x.select_dtypes(include='number').columns
for column in numeric_features:
    fig = plt.figure(figsize=(15, 7))
    left_ax = fig.add_subplot(1, 2, 1)
    sns.regplot(x=x_train[column], y=y_train, ax=left_ax)
    plt.show()

In [None]:
# Intercept of the fitted scikit-learn linear regression.
LR.intercept_

In [None]:
# Coefficients of the fitted regression, in the column order of the training features.
LR.coef_

Clustering