# Data Collection
## Import the required libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

## Load the dataset

In [None]:
# Read the raw listings from housePrice.csv (path relative to the working directory).
df = pd.read_csv("housePrice.csv")
# Bare expression: displayed as the cell output when run in a notebook.
df

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of df's columns.
df.describe()

# Data Preprocessing

In [None]:
# Print every column with its dtype and non-null count; this is where the
# missing addresses and the non-numeric Area column show up.
df.info(verbose=True)

#### * Look at Address
Some values in this column are missing (None/NaN).
## Step 1: Drop rows with a NaN address from the dataframe

In [None]:
# Discard listings without an address and renumber the remaining rows.
df = df.dropna(subset=["Address"]).reset_index(drop=True)
# df1 is the working copy that gets encoded and cleaned; df stays as loaded.
df1 = df.copy()

## Step 2:Change string feature to numeric
Convert the True/False columns to 0/1 so that they can be used in the following steps.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the three amenity columns one at a time. The same encoder
# object is re-fitted for each column, so after the loop it holds the classes
# of the last column only.
le = LabelEncoder()
input1 = ["Parking", "Warehouse", "Elevator"]
for _col in input1:
    df1[_col] = le.fit_transform(df1[_col])
df1

## Step 3:Understanding Outliers
The `ISM` function detects and removes outliers in a numeric column of a DataFrame. It computes the column's mean and standard deviation and treats every value more than 5 standard deviations away from the mean as an outlier; the rows holding those values are dropped. The column must already be numeric, so the commas in `Area` are removed and the column is converted to floating-point numbers before `ISM` is called.
### * In the info cell we found out that Area is not just a numeric data

In [None]:
# Try to parse Area as numbers; entries that cannot be parsed become NaN,
# which marks the rows whose Area is not a plain number.
a = pd.to_numeric(df["Area"], errors='coerce')
idx = a.isna()
df.loc[idx]

### * Remove commas and convert to numeric

In [None]:
# Cast Area to text, delete every comma, then parse the result as float.
df1["Area"] = df1["Area"].astype(str).str.replace(',', '').astype(float)

### * Drop Outliers

In [None]:
def ISM(df, Column_name, n_std=5):
    """Drop rows whose value in ``Column_name`` lies far from the column mean.

    A value counts as an outlier when it is more than ``n_std`` standard
    deviations away from the mean of the column. The matching rows are
    removed from ``df`` in place and the index is then reset to 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    Column_name : str
        Name of a numeric column of ``df``.
    n_std : float, default 5
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.Series
        The removed values, labelled with the index they had *before* the
        index of ``df`` was reset. Empty when nothing was removed.
    """
    column_data = df[Column_name]
    mean = column_data.mean()
    threshold = n_std * column_data.std()
    is_outlier = (column_data < mean - threshold) | (column_data > mean + threshold)
    # Boolean indexing already yields a new Series, so the result is
    # unaffected by the in-place drop below.
    outliers = column_data[is_outlier]
    df.drop(outliers.index, inplace=True)  # remove the outlier rows
    df.reset_index(drop=True, inplace=True)  # make the index sequential again
    return outliers

In [None]:
# Remove Area outliers from df1 (in place) and report what was dropped.
out = ISM(df1, "Area")
if out.empty:
    print("We don't have Outliers")
else:
    print("Outliers in the column:")
    print(out)

# Feature Engineering

## Step 1:sort by Addresses 
We know that the average price differs between the areas of Tehran, so we sort the areas from cheapest to most expensive; this ordering makes it easier to find a well-fitting function for our regression.
### Sort by mean price of each Area

In [None]:
# Mean price per square metre of every address, cheapest first.
# The addresses are taken from df1 (the cleaned frame) rather than df: an
# address whose rows were all removed as Area outliers would otherwise get a
# NaN mean here and still be given a rank in the next step.
unique_add = df1["Address"].unique()
price_per_sqm = df1["Price"] / df1["Area"]
# One grouped pass instead of re-filtering the whole frame for each address.
mean_by_address = price_per_sqm.groupby(df1["Address"]).mean()
Add = list(unique_add)
Mean = [mean_by_address.loc[address] for address in Add]
per_area = pd.DataFrame({'Address': Add, 'Price': Mean})
per_area = per_area.sort_values(by='Price').reset_index(drop=True)
per_area

### * We change the addresses to numeric values for use in the regression model and drop the extra columns

In [None]:
# Replace each address by its rank in per_area (1 = lowest mean price per
# square metre) and turn Price into price per square metre (PSM).
address_rank = {
    address: rank
    for rank, address in enumerate(per_area["Address"], start=1)
}
# Built in one vectorised pass. Growing df2 with pd.concat inside a loop
# re-copied the accumulated frame on every iteration and started from an
# empty DataFrame; the loop also reused the name `idx`, overwriting the Area
# mask computed earlier.
df2 = df1[df1["Address"].isin(list(address_rank))].copy()
df2["Address"] = df2["Address"].map(address_rank)
df2["Price"] = df2["Price"] / df2["Area"]
df2 = df2.rename(columns={"Price": "PSM"})
df2 = df2.drop(["Area", "Price(USD)"], axis=1)
# A stable sort keeps the original row order inside each address, giving the
# same address-by-address layout as concatenating one address at a time.
df2 = df2.sort_values(by="Address", kind="stable")
df2.reset_index(drop=True, inplace=True)
# Drop PSM outliers in place; the returned Series of removed values is not used.
ISM(df2, "PSM")
df2

In [None]:
# Pairwise correlation matrix of df2's columns, drawn as a heat map.
sns.heatmap(df2.corr())

### Address has the most effect on price. :)  
We can also see that the effect of most of the other features is below 0.5 (on a scale whose maximum is 1); we will look at them after the address effect.

In [None]:
# Grid of pairwise scatter plots for df2's columns.
sns.pairplot(df2)

### * Creating train and test dataset

In [None]:
# Random split: each row goes to the training set with probability 0.8.
# A seeded generator makes the split, and every metric and plot derived from
# it, reproducible between runs; the unseeded np.random.rand call produced a
# different split on each execution.
rng = np.random.default_rng(42)
msk = rng.random(len(df2)) < 0.8
train = df2[msk]
test = df2[~msk]

# Simple Regression Model
## Address Effect
First of all, we try to fit a regression for the house price per area, using the price per square meter for the 192 areas.

In [None]:
# Training data: price per square metre against the ranked address.
plt.scatter(train["Address"], train["PSM"], color='blue', s=2)
plt.grid(True)
plt.xlabel("Address (numric)")
plt.ylabel("Price (Per squre meter)")
plt.title("House price per Area")
plt.show()

## Step 1: Data normalization

In [None]:
def anyarray(Data):
    """Return the "Address" and "PSM" columns as max-scaled column vectors.

    Parameters
    ----------
    Data : pandas.DataFrame
        Frame with numeric "Address" and "PSM" columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(Normalx, Normaly)``, each of shape ``(len(Data), 1)``: the
        "Address" and "PSM" values divided by the maximum of their own
        column. The scaling is relative to ``Data`` itself, so two different
        frames are in general scaled by different factors.
    """
    Data_x = np.asanyarray(Data[["Address"]])
    Data_y = np.asanyarray(Data[["PSM"]])
    # Column maximum computed by NumPy. The builtin max() walked the 2-D
    # array row by row in Python, comparing one-element arrays.
    Normalx = Data_x / Data_x.max(axis=0)
    Normaly = Data_y / Data_y.max(axis=0)
    return Normalx, Normaly

## Step 2: Model Training

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

# Cubic polynomial regression of the scaled PSM on the scaled address rank.
train_x, train_y = anyarray(train)
poly = PolynomialFeatures(degree=3)
train_x_poly = poly.fit_transform(train_x)
clf = linear_model.LinearRegression()
# Note: fit() returns the fitted estimator itself, not predictions; the name
# is kept only because it is an existing notebook-level variable.
train_y_hat = clf.fit(train_x_poly, train_y)
popt = clf.coef_[0]

# beta_0 is the fitted intercept. popt[0] is only the weight of the constant
# column added by PolynomialFeatures; the intercept that the prediction
# formula uses is stored separately in clf.intercept_.
print(" beta_0 = %.2f, beta_1 = %.2f, beta_2 = %.2f, beta_3 = %.2f" % (clf.intercept_[0], popt[1], popt[2], popt[3]))

## Step 3: Model Prediction

In [None]:
# Scaled training points with the fitted cubic evaluated on 0.0, 0.1, ..., 1.0.
plt.scatter(train_x, train_y, color='blue', s=2, label='data')
X = np.arange(0.0, 1.1, 0.1)
Y = (
    clf.intercept_[0]
    + popt[1] * X
    + popt[2] * np.power(X, 2)
    + popt[3] * np.power(X, 3)
)
plt.plot(X, Y, '--r', label='fit', linewidth=2)
plt.grid(True)
plt.xlabel("Address (numric)")
plt.ylabel("Price (Per squre meter)")
plt.title("House price per Area")
plt.legend(loc='best')
plt.show()

## Step 4: Model Evaluation

In [None]:
from sklearn.metrics import r2_score

# Scale the held-out data with the TRAINING maxima. The model was fitted on
# training values divided by the training maxima; anyarray(test) would divide
# by the test set's own maxima, so inputs and targets would be on a different
# scale from the one the model was trained on.
test_x = np.asanyarray(test[["Address"]]) / train["Address"].max()
test_y = np.asanyarray(test[["PSM"]]) / train["PSM"].max()
# transform(), not fit_transform(): reuse the expansion fitted on the training data.
test_x_poly = poly.transform(test_x)
test_y_hat = clf.predict(test_x_poly)

print("Mean absolute error: %.4f" % np.mean(np.absolute(test_y_hat - test_y)))
print("Mean squares  error: %.4f" % np.mean((test_y_hat - test_y) ** 2))
print("R2-score: %.2f" % r2_score(test_y,test_y_hat ) )

### * We decide to use the mean price of each area as the representative value for that place

In [None]:
def Mean_func(dataframe):
    """Average PSM per address, returned as scaled column vectors.

    For every distinct value of ``dataframe["Address"]`` the mean of the
    matching "PSM" values is computed. The (address, mean) pairs are sorted
    by mean and handed to ``anyarray``, whose ``(x, y)`` result is returned.
    """
    addresses = dataframe["Address"].unique()
    means = [
        dataframe.loc[dataframe["Address"] == address, "PSM"].mean()
        for address in addresses
    ]
    summary = pd.DataFrame({'Address': list(addresses), 'PSM': means})
    return anyarray(summary.sort_values(by='PSM'))

### * Make our model

In [None]:
def Model(train_x, train_y, degree=3):
    """Fit a polynomial least-squares regression of ``train_y`` on ``train_x``.

    Parameters
    ----------
    train_x, train_y : array-like
        Training inputs and targets; the callers in this notebook pass the
        2-D column vectors produced by ``anyarray``.
    degree : int, default 3
        Degree of the polynomial feature expansion.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        Estimator fitted on the polynomial expansion of ``train_x``. New
        inputs must be expanded the same way before calling ``predict``.
    """
    features = PolynomialFeatures(degree=degree).fit_transform(train_x)
    # fit() returns the estimator itself, so it is returned directly instead
    # of being bound to a misleading "y_hat" name.
    return linear_model.LinearRegression().fit(features, train_y)


# Parking effect

In [None]:
# Per-address mean PSM, computed separately for listings with and without parking.
True_p = train.loc[train["Parking"] == 1]
False_p = train.loc[train["Parking"] == 0]
tp_x, tp_y = Mean_func(True_p)
fp_x, fp_y = Mean_func(False_p)

plt.scatter(tp_x, tp_y, color='blue', s=10, label='Has Parking')
plt.scatter(fp_x, fp_y, color='red', s=10, label='No Parking')
plt.grid(True)
plt.xlabel("Address (numric)")
plt.ylabel("Price (Per squre meter)")
plt.title("House price per Area")
plt.legend(loc='best')
plt.show()

## Step 1: Model Training

In [None]:
# Fit one cubic per group and evaluate both curves on 0.0, 0.1, ..., 1.0.
X = np.arange(0.0, 1.1, 0.1)
tp_reg = Model(tp_x, tp_y)
fp_reg = Model(fp_x, fp_y)
Yt = (
    tp_reg.intercept_[0]
    + tp_reg.coef_[0][1] * X
    + tp_reg.coef_[0][2] * np.power(X, 2)
    + tp_reg.coef_[0][3] * np.power(X, 3)
)
Yf = (
    fp_reg.intercept_[0]
    + fp_reg.coef_[0][1] * X
    + fp_reg.coef_[0][2] * np.power(X, 2)
    + fp_reg.coef_[0][3] * np.power(X, 3)
)

## step 2: Model Prediction

In [None]:
# Two panels side by side: group means and fitted curve, with / without parking.
plt.figure(figsize=(10, 5))

# Left panel: listings with parking.
plt.subplot(1, 2, 1)
plt.scatter(tp_x, tp_y, color='blue', s=15, label='Has Parking')
plt.plot(X, Yt, '--g', label='fit Has Parking', linewidth=3)
plt.grid(True)
plt.xlabel("Address (numric)")
plt.ylabel("Price (Per squre meter)")
plt.title("House price per Area")
plt.legend(loc='best')

# Right panel: listings without parking.
plt.subplot(1, 2, 2)
plt.scatter(fp_x, fp_y, color='red', s=15, label='No Parking')
plt.plot(X, Yf, '--g', label='fit No Parking', linewidth=3)
plt.grid(True)
plt.xlabel("Address (numric)")
plt.ylabel("Price (Per squre meter)")
plt.title("House price per Area")
plt.legend(loc='best')

plt.tight_layout()
plt.show()

# Warehouse effect

In [None]:
# Per-address mean PSM, computed separately for listings with and without a warehouse.
True_W = train.loc[train["Warehouse"] == 1]
False_W = train.loc[train["Warehouse"] == 0]
tw_x, tw_y = Mean_func(True_W)
fw_x, fw_y = Mean_func(False_W)

plt.scatter(tw_x, tw_y, color='blue', s=10, label='Has Warehouse')
plt.scatter(fw_x, fw_y, color='red', s=10, label='No Warehouse')
plt.grid(True)
plt.xlabel("Address (numric)")
plt.ylabel("Price (Per squre meter)")
plt.title("House price per Area")
plt.legend(loc='best')
plt.show()

## Step 1: Model Training

In [None]:
# Fit one cubic per group and evaluate both curves on 0.0, 0.1, ..., 1.0.
X = np.arange(0.0, 1.1, 0.1)
tw_reg = Model(tw_x, tw_y)
fw_reg = Model(fw_x, fw_y)
Yt = (
    tw_reg.intercept_[0]
    + tw_reg.coef_[0][1] * X
    + tw_reg.coef_[0][2] * np.power(X, 2)
    + tw_reg.coef_[0][3] * np.power(X, 3)
)
Yf = (
    fw_reg.intercept_[0]
    + fw_reg.coef_[0][1] * X
    + fw_reg.coef_[0][2] * np.power(X, 2)
    + fw_reg.coef_[0][3] * np.power(X, 3)
)

## step 2: Model Prediction

In [None]:
# Two panels side by side: group means and fitted curve, with / without a warehouse.
plt.figure(figsize=(10, 5))

# Left panel: listings with a warehouse.
plt.subplot(1, 2, 1)
plt.scatter(tw_x, tw_y, color='blue', s=15, label='Has Warehouse')
plt.plot(X, Yt, '--g', label='fit Has Warehouse', linewidth=3)
plt.grid(True)
plt.xlabel("Address (numric)")
plt.ylabel("Price (Per squre meter)")
plt.title("House price per Area")
plt.legend(loc='best')

# Right panel: listings without a warehouse.
plt.subplot(1, 2, 2)
plt.scatter(fw_x, fw_y, color='red', s=15, label='No Warehouse')
plt.plot(X, Yf, '--g', label='fit No Warehouse', linewidth=3)
plt.grid(True)
plt.xlabel("Address (numric)")
plt.ylabel("Price (Per squre meter)")
plt.title("House price per Area")
plt.legend(loc='best')

plt.tight_layout()
plt.show()


# Elevator effect

In [None]:
# Per-address mean PSM, computed separately for listings with and without an elevator.
True_E = train.loc[train["Elevator"] == 1]
False_E = train.loc[train["Elevator"] == 0]
te_x, te_y = Mean_func(True_E)
fe_x, fe_y = Mean_func(False_E)

plt.scatter(te_x, te_y, color='blue', s=10, label='Has Elevator')
plt.scatter(fe_x, fe_y, color='red', s=10, label='No Elevator')
plt.grid(True)
plt.xlabel("Address (numric)")
plt.ylabel("Price (Per squre meter)")
plt.title("House price per Area")
plt.legend(loc='best')
plt.show()

## Step 1: Model Training

In [None]:
# Fit one cubic per group and evaluate both curves on 0.0, 0.1, ..., 1.0.
X = np.arange(0.0, 1.1, 0.1)
te_reg = Model(te_x, te_y)
fe_reg = Model(fe_x, fe_y)
Yt = (
    te_reg.intercept_[0]
    + te_reg.coef_[0][1] * X
    + te_reg.coef_[0][2] * np.power(X, 2)
    + te_reg.coef_[0][3] * np.power(X, 3)
)
Yf = (
    fe_reg.intercept_[0]
    + fe_reg.coef_[0][1] * X
    + fe_reg.coef_[0][2] * np.power(X, 2)
    + fe_reg.coef_[0][3] * np.power(X, 3)
)

## Step 2: Model Prediction

In [None]:
# Two panels side by side: group means and fitted curve, with / without an elevator.
plt.figure(figsize=(10, 5))

# Left panel: listings with an elevator.
plt.subplot(1, 2, 1)
plt.scatter(te_x, te_y, color='blue', s=15, label='Has Elevator')
plt.plot(X, Yt, '--g', label='fit Has Elevator', linewidth=3)
plt.grid(True)
plt.xlabel("Address (numric)")
plt.ylabel("Price (Per squre meter)")
plt.title("House price per Area")
plt.legend(loc='best')

# Right panel: listings without an elevator.
plt.subplot(1, 2, 2)
plt.scatter(fe_x, fe_y, color='red', s=15, label='No Elevator')
plt.plot(X, Yf, '--g', label='fit No Elevator', linewidth=3)
plt.grid(True)
plt.xlabel("Address (numric)")
plt.ylabel("Price (Per squre meter)")
plt.title("House price per Area")
plt.legend(loc='best')

plt.tight_layout()
plt.show()

# Result

|Regression models|  Detail |
|---|---|
|  clf  | predict price without Parking, Warehouse, Elevator effect |
| tp_reg | predict price with Parking effect |
| fp_reg | predict price without Parking effect |
| tw_reg | predict price with Warehouse effect |
| fw_reg | predict price without Warehouse effect |
| te_reg | predict price with Elevator effect |
| fe_reg | predict price without Elevator effect |