In [None]:
#import all python modules for activities
import csv
import json
import math
import os
import statistics as st
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from scipy.stats import linregress, pearsonr

from config import api_key

In [None]:
#read the Melbourne City Real Estate data set
#the untouched load is kept as melb_raw so the cleaning below can be re-run without re-reading the file
melb_raw = pd.read_csv('../Resources/Property Sales of Melbourne City.csv')

#drop every row that has a missing value in any column
#(pass subset=[...] to dropna to restrict the check to specific columns instead)
melb_data = melb_raw.dropna()

#print check -- kept as the last expression, otherwise the notebook does not display it
melb_data.head()


In [None]:
#renumber the rows from 0 after the earlier row drops, discarding the old index labels
melb_data = melb_data.reset_index(drop=True)

#no header row has to be skipped by hand: pd.read_csv already used it for the column names

In [None]:
#Drop outliers with the IQR rule: a row is kept only if every numeric value in it
#lies within [Q1 - k*IQR, Q3 + k*IQR] of its own column, where k is iqr_multiplier.
#NOTE: this cell rebinds melb_data, so running it twice filters the already-filtered data again.
#NOTE(review): every numeric column is filtered, including any identifier-like or
#coordinate columns the CSV may contain -- confirm that is intended.

#define a threshold for IQR multiplier
iqr_multiplier = 1.5

#Compute the bounds of ALL numeric columns from the same, unfiltered data.
#Filtering inside a per-column loop made each column's quartiles depend on the rows
#already removed for earlier columns, so the result changed with column order.
numeric_data = melb_data.select_dtypes(include='number')
first_quartiles = numeric_data.quantile(0.25)
third_quartiles = numeric_data.quantile(0.75)
iqr_by_column = third_quartiles - first_quartiles
upper_bounds = third_quartiles + iqr_multiplier * iqr_by_column
lower_bounds = first_quartiles - iqr_multiplier * iqr_by_column

#printcheck (one labelled pair of bounds per numeric column)
for col in numeric_data.columns:
    print(f"{col}:")
    print(f"Values below {lower_bounds[col]} could be an outlier")
    print(f"Values above {upper_bounds[col]} could be an outlier")

#keep the rows whose numeric values are all inside their column's bounds
within_bounds = (
    numeric_data.ge(lower_bounds, axis='columns')
    & numeric_data.le(upper_bounds, axis='columns')
).all(axis='columns')

#filter and reset the index in one step
melb_data = melb_data[within_bounds].reset_index(drop=True)

In [None]:
#Q1. What are the main influences on property prices in Melbourne

#landsize and price correlation
landsize = melb_data['Landsize']
price = melb_data['Price']
#pearsonr comes from scipy.stats; the stdlib statistics module (imported as st) has no pearsonr
correlation = pearsonr(landsize, price)
print(f"Pearson r (Landsize vs. Price): {correlation[0]:.2f}")

#create a linear regression for the above
x_values = landsize
y_values = price
landprice_slope, landprice_int, landprice_rvalue, landprice_pvalue, landprice_stderr = linregress(x_values,y_values)
regress_values = landprice_slope*x_values + landprice_int
line_eq = "y =" + str(round(landprice_slope,2)) + "x + " + str(round(landprice_int,2))

#create a scatter plot for landsize and price
fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
#anchor the equation in axes-fraction coordinates so it stays inside the plot whatever the data range is
ax.annotate(line_eq,(0.05,0.9),xycoords="axes fraction",fontsize=15,color="red")
ax.set_xlabel("Landsize")
ax.set_ylabel("Price of property")
ax.set_title("Land Size of Property vs. Price of Property")
#grid has to be called; a bare attribute reference draws nothing
ax.grid(True)
plt.show()

In [None]:
#Building Area and Price correlation
building_area = melb_data['BuildingArea']
price = melb_data['Price']
#pearsonr comes from scipy.stats; the stdlib statistics module (imported as st) has no pearsonr
correlation = pearsonr(building_area, price)
print(f"Pearson r (BuildingArea vs. Price): {correlation[0]:.2f}")

#create a linear regression for the above
x_values = building_area
y_values = price
buildprice_slope, buildprice_int, buildpricervalue, buildpricepvalue, buildpricestderr = linregress(x_values,y_values)
regress_values = x_values * buildprice_slope + buildprice_int
line_eq = "y =" + str(round(buildprice_slope,2)) + "x + " + str(round(buildprice_int,2))

#create a scatter plot
fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
#anchor the equation in axes-fraction coordinates so it stays inside the plot whatever the data range is
ax.annotate(line_eq,(0.05,0.9),xycoords="axes fraction",fontsize=15,color="red")
ax.set_xlabel("Building Area")
ax.set_ylabel("Price of property")
ax.set_title("Building Area of Property vs. Price of Property")
#grid has to be called; a bare attribute reference draws nothing
ax.grid(True)
plt.show()

In [None]:
#Rooms: correlation, regression line and scatter plot between Rooms and Price
rooms = melb_data['Rooms']
price = melb_data['Price']
#pearsonr comes from scipy.stats; the stdlib statistics module (imported as st) has no pearsonr
correlation = pearsonr(rooms, price)
print(f"Pearson r (Rooms vs. Price): {correlation[0]:.2f}")

#create a linear regression for the above
x_values = rooms
y_values = price
roomprice_slope, roomprice_int, roomprice_rvalue, roomprice_pvalue, roomprice_stderr = linregress(x_values,y_values)
regress_values = x_values * roomprice_slope + roomprice_int
line_eq = "y =" + str(round(roomprice_slope,2)) + "x + " + str(round(roomprice_int,2))

#create a scatter plot
fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
#anchor the equation in axes-fraction coordinates so it stays inside the plot whatever the data range is
ax.annotate(line_eq,(0.05,0.9),xycoords="axes fraction",fontsize=15,color="red")
ax.set_xlabel("Number of Rooms")
ax.set_ylabel("Price of property")
ax.set_title("Number of Rooms vs. Price of Property")
#grid has to be called; a bare attribute reference draws nothing
ax.grid(True)
plt.show()

In [None]:
#Number of Bedrooms vs price
bedroom = melb_data['Bedroom2']
price = melb_data['Price']
#pearsonr comes from scipy.stats; the stdlib statistics module (imported as st) has no pearsonr
correlation = pearsonr(bedroom, price)
print(f"Pearson r (Bedroom2 vs. Price): {correlation[0]:.2f}")

#create a linear regression for the above
x_values = bedroom
y_values = price
bedprice_slope, bedprice_int, bedprice_rvalue, bedprice_pvalue, bedprice_stderr = linregress(x_values,y_values)
regress_values = x_values * bedprice_slope + bedprice_int
line_eq = "y =" + str(round(bedprice_slope,2)) + "x + " + str(round(bedprice_int,2))

#create a scatter plot
fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
#anchor the equation in axes-fraction coordinates so it stays inside the plot whatever the data range is
ax.annotate(line_eq,(0.05,0.9),xycoords="axes fraction",fontsize=15,color="red")
ax.set_xlabel("Number of Bedrooms")
ax.set_ylabel("Price of property")
ax.set_title("Number of Bedrooms vs. Price of Property")
#grid has to be called; a bare attribute reference draws nothing
ax.grid(True)
plt.show()

In [None]:
#Number of Bathrooms and price
bathroom = melb_data['Bathroom']
price = melb_data['Price']
#pearsonr comes from scipy.stats; the stdlib statistics module (imported as st) has no pearsonr
correlation = pearsonr(bathroom, price)
print(f"Pearson r (Bathroom vs. Price): {correlation[0]:.2f}")

#create a linear regression for the above
x_values = bathroom
y_values = price
bathprice_slope, bathprice_int, bathprice_rvalue, bathprice_pvalue, bathprice_stderr = linregress(x_values,y_values)
regress_values = x_values * bathprice_slope + bathprice_int
line_eq = "y =" + str(round(bathprice_slope,2)) + "x + " + str(round(bathprice_int,2))

#create a scatter plot
fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
#anchor the equation in axes-fraction coordinates so it stays inside the plot whatever the data range is
ax.annotate(line_eq,(0.05,0.9),xycoords="axes fraction",fontsize=15,color="red")
ax.set_xlabel("Number of Bathrooms")
ax.set_ylabel("Price of property")
ax.set_title("Number of Bathrooms vs. Price of Property")
#grid has to be called; a bare attribute reference draws nothing
ax.grid(True)
plt.show()

In [None]:
#Number of Parking and price
#the original read the 'Bathroom' column here (copy-paste slip), which reproduced the bathroom plot
#NOTE(review): assumes the parking-spot column is named 'Car' -- confirm against the CSV header
parking = melb_data['Car']
price = melb_data['Price']
#pearsonr comes from scipy.stats; the stdlib statistics module (imported as st) has no pearsonr
correlation = pearsonr(parking, price)
print(f"Pearson r (Car vs. Price): {correlation[0]:.2f}")

#create a linear regression for the above
x_values = parking
y_values = price
parkprice_slope, parkprice_int, parkprice_rvalue, parkprice_pvalue, parkprice_stderr = linregress(x_values,y_values)
regress_values = x_values * parkprice_slope + parkprice_int
line_eq = "y =" + str(round(parkprice_slope,2)) + "x + " + str(round(parkprice_int,2))

#create a scatter plot
fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
#anchor the equation in axes-fraction coordinates so it stays inside the plot whatever the data range is
ax.annotate(line_eq,(0.05,0.9),xycoords="axes fraction",fontsize=15,color="red")
ax.set_xlabel("Number of Parking Spots")
ax.set_ylabel("Price of property")
ax.set_title("Number of Parking Spots vs. Price of Property")
#grid has to be called; a bare attribute reference draws nothing
ax.grid(True)
plt.show()

In [None]:
#Property Type and price
#Pearson correlation and a fitted regression line need a numeric x, so for a
#category label they are replaced by a per-type price summary and a box plot.
#NOTE(review): assumes 'Type' holds category labels rather than a numeric measure -- confirm against the CSV
property_type = melb_data['Type']
price = melb_data['Price']

#collect the sale prices of each property type
prices_by_type = {label: group.to_numpy() for label, group in price.groupby(property_type)}
type_price_summary = price.groupby(property_type).agg(['count', 'median', 'mean'])

#create a box plot of price for each property type
fig, ax = plt.subplots()
ax.boxplot(list(prices_by_type.values()))
#boxplot places the boxes at positions 1..n in the order given, so the labels line up
ax.set_xticklabels(list(prices_by_type.keys()))
ax.set_xlabel("Property Type")
ax.set_ylabel("Price of property")
ax.set_title("Property Type vs. Price of Property")
ax.grid(True)
plt.show()

#summary table is the last expression so the notebook displays it
type_price_summary

In [None]:
#Year Built vs price
year_built = melb_data['YearBuilt']
price = melb_data['Price']
#pearsonr comes from scipy.stats; the stdlib statistics module (imported as st) has no pearsonr
correlation = pearsonr(year_built, price)
print(f"Pearson r (YearBuilt vs. Price): {correlation[0]:.2f}")

#create a linear regression for the above
#(yearprice_scope holds the slope; the existing spelling is kept so other references still resolve)
x_values = year_built
y_values = price
yearprice_scope, yearprice_int, yearprice_rvalue, yearprice_pvalue, yearprice_stderr = linregress(x_values,y_values)
regress_values = x_values * yearprice_scope + yearprice_int
line_eq = "y =" + str(round(yearprice_scope,2)) + "x + " + str(round(yearprice_int,2))

#create a scatter plot
fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
#anchor the equation in axes-fraction coordinates so it stays inside the plot whatever the data range is
ax.annotate(line_eq,(0.05,0.9),xycoords="axes fraction",fontsize=15,color="red")
ax.set_xlabel("Year Built")
ax.set_ylabel("Price of property")
ax.set_title("Year Built vs. Price of Property")
#grid has to be called; a bare attribute reference draws nothing
ax.grid(True)
plt.show()

In [None]:
#Date of Sale vs price
#Do a bar chart of count of sales binned by month


