# Predicting Diamond Prices
## Phase 1: Data Preparation & Visualisation

In [9]:
import warnings
import numpy as np
import pandas as pd
import io
import requests

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None
# Silence all Python warnings for the rest of the session.
warnings.simplefilter("ignore")

###
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
%config InlineBackend.figure_format = 'retina'
plt.style.use("seaborn")
###

# Sample report: https://github.com/akmand/stats_tutorials/blob/main/Phase1_Report_Sample.ipynb

# Download `diamonds_encoded_scaled_5000.csv` from the project's GitHub repo
# and parse it into a DataFrame.
df1_name = 'diamonds_encoded_scaled_5000.csv'
df1_url = 'https://raw.githubusercontent.com/Jobi060704/math_files/main/' + df1_name
# TLS certificate verification is left at the requests default (enabled), and
# the timeout keeps a stalled connection from hanging the notebook.
df1_response = requests.get(df1_url, timeout=30)
# Raise on a 4xx/5xx response rather than parsing an error page as CSV.
df1_response.raise_for_status()
url1_content = df1_response.content
NaN_df = pd.read_csv(io.StringIO(url1_content.decode('utf-8')))

In [2]:
# name of the dataset to be imported from our GitHub account
df_name = 'diamonds.csv'
df_url = 'https://raw.githubusercontent.com/Jobi060704/math_files/main/' + df_name
# TLS certificate verification is left at the requests default (enabled), and
# the timeout keeps a stalled connection from hanging the notebook.
df_response = requests.get(df_url, timeout=30)
# Raise on a 4xx/5xx response rather than parsing an error page as CSV.
df_response.raise_for_status()
url_content = df_response.content
diamond_df = pd.read_csv(io.StringIO(url_content.decode('utf-8')))

In [3]:
# Discard the first column of the raw frame (selected by position, not by name),
# then preview ten reproducibly sampled rows.
new_diamond_df = diamond_df.drop(labels=diamond_df.columns[:1], axis="columns")
new_diamond_df.sample(n=10, random_state=99)

Unnamed: 0,carat,depth,table,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,price
3581,0.110187,0.525,0.230769,0.542831,0.097963,0.112893,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2919
3035,0.04158,0.552778,0.230769,0.439479,0.080306,0.093396,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,912
2455,0.106029,0.530556,0.211538,0.53352,0.097793,0.112264,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2365
3014,0.272349,0.497222,0.346154,0.681564,0.123939,0.139937,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,11666
3114,0.114345,0.466667,0.326923,0.554935,0.100509,0.111635,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1881
2673,0.201663,0.461111,0.365385,0.640596,0.115789,0.128302,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,6336
767,0.216216,0.469444,0.326923,0.651769,0.118506,0.131761,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,10388
2919,0.033264,0.527778,0.211538,0.426443,0.078268,0.089623,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,587
3575,0.112266,0.427778,0.326923,0.561453,0.102886,0.111006,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4336
1253,0.068607,0.519444,0.25,0.486965,0.087436,0.100629,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1753


### Dataset Features

The features in our dataset are described in the table below. These descriptions are taken from the Kaggle data source.

In [4]:
from tabulate import tabulate

# Multi-line cell text is assembled from adjacent string literals. Using a
# backslash continuation inside a single literal would pull the source
# indentation into the string ahead of every line break, and tabulate measures
# each line of a multi-line cell when sizing the column.
cut_description = (
    'Quality of the cut:\n'
    '- Fair\n'
    '- Good\n'
    '- Very Good\n'
    '- Premium\n'
    '- Ideal'
)
clarity_description = (
    'A measurement of the clarity of a diamond:\n'
    '- I1 (worst)\n'
    '- SI2\n'
    '- SI1\n'
    '- VS2\n'
    '- VS1\n'
    '- VVS2\n'
    '- VVS1\n'
    '- IF (best)'
)

# The first row is the header (consumed by headers='firstrow' below);
# each following row describes one feature of the dataset.
table = [['Name','Data Type','Units','Description'],
        ['Carat','Numeric','Carat','Weight of the diamond'],
        ['Cut','Ordinal Categorical','NA',cut_description],
        ['Color','Ordinal Categorical','NA','Diamond colour, from J (worst) to D (best)'],
        ['Clarity','Ordinal Categorical','NA',clarity_description],
        ['Depth','Numeric','Percent','Total depth percentage = z / mean(x, y) = 2 * z / (x + y) '],
        ['Table','Numeric','Percent','Width of top of diamond relative to widest point '],
        ['Price','Numeric','USD','The price of the diamond'],
        ['X','Numeric','Millimeters','Length of a diamond'],
        ['Y','Numeric','Millimeters','Width of a diamond'],
        ['Z','Numeric','Millimeters','Depth of a diamond']]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

╒═════════╤═════════════════════╤═════════════╤═══════════════════════════════════════════════════════════╕
│ Name    │ Data Type           │ Units       │ Description                                               │
╞═════════╪═════════════════════╪═════════════╪═══════════════════════════════════════════════════════════╡
│ Carat   │ Numeric             │ Carat       │ Weight of the diamond                                     │
├─────────┼─────────────────────┼─────────────┼───────────────────────────────────────────────────────────┤
│ Cut     │ Ordinal Categorical │ NA          │ Quality of the cut:                                       │
│         │                     │             │ - Fair                                                    │
│         │                     │             │ - Good                                                    │
│         │                     │             │ - Very Good                                               │
│         │                 

### Target Feature

For this project, the target feature in this dataset will be the diamond price in US dollars. That is, the price of diamonds will be predicted based on the explanatory/ descriptive variables. 

## Goals and Objectives

Diamond prices are determined by a very complex calculation system, so a model that can accurately predict or set diamond prices is a real necessity. For instance, a jeweler would need such a model to correctly state the properties of the diamonds after the work has been done. Similarly, a store selling diamonds could use such a model to determine the price at which to sell a diamond.

Thus, the main objective of this project is two-fold: (1) to predict the price of diamonds based on their publicly available properties, and (2) to identify which features seem to be the best predictors of the diamond sale price. A secondary objective is to perform some exploratory data analysis, using basic descriptive statistics and data visualisation plots, to gain some insight into the patterns and relationships existing in the data after some data cleaning and preprocessing, which is the subject of this Phase 1 report.

At this point, we make the important assumption that rows in our dataset are not correlated. That is, we assume that diamond prices are independent of one another in this dataset. Of course, this is not a very realistic assumption; however, it allows us to circumvent the time series aspects of the underlying dynamics of diamond prices and to resort to rather classical predictive models such as multiple linear regression.

## Data Cleaning and Preprocessing

In this section, we describe the data cleaning and preprocessing steps undertaken for this project.

### Data Cleaning Steps

*   Drop irrelevant features in our dataset
*   Check and rename/ modify some column names
*   Check for missing values
*   Remove all the rows with missing values 
*   Random sampling of the dataset for 5000 rows

#### Taking care of outliers and unusual observations

The numeric features are measurements of a real physical object (a diamond), so their observed values cannot be zero or negative. Likewise, the table and depth features must be greater than zero, since each is a percentage formed from two positive quantities.

The table below shows a sample of the observations that fulfill these conditions. All 20 of the problematic observations have been dropped from the dataset.

In [5]:
# Keep only the rows in which every listed measurement is strictly positive,
# then preview ten reproducibly sampled rows of the result.
positive_columns = ['Carat', 'Depth', 'Table', 'X', 'Y', 'Z']
all_positive = (new_diamond_df[positive_columns] > 0).all(axis=1)
no_outlier_df = new_diamond_df[all_positive]
no_outlier_df.sample(n=10, random_state=99)

Unnamed: 0,Carat,Cut,Color,Clarity,Depth,Table,Price,X,Y,Z
49849,0.56,Ideal,D,VS1,62.6,59.0,2170,5.24,5.31,3.3
9905,1.11,Premium,H,VS1,62.3,57.0,4687,6.67,6.62,4.14
47696,0.52,Ideal,E,VS1,60.0,56.0,1883,5.25,5.29,3.16
11721,0.34,Ideal,E,SI1,61.8,56.0,596,4.45,4.48,2.76
22755,2.01,Fair,H,SI2,66.7,56.0,10772,7.8,7.76,5.19
29220,0.34,Ideal,I,IF,61.3,55.0,695,4.52,4.55,2.78
53886,0.7,Good,D,VS2,58.0,62.0,2749,5.78,5.87,3.38
6939,1.0,Fair,E,SI1,66.3,62.0,4140,6.27,6.06,4.09
48868,0.78,Very Good,J,SI1,59.4,62.0,2035,6.01,6.05,3.58
10061,0.35,Good,F,SI2,63.8,56.0,591,4.48,4.45,2.85


For categorical features, an outlier is defined as a value that doesn't fit the previously defined levels of a feature. There are no such problematic observations, so no changes to the dataset are required.

## Data Exploration and Visualisation

Our dataset is now considered to be clean, and we are ready to start visualising and exploring each of the features.

> **TODO:** The data exploration and visualisation content has not been written yet.

## Summary and Conclusions

> **TODO:** The summary and conclusions have not been written yet.

## References

- Agrawal, S.. Diamonds (Kaggle). Retrieved September 26, 2022 from https://www.kaggle.com/datasets/shivam2503/diamonds