In [None]:
# Data Management
import pandas as pd
import numpy as np
# from pandas_datareader.data import DataReader
# from ta import  add_all_ta_features


# Statistics
from statsmodels.tsa.stattools import adfuller

# Data Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Supervised Machine Learning
from sklearn.model_selection import train_test_split as sk_learn_train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestRegressor

# Graphing
import matplotlib.pyplot as plt

## Data Ingestion


In [None]:
# Load the raw records from the CSV in the working directory
df = pd.read_csv("SydneyHousePrices.csv")

# Report the number of rows read, then display the frame as the cell output
row_count = len(df)
print(f"Length of data: {row_count}")
df

In [None]:
# Interpret the data: print the frame summary (columns, dtypes, non-null counts)
df.info()

# Feature Engineering

##### Handle Non-Numerical Data

- Label Encoding
    - assigns each distinct text value in a column an integer code that the ML algorithm can work with

- OneHot Encoding
    - takes every unique value in a column and creates one 0/1 indicator column per value, set to 1 on the rows where that value appears

In [None]:
# Distinct suburb names, in order of first appearance
suburbs_test_unique = df["suburb"].unique()

print("Unique Suburbs Length: " , len(suburbs_test_unique))
print("Preform Label Encoding")

# Label Encoding: map every suburb name to an integer code and store it
# alongside the original column
labelencoder = LabelEncoder()
encoded_suburbs = labelencoder.fit_transform(df["suburb"])
df["suburbs_encoded"] = encoded_suburbs

# Distinct codes, also in order of first appearance, so position i here
# corresponds to position i of suburbs_test_unique
all_codes = pd.unique(df["suburbs_encoded"])

# Preview up to the first ten (suburb, code) pairs
for suburb_name, suburb_code in zip(suburbs_test_unique[:10], all_codes[:10]):
    print(suburb_name , suburb_code)
df

In [None]:
# Property Type
# Distinct property types, kept under their own name so the suburb list
# computed earlier is not overwritten with unrelated values.
prop_types_unique = df["propType"].unique()

print("Unique Property Types Length: " , len(prop_types_unique))
print("Preform OneHot Encoding")


# OneHot Encoding: one indicator column per property type, prefixed "pt_".
# drop_first=True omits the first category, which becomes the implicit baseline.
onehot_encoded = pd.get_dummies(df["propType"], prefix="pt", drop_first=True)

# Join only the indicator columns that df does not already have. On a first run
# that is all of them; on a re-run nothing is added, so the cell no longer fails
# with "columns overlap" and existing columns keep their positions.
missing_dummies = [col for col in onehot_encoded.columns if col not in df.columns]
df = df.join(onehot_encoded[missing_dummies])

df

## Set Target
- Targets are what the computer should predict

In [None]:
# Set the target: copy sellPrice into a dedicated TARGET column.
# When TARGET does not exist yet, pandas appends it as the last column of df.
df["TARGET"] = df["sellPrice"]

df

## Remove Redundant Columns (features)

In [None]:
# Build the modelling frame from df without the columns that are not used as
# features; drop() returns a new frame, so df itself is left untouched
redundant_columns = ["Date" , "Id" , "suburb", "propType" , "sellPrice"]
clean_df = df.drop(columns=redundant_columns)

clean_df

# Check for NaN or Inf Values

In [None]:
# Check for Null & Inf: build one boolean mask per condition, then reduce each
# mask to a single flag that is True if any cell matches
null_mask = clean_df.isna()
inf_mask = clean_df.isin([np.inf, -np.inf])

is_null = null_mask.to_numpy().any()
is_inf = inf_mask.to_numpy().any()

print(f"{is_null =  }")
print(f"{is_inf  =  }")

In [None]:
# Fill NaN variables: replace each missing value with the mean of its column
column_means = clean_df.mean()
clean_df = clean_df.fillna(column_means)

# Re-check whether any missing values remain
clean_df.isna().to_numpy().any()


# Min Max Scaling
- In other words, feature scaling: each column is rescaled to the range [0, 1]

In [None]:
# Rescale every column of the cleaned frame with a min-max scaler and rebuild
# a frame that keeps the original column labels and row index
mms = MinMaxScaler()
scaled_values = mms.fit_transform(clean_df)
df_scaling = pd.DataFrame(scaled_values, columns=clean_df.columns, index=clean_df.index)

df_scaling.head()


# Train Test Split

In [None]:
# Choose the frame that feeds the train/test split: the min-max scaled copy
# when preparing data for deep learning, otherwise the unscaled cleaned frame
is_deep_learning = False
if is_deep_learning:
    df_tts = df_scaling.copy()
else:
    df_tts = clean_df.copy()

df_tts

# X is the feature data the ML algorithm learns from
# y is the value we are trying to predict

In [None]:
# Split X & Y data
# Select the target by name rather than by position: the split stays correct
# even if TARGET is not the last column of df_tts, and a missing TARGET column
# raises a KeyError instead of silently using some other column as the label.
TARGET_COLUMN = "TARGET"
X_data = df_tts.drop(columns=TARGET_COLUMN).values
y_data = df_tts[TARGET_COLUMN].values

print(f"X first value: \n{X_data[0]}")
print(f"y first value: \n{y_data[0]}")

In [None]:
# Train Test Split: shuffle the rows and hold out 10% of them for testing
x_train, x_test, y_train, y_test = sk_learn_train_test_split(
    X_data, y_data, test_size=0.1, shuffle=True, random_state=1
)

# Show the shape of each resulting array
for label, split in (
    ("X Train", x_train),
    ("X Test", x_test),
    ("y Train", y_train),
    ("y Test", y_test),
):
    print(f"{label} Shape: {split.shape}")

# MACHINE LEARNING
- HYPER PARAMETER TUNING DOCUMENTATION
    - https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html


In [None]:
# Build the regressor (not fitted yet): a random forest of 100 trees, each
# limited to depth 10, with a fixed random_state so runs are repeatable
regressor = RandomForestRegressor(n_estimators=100 , max_depth=10, random_state=0)

In [None]:
# Train the model: fit the random forest on the training split
regressor.fit(x_train, y_train)

In [None]:
# Prediction: predict the target for the held-out features and round each
# prediction to a whole number. np.round works on the whole array at once and
# keeps y_prediction an ndarray, the same type as y_test, instead of turning
# it into a Python list of NumPy scalars.
y_prediction = np.round(regressor.predict(x_test))

# Show the first few predictions next to the corresponding true values
print("y_prediction:   ", y_prediction[:5])
print("y_test: \t", y_test[:5])

In [32]:
# Check accuracy with repeated k-fold cross-validation on the training split:
# 5 folds, repeated 3 times, with a fixed seed for the fold assignment
CrossValidation = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

# Score each fold by negated mean absolute error, using all available cores;
# error_score="raise" makes a failing fit raise instead of being recorded
cv_options = {
    "scoring": "neg_mean_absolute_error",
    "cv": CrossValidation,
    "n_jobs": -1,
    "error_score": "raise",
}
n_scores = cross_val_score(regressor, x_train, y_train, **cv_options)

In [33]:
# Report performance: the fold scores are negated MAE values, so print the
# magnitude of their mean along with their standard deviation across folds
mae_mean = abs(n_scores.mean())
mae_spread = abs(n_scores.std())

print("ML Avg: " , mae_mean)
print("ML Std: " , mae_spread)

ML Avg:  389118.30686148786
ML Std:  20152.482105475094
