In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the cleaned data set in one chained expression:
#   1. read the CSV,
#   2. convert the `date` column to datetime,
#   3. discard the 'Unnamed: 0' column (presumably a row index written by an
#      earlier `to_csv` call -- TODO confirm against the cleaning notebook).
df = (
    pd.read_csv('/workspaces/First-data-project/Kaggle Project/Data/Cleaned Data/df_cleaned.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .drop(columns=['Unnamed: 0'])
)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,grade,sqft_above,sqft_basement,...,sqft_lot15,date,yr_renovated,zipcode,lat,long,view,waterfront,sale_month,decade_built
0,993000.0,4,2.0,2850,14810,2.0,5,8,2490,360,...,10454,2014-09-02,0,98004,47.5892,-122.203,0,0,9,1950
1,991700.0,4,3.0,2290,2350,2.0,3,9,1610,680,...,3820,2014-07-23,2011,98116,47.574,-122.415,1,0,7,1920
2,990400.0,3,2.5,2100,4097,2.0,3,9,2100,0,...,4764,2015-02-19,0,98004,47.5983,-122.2,0,0,2,2000
3,990000.0,4,2.5,2430,6325,2.0,4,8,2020,410,...,4375,2014-08-18,0,98109,47.6413,-122.354,0,0,8,1910
4,990000.0,3,2.5,2160,6000,1.5,4,8,1880,280,...,6000,2014-07-17,0,98105,47.6582,-122.28,0,0,7,1930


**Correlation Values**

- Analysis of which features correlate most strongly with price.
- Values are sorted by absolute correlation, so the ranking shows how closely each feature tracks price regardless of whether the relationship is positive or negative.

In [2]:
# Pairwise correlation between every column of `df`, using DataFrame.corr()
# with its default settings.
corr_matrix = df.corr()

# Take the 'price' column of the matrix, drop the sign, and rank the features
# from the strongest to the weakest relationship with price.
price_correlations = (
    corr_matrix.loc[:, 'price']
    .abs()
    .sort_values(ascending=False)
)
print(price_correlations)

price            1.000000
grade            0.546066
sqft_living      0.521301
lat              0.483746
sqft_living15    0.456696
sqft_above       0.406218
bathrooms        0.357516
floors           0.236486
bedrooms         0.235952
sqft_basement    0.217849
view             0.206887
sqft_lot15       0.096094
yr_renovated     0.095674
sqft_lot         0.083716
condition        0.060390
waterfront       0.043847
yr_built         0.030336
decade_built     0.028336
zipcode          0.026114
sale_month       0.015259
long             0.008057
date             0.003798
Name: price, dtype: float64


**Top Correlations**

Features whose absolute correlation with price is greater than 0.2 are kept and will be used for training the model. `price` itself (correlation 1.0) stays in the list because it is the target column.

In [3]:
# Keep only the entries whose absolute correlation with price is above 0.2.
# `price` itself passes the filter (its self-correlation is 1.0).
above_threshold = price_correlations > 0.2
top_corr = price_correlations.loc[above_threshold]
top_corr

price            1.000000
grade            0.546066
sqft_living      0.521301
lat              0.483746
sqft_living15    0.456696
sqft_above       0.406218
bathrooms        0.357516
floors           0.236486
bedrooms         0.235952
sqft_basement    0.217849
view             0.206887
Name: price, dtype: float64

**Save the selected features as a data frame**

- The selected features are saved in a new data frame, which will be used for training the predictive model.

In [5]:
# The labels of `top_corr` are the names of the columns to keep.
selected_columns = top_corr.index

# Restrict the full frame to those columns (all rows are retained).
df_selected = df.loc[:, selected_columns]

# Persist the reduced frame. The row index is written as well, because
# to_csv is called without disabling it.
df_selected.to_csv(
    '/workspaces/First-data-project/Kaggle Project/Data/Cleaned Data/df_selected.csv'
)



**Split off validation data to test the model on unseen data**

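- The code below holds back part of the data so the model can be tested on rows it never saw during training. This gives a more accurate measure of how well the model works with new data.
- Caveat: the features above were selected using correlations computed on the full data set, which includes these validation rows, so the validation score may be slightly optimistic.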

In [6]:
# Split 10% of the data off for validation.
# `train_test_split` is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
# The fixed random_state makes the split reproducible between runs.
train_set, val_set = train_test_split(df_selected, test_size=0.1, random_state=42)

# Sanity check: the two parts together must account for every row.
assert len(train_set) + len(val_set) == len(df_selected)

# Persist both parts. The row index is written too (to_csv is called without
# disabling it), so each saved row keeps its original row label.
train_set.to_csv('/workspaces/First-data-project/Kaggle Project/Data/Cleaned Data/train_set.csv')
val_set.to_csv('/workspaces/First-data-project/Kaggle Project/Data/Cleaned Data/val_set.csv')


