# Kadin McWilliams
# Version 1
# Spring 2024
# Using NumPy to analyze the housing data

In [1]:
# Loading the package
import numpy as np

### Loading the housing dataset from CSV file

In [3]:
# Read the housing CSV into a 2-D float array.
# skip_header is a line count, so pass 1 explicitly to drop the single header row.
data = np.genfromtxt("Housing_new.csv", delimiter=",", skip_header=1)

In [4]:
# Display the loaded array; NumPy abbreviates the middle rows with "..." in its repr.
data

array([[1.330e+07, 7.420e+03, 4.000e+00, 2.000e+00, 2.000e+00],
       [1.225e+07, 8.960e+03, 4.000e+00, 4.000e+00, 3.000e+00],
       [1.225e+07, 9.960e+03, 3.000e+00, 2.000e+00, 2.000e+00],
       ...,
       [1.750e+06, 3.620e+03, 2.000e+00, 1.000e+00, 0.000e+00],
       [1.750e+06, 2.910e+03, 3.000e+00, 1.000e+00, 0.000e+00],
       [1.750e+06, 3.850e+03, 3.000e+00, 1.000e+00, 0.000e+00]])

### Extracting relevant columns (e.g., price, area, ...)

In [12]:
# Slice each feature out of the 2-D array as its own 1-D column.
price = data[:, 0]
area = data[:, 1]
bedrooms = data[:, 2]
bathrooms = data[:, 3]
parking = data[:, -1]  # taken from the last column

# Preview the first five values of every column, in extraction order.
print(f"price: {price[:5]}")
print(f"area: {area[:5]}")
print(f"bedrooms: {bedrooms[:5]}")
print(f"bathrooms: {bathrooms[:5]}")
print(f"parking: {parking[:5]}")

price: [13300000. 12250000. 12250000. 12215000. 11410000.]
area: [7420. 8960. 9960. 7500. 7420.]
bedrooms: [4. 4. 3. 4. 4.]
bathrooms: [2. 4. 2. 2. 1.]
parking: [2. 3. 2. 3. 2.]


### Basic Descriptive statistics
* Mean, Median, SD, Min, Max

In [21]:
# Descriptive statistics for the price column.
# Mean and standard deviation are rounded to whole numbers; the rest are shown as-is.
price_mean = round(np.mean(price))
price_median = np.median(price)
price_std = round(np.std(price))  # np.std defaults to the population form (ddof=0)
price_min = np.min(price)
price_max = np.max(price)

print("---Basic Descriptive Statistics on Price---")
print(f"Mean: {price_mean}")
print(f"Median: {price_median}")
print(f"Standard Deviation: {price_std}")
print(f"Min: {price_min}")
print(f"Max: {price_max}")

---Basic Descriptive Statistics on Price---
Mean: 4766729
Median: 4340000.0
Standard Deviation: 1868723
Min: 1750000.0
Max: 13300000.0


In [22]:
# Descriptive statistics for the area column, emitted with a single print call.
# Mean and standard deviation are rounded to whole numbers; the rest are shown as-is.
print(
    "---Basic Descriptive Statistics on Area---",
    f"Mean: {round(np.mean(area))}",
    f"Median: {np.median(area)}",
    f"Standard Deviation: {round(np.std(area))}",
    f"Min: {np.min(area)}",
    f"Max: {np.max(area)}",
    sep="\n",
)

---Basic Descriptive Statistics on Area---
Mean: 5151
Median: 4600.0
Standard Deviation: 2168
Min: 1650.0
Max: 16200.0


### Data filtering

In [24]:
# Baseline: number of rows in the unfiltered dataset.
total_rows = len(data)
print(f"Number of rows before filtering: {total_rows}")

Number of rows before filtering: 545


In [25]:
# Count houses with more than 3 bedrooms AND more than 2 bathrooms.
# Build the boolean row mask first, then use it to select the matching rows.
is_large_house = np.logical_and(bedrooms > 3, bathrooms > 2)
filtered_data = data[is_large_house]
print(f"Number of rows after filtering: {len(filtered_data)}")

Number of rows after filtering: 7


### How many entries in the dataset have
* an area of more than 8000 square feet, and
* at least 3 bedrooms, and
* at least 2 parking spaces

In [26]:
# Rows must satisfy all three criteria: area > 8000, bedrooms >= 3, parking >= 2.
meets_criteria = (area > 8000) & (bedrooms >= 3) & (parking >= 2)
filtered_data1 = data[meets_criteria]
print(f"Number of rows after filtering: {len(filtered_data1)}")

Number of rows after filtering: 25


### Number of unique categories
* `np.unique` works well for discrete variables

In [30]:
# Distinct bedroom counts; compute the sorted unique values once and reuse them.
unique_bedrooms = np.unique(bedrooms)
print(f"Unique bedroom counts: {unique_bedrooms}")
print(f" How many unique bedrooms: {len(unique_bedrooms)}")

Unique bedroom counts: [1. 2. 3. 4. 5. 6.]
 How many unique bedrooms: 6


In [31]:
# Distinct bathroom counts; compute the sorted unique values once and reuse them.
unique_bathrooms = np.unique(bathrooms)
print(f"Unique bathrooms counts: {unique_bathrooms}")
print(f" How many unique bathrooms: {len(unique_bathrooms)}")

Unique bathrooms counts: [1. 2. 3. 4.]
 How many unique bathrooms: 4


In [32]:
# Distinct parking-space counts; compute the sorted unique values once and reuse them.
unique_parking = np.unique(parking)
print(f"Unique parking space counts: {unique_parking}")
print(f" How many unique parking: {len(unique_parking)}")

Unique parking space counts: [0. 1. 2. 3.]
 How many unique parking: 4


### Correlation 

In [35]:
# np.corrcoef's rowvar flag: True (the default) treats each row as a variable,
# False treats each column as a variable.
# The result is a 2x2 correlation matrix; its off-diagonal entries are the
# price/area correlation coefficient.
corr_house = np.corrcoef(x=price, y=area, rowvar=False)
print(f"Corr price and area {corr_house}")

Corr price and area [[1.         0.53599735]
 [0.53599735 1.        ]]


###  Correlation Analysis
- Price and area: 54% (strong positive)
- Price and bedrooms: 37% (weak positive) 
- Price and bathrooms: 52% (strong positive)
- Price and parking: 38% (weak positive)