In [12]:
import pandas as pd

# Load the house-sales dataset; the path is resolved relative to the
# notebook's working directory.
df = pd.read_csv('kc_house_data.csv')

print(df.columns.tolist())

# `drop` returns a new frame, so the raw `df` stays available unchanged.
# 'id', 'date' and 'zipcode' are excluded from every statistic below.
df_cleaned = df.drop(columns=['id', 'date', 'zipcode'])

print("\n" + "="*70)
print("Part 1: AVERAGE, MIN, MAX, AND VARIANCE FOR EACH FEATURE")
print("\n" + "="*70)

# Calculate statistics for all of the features.
# numeric_only=True restricts every aggregate to the same set of numeric
# columns, so a stray text column in the CSV cannot raise a TypeError or
# leave the four result Series with different lengths.
stats = pd.DataFrame({
    'Mean': df_cleaned.mean(numeric_only=True),
    'Min': df_cleaned.min(numeric_only=True),
    'Max': df_cleaned.max(numeric_only=True),
    'Variance': df_cleaned.var(numeric_only=True),
})
# The aggregates are indexed by column name; mirror that index into a leading
# 'Feature' column so the table can be printed with index=False while `.loc`
# lookups by feature name keep working.
stats.insert(0, 'Feature', stats.index)

# Global pandas display option: floats are rendered with two decimals from
# here on (until another cell overrides the option).
pd.options.display.float_format = '{:.2f}'.format
print("\nFeature Statistics table")
print(stats.to_string(index=False))

['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']

Part 1: AVERAGE, MIN, MAX, AND VARIANCE FOR EACH FEATURE


Feature Statistics table
      Feature      Mean      Min        Max        Variance
        price 540088.14 75000.00 7700000.00 134782378397.25
     bedrooms      3.37     0.00      33.00            0.87
    bathrooms      2.11     0.00       8.00            0.59
  sqft_living   2079.90   290.00   13540.00       843533.68
     sqft_lot  15106.97   520.00 1651359.00   1715658774.18
       floors      1.49     1.00       3.50            0.29
   waterfront      0.01     0.00       1.00            0.01
         view      0.23     0.00       4.00            0.59
    condition      3.41     1.00       5.00            0.42
        grade      7.66     1.00      13.00            1.38
   sqft_a

In [None]:
# Report which features have the extreme mean and variance values in `stats`
# (the per-feature summary table with 'Feature', 'Mean' and 'Variance' columns).

# Global pandas display option: DataFrames printed from here on show floats
# with four decimals.
pd.options.display.float_format = '{:.4f}'.format

# idxmin/idxmax return index labels of `stats`; they are fed back into .loc below.
lowest_avg_idx = stats['Mean'].idxmin()
highest_avg_idx = stats['Mean'].idxmax()

print("\n" + "-"*70)
print("ANSWERS FOR PART 1 QUESTIONS")
print("-"*70)

print("\n1. Feature with the LOWEST average:")
print(f"    Feature: {stats.loc[lowest_avg_idx, 'Feature']}")
print(f"    Average value: {stats.loc[lowest_avg_idx, 'Mean']:.4f}")

print("\n2. Feature with the HIGHEST average:")
print(f"    Feature: {stats.loc[highest_avg_idx, 'Feature']}")
print(f"    Average value: {stats.loc[highest_avg_idx, 'Mean']:.4f}")

lowest_variance_idx = stats['Variance'].idxmin()
highest_variance_idx = stats['Variance'].idxmax()

print("\n3. Feature with the LOWEST variance")
print(f"    Feature: {stats.loc[lowest_variance_idx, 'Feature']}")
# Scientific notation on purpose: the smallest variance would otherwise be
# mostly leading zeros in fixed-point form.
# The outer quotes must differ from the quotes used inside the braces; reusing
# the same quote character is a SyntaxError on Python versions before 3.12.
print(f"    Variance: {stats.loc[lowest_variance_idx, 'Variance']:.4e}")

print("\n4. Feature with the HIGHEST variance")
print(f"    Feature: {stats.loc[highest_variance_idx, 'Feature']}")
print(f"    Variance: {stats.loc[highest_variance_idx, 'Variance']:.4f}")


----------------------------------------------------------------------
ANSWERS FOR PART 1 QUESTIONS
----------------------------------------------------------------------

1. Feature with the LOWEST average:
    Feature: long
    Average value: -122.2139

2. Feature with the HIGHEST average:
    Feature: price
    Average value: 540088.1418

3. Feature with the LOWEST variance
    Feature: waterfront
    Variance: 7.4852e-03

4. Feature with the HIGHEST variance
    Feature: price
    Variance: 134782378397.2469


In [None]:
import numpy as np

# Parts 2 and 3: correlation of every feature in `df_cleaned` with the
# response column, followed by the positive / negative breakdown.
print("\n"+"="*70, "PART 2: CORREALATION WITH RESPONSE (PRICE)", "="*70, sep="\n")

# Separate the response column from the candidate feature columns.
response = 'price'
features = [name for name in df_cleaned if name != response]

# One record per feature: its correlation with the response, computed with
# Series.corr and its default arguments.
correlations = [
    {
        'Feature': name,
        'Correlation': df_cleaned[name].corr(df_cleaned[response]),
    }
    for name in features
]

corr_df = pd.DataFrame(correlations)

# Strongest positive correlation first.
corr_df_sorted = corr_df.sort_values(by='Correlation', ascending=False)

print("\nCorrelation Coeficcient of Each Feature with Price")
print(corr_df_sorted.to_string(index=False))

print("\n" + "-"*70, "ANSWERS TO PART 2 QUESTIONS:", "-"*70, sep="\n")

# Strictly positive correlations only, ordered from strongest to weakest.
is_positive = corr_df['Correlation'] > 0
positive_corr = corr_df[is_positive].sort_values(by='Correlation', ascending=False)

print(f"\n1. Features with POSITIVE correlation (total: {len(positive_corr)}):")
for name, value in zip(positive_corr['Feature'], positive_corr['Correlation']):
    print(f"   - {name}: {value:.4f}")

# Row label of the single largest correlation in the unsorted table.
highest_corr_idx = corr_df['Correlation'].idxmax()
best_row = corr_df.loc[highest_corr_idx]

print("\n2. Feature with HIGHEST positive correlation:")
print(f"   Feature: {best_row['Feature']}")
print(f"   Correlation: {best_row['Correlation']:.4f}")

print("\n" + "="*70, "PART 3: NEGATIVE CORRELATION WITH RESPONSE", "="*70, sep="\n")

# Strictly negative correlations, most negative first.
is_negative = corr_df['Correlation'] < 0
negative_corr = corr_df[is_negative].sort_values(by='Correlation')

if len(negative_corr) == 0:
    print("\nNO - No features with negative correlation were found.")
    print("All features have positive correlation with price.")
else:
    print(f"\nYES - Found {len(negative_corr)} feature(s) with negative correlation:")
    for name, value in zip(negative_corr['Feature'], negative_corr['Correlation']):
        print(f"   - {name}: {value:.4f}")


PART 2: CORREALATION WITH RESPONSE (PRICE)

Correlation Coeficcient of Each Feature with Price
      Feature  Correlation
  sqft_living       0.7020
        grade       0.6674
   sqft_above       0.6056
sqft_living15       0.5854
    bathrooms       0.5251
         view       0.3973
sqft_basement       0.3238
     bedrooms       0.3083
          lat       0.3070
   waterfront       0.2664
       floors       0.2568
 yr_renovated       0.1264
     sqft_lot       0.0897
   sqft_lot15       0.0824
     yr_built       0.0540
    condition       0.0364
         long       0.0216

----------------------------------------------------------------------
ANSWERS TO PART 2 QUESTIONS:
----------------------------------------------------------------------

1. Features with POSITIVE correlation (total: 17):
   - sqft_living: 0.7020
   - grade: 0.6674
   - sqft_above: 0.6056
   - sqft_living15: 0.5854
   - bathrooms: 0.5251
   - view: 0.3973
   - sqft_basement: 0.3238
   - bedrooms: 0.3083
   - lat: