## Import data


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import great_circle

In [None]:
if not os.path.exists('usa_housing.csv') or not os.path.exists('ny_housing.csv'):
    # !wget "https://drive.usercontent.google.com/download?id=1fqh567slCa7vPhHGIQ1g7szImts_71mx&export=download&authuser=0&confirm=t&uuid=ba0d436c-a9f8-4c30-9571-54437683395b&at=APZUnTWagNdHJ7SK1BxfNQpSmOpr:1700065030553" -O 'usa_housing.csv'
    !wget "https://drive.usercontent.google.com/download?id=1nUBrPk8KXWKNMpNaefK4zb5M-gTERsnM&export=download&authuser=0&confirm=t&uuid=3d809786-7414-4548-af2b-c93a9a24f96c&at=APZUnTXoshoXpG5egZCmjGFej0qo:1701765700250" -O 'ny_housing.csv'

# df_usa = pd.read_csv('usa_housing.csv')
df_ny_og = pd.read_csv('ny_housing.csv')

In [None]:
# Work on a copy so the frame loaded from the CSV (df_ny_og) stays untouched.
df_ny_filtered = df_ny_og.copy()

## Clean up rows and columns, and convert units





In [None]:
# Columns excluded from the analysis.
columns_to_remove = [
    'property_url', 'property_id', 'address', 'apartment', 'street_name',
    'postcode', 'broker_id', 'year_build', 'total_num_units', 'agent_name',
    'agent_phone', 'agency_name', 'RunDate', 'is_owned_by_zillow',
    'listing_age', 'property_type', 'property_status', 'state', 'city',
    'price_per_unit',
]

# Drop those columns, discard every row that still contains a missing value,
# then renumber the index 0..n-1.
df_ny_filtered = (
    df_ny_filtered
    .drop(columns=columns_to_remove)
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in print() (which added a stray "None").
df_ny_filtered.info()

# Express every lot size in square feet: rows recorded in acres are scaled by
# 43,560 (square feet per acre). A boolean mask with .loc writes to the frame
# itself in one vectorized step; the previous element-wise
# `df['land_space'][i] *= ...` was chained assignment, which pandas warns about
# and which leaves the frame unchanged when copy-on-write is enabled.
SQFT_PER_ACRE = 43560
in_acres = df_ny_filtered['land_space_unit'] == "acres"
df_ny_filtered.loc[in_acres, 'land_space'] *= SQFT_PER_ACRE

# The unit column carries no information once everything is in square feet.
df_ny_filtered.drop(['land_space_unit'],inplace=True,axis=1)
df_ny_filtered = df_ny_filtered.reset_index(drop=True) #fixes the indexes

df_ny_filtered.info()

# lat and long conversion

# Reference point used to turn each listing's latitude/longitude into a single
# distance feature. Given as a (latitude, longitude) pair, the same order in
# which calculate_distance builds the property location.
nyc_center = (40.7831, -73.9712)

def calculate_distance(row):
    """Return the great-circle distance, in miles, from a listing to ``nyc_center``.

    ``row`` must provide ``'latitude'`` and ``'longitude'`` entries, e.g. one
    DataFrame row handed over by ``DataFrame.apply(..., axis=1)``.
    """
    lat, lon = row['latitude'], row['longitude']
    miles_from_center = great_circle((lat, lon), nyc_center).miles
    return miles_from_center

# Derive one distance feature per listing (miles from nyc_center) ...
df_ny_filtered['distance_to_nyc'] = df_ny_filtered.apply(calculate_distance, axis=1)

# ... after which the raw coordinates are no longer needed.
columns_to_remove = ['latitude','longitude']
df_ny_filtered = (
    df_ny_filtered
    .drop(columns=columns_to_remove)
    .reset_index(drop=True)
)

In [None]:
# Independent copy of the cleaned frame; the cells below read and reassign df_ny.
df_ny = df_ny_filtered.copy()

## Visualize Data

In [None]:
# Scatter every candidate feature against price, one figure per feature.
features = [
    'bedroom_number',
    'bathroom_number',
    'living_space',
    'land_space',
    'distance_to_nyc',
]
sns.set_style('whitegrid')

for feature in features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=df_ny, x=feature, y='price', ax=ax)
    ax.set_title(f'Price vs {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Price')
    plt.show()

## IQR function

In [None]:
def remove_outliers(df, feature, lq, uq, multiplier=1.5):
    """Keep only the rows whose ``feature`` value lies inside the IQR fences.

    The fences are ``quantile(lq) - multiplier * spread`` and
    ``quantile(uq) + multiplier * spread``, where ``spread`` is the difference
    between the two quantiles. Both fences are inclusive. The input frame is
    not modified; the surviving rows keep their original index labels.
    """
    column = df[feature]
    low_q, high_q = column.quantile(lq), column.quantile(uq)
    margin = multiplier * (high_q - low_q)
    inside_fences = column.between(low_q - margin, high_q + margin)
    return df[inside_fences]

def train_and_evaluate(df):
    """Fit a linear regression on an 80/20 split of ``df`` and score it.

    Every column except ``"price"`` is used as a predictor. The split is
    seeded (``random_state=42``), so repeated calls on the same frame use the
    same partition. Returns ``(mse, r2)`` measured on the held-out 20%.
    """
    target = df["price"]
    predictors = df.drop(columns="price")
    splits = train_test_split(predictors, target, test_size=0.2, random_state=42)
    train_X, test_X, train_y, test_y = splits

    regressor = LinearRegression()
    regressor.fit(train_X, train_y)
    predicted = regressor.predict(test_X)

    return mean_squared_error(test_y, predicted), r2_score(test_y, predicted)

def find_best_quantiles(df, features, min_rows=10):
    """Grid-search the outlier-filter quantiles that give the best model fit.

    Lower quantiles start at 0.05 and upper quantiles at 0.5, both in steps of
    0.05 (stopping short of 0.5 and 0.95 respectively). For each pair, every
    feature in ``features`` is filtered in turn with ``remove_outliers`` and
    the surviving rows are scored with ``train_and_evaluate``.

    The winner is the pair with the highest R^2. MSE is not comparable between
    candidates because each one keeps a different subset of rows. The previous
    ``mse < best_mse or r2 > best_r2`` test could also replace the stored best
    with a candidate that was worse on one of the two metrics.

    Args:
        df: Frame containing ``"price"`` and the feature columns.
        features: Column names to filter on, applied in the given order.
        min_rows: Candidates that leave fewer rows than this are skipped, so
            an over-aggressive filter cannot crash or distort the search.

    Returns:
        ``(best_mse, best_r2, best_lq, best_uq)`` for the winning pair. If no
        candidate qualifies, the initial values ``(inf, -inf, 0, 0)`` are
        returned.
    """
    best_mse = float('inf')
    best_r2 = float('-inf')
    best_lq = best_uq = 0

    # np.arange accumulates float error (e.g. 0.15000000000000002); rounding
    # keeps the reported quantiles clean.
    lower_grid = np.round(np.arange(0.05, 0.5, 0.05), 2)
    upper_grid = np.round(np.arange(0.5, 0.95, 0.05), 2)

    for lq in lower_grid:
        for uq in upper_grid:
            # remove_outliers returns a new frame, so df itself is never mutated.
            temp_df = df
            for feature in features:
                temp_df = remove_outliers(temp_df, feature, lq, uq)

            if len(temp_df) < min_rows:
                continue

            mse, r2 = train_and_evaluate(temp_df)
            # A NaN score compares False here and is therefore never selected.
            if r2 > best_r2:
                best_mse, best_r2 = mse, r2
                best_lq, best_uq = float(lq), float(uq)

    return best_mse, best_r2, best_lq, best_uq


# Run the quantile search on the cleaned data and report the winning setting.
best_mse, best_r2, best_lq, best_uq = find_best_quantiles(df_ny, features)
for_mse = format(best_mse, ",.2f")  # thousands separators, two decimals
print(f"Best MSE: {for_mse}, Best R2: {best_r2}")
print(f"Best Lower Quantile: {best_lq}, Best Upper Quantile: {best_uq}")


## Clean Data - Delete outliers using IQR

In [None]:
features = ['bedroom_number', 'bathroom_number', 'living_space', 'land_space','distance_to_nyc']

# Apply the outlier filter the same way find_best_quantiles scored it: one
# feature at a time through remove_outliers, each fence computed on the rows
# that survived the previous feature. The earlier version computed every fence
# on the full frame, so the data it kept was not the data the chosen quantiles
# were evaluated on. It also selected rows through list(set(...)), which leaves
# the row order unspecified.
temp = df_ny
for feature in features:
    temp = remove_outliers(temp, feature, best_lq, best_uq)

# Same scatter plots as before, now drawn from the outlier-filtered frame.
sns.set_style('whitegrid')

for feature in features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=temp, x=feature, y='price', ax=ax)
    ax.set_title(f'Price vs {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Price')
    plt.show()


In [None]:
# Adopt the outlier-filtered frame as the working dataset for the model below.
df_ny = temp

## Housing Model

In [None]:
def makeModel(dataframe):
    """Train a linear regression that predicts ``price`` and print its test scores.

    All columns other than ``"price"`` are used as predictors. The data is
    split 80/20 with a fixed seed (``random_state=42``); the mean squared error
    and R^2 on the held-out part are printed. Returns the fitted
    ``LinearRegression``.
    """
    predictors = dataframe.drop(columns="price")
    target = dataframe["price"]

    train_X, test_X, train_y, test_y = train_test_split(
        predictors, target, test_size=0.2, random_state=42
    )

    model = LinearRegression()
    model.fit(train_X, train_y)

    predicted = model.predict(test_X)
    mse = mean_squared_error(test_y, predicted)
    r2 = r2_score(test_y, predicted)

    formatted_mse = format(mse, ",.2f")
    print(f"Mean Squared Error: {formatted_mse}")
    print(f"R^2 Score: {r2}")

    return model

In [None]:
# Bind the fitted regressor to `model`, the name the prediction cell calls
# `.predict` on. It was previously bound only to `mode`, leaving `model`
# undefined.
model = makeModel(df_ny)
mode = model  # backward-compatible alias for the old name

## City Conversion

In [None]:
from geopy.geocoders import Nominatim

# Nominatim geocoding client from geopy; user_agent is the identifier passed to the service.
geolocator = Nominatim(user_agent="appriase")

def geocode_city(city_name):
    """Look up the coordinates of a city in New York State.

    Args:
        city_name: City name as typed by the user; ``', New York'`` is
            appended before the lookup.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``None`` when the lookup fails.
        A message is printed in the ``None`` case, so callers must check for
        ``None`` before unpacking the result.
    """
    # Local import keeps this cell self-contained; geopy is already a
    # dependency of the notebook.
    from geopy.exc import GeopyError

    try:
        location = geolocator.geocode(city_name + ', New York')
    except GeopyError as e:
        # Service-side problems (timeout, quota, unavailable) are reported as
        # such instead of being mislabelled as an invalid city.
        print(f"Geocoding service error ({e}); try again")
        return None

    # geocode() returns None when nothing matches the query.
    if location is None:
        print("Invalid city name try again")
        return None

    return location.latitude, location.longitude

## User Input

In [None]:
# Collect the listing's attributes from the user.
bed   =   float(input("Enter how many beds: "))
bath  =   float(input("Enter how many bathrooms: "))
land =   float(input("Enter how many sqft the land is: "))
house  =   float(input("Enter how the size of the house in square feet: "))

# geocode_city prints a message and returns None when the lookup fails.
# Unpacking that result directly raised TypeError, so keep asking until a city
# resolves.
while True:
    city = input("Enter the city in the State of New York: ")
    coordinates = geocode_city(city)
    if coordinates is not None:
        break

lat, lon = coordinates
distance = great_circle((lat, lon), nyc_center).miles

data = {
    'bedroom_number': [bed],
    'bathroom_number': [bath],
    'living_space': [house],
    'land_space': [land],
    'distance_to_nyc': [distance],
}

input_df = pd.DataFrame(data)

# NOTE(review): expects the fitted regressor from makeModel to be bound to `model`.
# When the estimator recorded its training column names, present the input in
# that same order. If the column sets differ, leave the frame as is and let
# predict() report the mismatch.
trained_columns = getattr(model, "feature_names_in_", None)
if trained_columns is not None and set(trained_columns) == set(input_df.columns):
    input_df = input_df[list(trained_columns)]

prediction = list(model.predict(input_df))

# Print the predictions
print("Predicted Price:")
print(f"${prediction[0]:.2f}")