In [None]:
#imports

import os
import pandas as pd
from pathlib import Path

In [None]:
# Location of the raw housing CSV, relative to the working directory
data_load = Path('Resources/Housing.csv')

# Read the raw file into the working DataFrame
housing = pd.read_csv(data_load, encoding="utf-8")

# Preview the first rows of the data exactly as loaded
print("Original DataFrame:")
housing.head()

In [None]:
# Columns that are removed from the working frame
columns_to_drop = [
    'sqft_living',
    'grade',
    'sqft_above',
    'sqft_basement',
    'sqft_living15',
    'sqft_lot15',
]

# Rebind `housing` to a copy without those columns
housing = housing.drop(columns_to_drop, axis=1)

# Per-column count of missing values in what remains
print("\nNull and NaN values after dropping columns:")
print(housing.isnull().sum())



In [None]:
# Boolean mask marking every row that exactly repeats an earlier row
duplicates = housing.duplicated()

# How many rows the mask flags
num_duplicates = duplicates.sum()

if num_duplicates == 0:
    print("No duplicates found.")
else:
    print(f"Number of duplicate rows: {num_duplicates}")
    # Show the flagged rows before discarding them
    duplicate_rows = housing.loc[duplicates]
    print("\nDuplicate rows:")
    print(duplicate_rows)

    # Keep only the first occurrence of each row
    housing = housing.drop_duplicates()
    print("\nDuplicates removed.")


In [None]:
# Map the raw column labels to more descriptive names.
# NOTE(review): later cells address the price column as ' price ' (with
# surrounding spaces) rather than 'sale_price'. rename() ignores labels it
# cannot find, so the 'price' entry may be matching nothing — confirm
# against the CSV header.
column_renames = {
    'id': 'property_id',
    'price': 'sale_price',
    'bedrooms': 'num_bedrooms',
    'bathrooms': 'num_bathrooms',
    'sqft_lot': 'lot_size',
    'floors': 'num_floors',
    'waterfront': 'is_waterfront',
    'view': 'view_rating',
    'condition': 'condition_rating',
    'yr_built': 'year_built',
    'yr_renovated': 'year_renovated',
    'zipcode': 'zip_code',
    'lat': 'latitude',
    'long': 'longitude',
}
housing = housing.rename(columns=column_renames)


In [None]:
# Function to convert date format
def convert_date(date_str):
    """Reformat a ``YYYYMMDD[T...]`` timestamp string as ``MM/DD/YYYY``.

    Everything from the first ``'T'`` onward is discarded, the remaining
    ``YYYYMMDD`` text is parsed with pandas, and the result is rendered with
    a zero-padded month and day.
    """
    day_part, _, _ = date_str.partition('T')
    return pd.to_datetime(day_part, format='%Y%m%d').strftime('%m/%d/%Y')

# Replace the raw 'date' column with a reformatted 'dates' column, if present
if 'date' not in housing.columns:
    print("date column not found in the DataFrame.")
else:
    # Reformat every raw timestamp, append the result as 'dates', then
    # discard the original column
    reformatted = housing['date'].apply(convert_date)
    housing = housing.assign(dates=reformatted).drop(columns=['date'])

# Show a sample of the result
print("\nModified DataFrame:")
print(housing.head())


In [None]:
# Write the cleaned frame to disk without the row index
housing.to_csv(path_or_buf='Resources/ModifiedHomes.csv', index=False)
print("New DataFrame saved to 'Resources/ModifiedHomes.csv'")


In [None]:
# Preview the first ten rows of the cleaned frame.
# (A bare pd.read_csv(data_load, ...) call used to sit here; its result was
# never assigned, so it only re-read the file without affecting `housing`.)
housing.head(10)

In [None]:
# Column dtypes of the working frame
print(housing.dtypes)

In [None]:
# Column labels of the working frame
housing.columns

In [None]:
# Turn the ' price ' column (note the spaces in the label) from text into
# numbers: strip dollar signs, commas and surrounding whitespace first.
price_text = (
    housing[' price ']
    .astype(str)
    # regex=False: '$' is the regex end-of-string anchor, so both patterns
    # must be treated as literal text regardless of the pandas default
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)

# Anything that is still not a number becomes NaN instead of raising
housing[' price '] = pd.to_numeric(price_text, errors='coerce')

# Number of prices that could not be parsed
print(housing[' price '].isnull().sum())

# Display the frame to check the conversion
housing

In [None]:
# Check the dtypes to confirm that ' price ' was converted
housing.dtypes

In [None]:
# Parse the MM/DD/YYYY strings into datetimes, then order the rows
# chronologically and use the sale date as the index.
housing['dates'] = pd.to_datetime(housing['dates'], format='%m/%d/%Y')
housing = housing.sort_values('dates').set_index('dates')

# Relative change in ' price ' from the previous row, restarting for each
# calendar year of the index (the first row of every year is NaN).
# NOTE(review): the grouping key is the year and the change is row-to-row,
# so "monthly" in the name does not describe what is computed — confirm intent.
monthly_price_change = housing.groupby(housing.index.year)[' price '].pct_change()
housing['monthly_price_change'] = monthly_price_change
housing


In [None]:
# Rows whose price change is undefined, kept aside for inspection
nan_values = housing[housing['monthly_price_change'].isna()]

# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# housing[...] operates on an intermediate Series, which pandas warns about
# and which leaves `housing` untouched when Copy-on-Write is enabled.
housing['monthly_price_change'] = housing['monthly_price_change'].fillna(0)
housing.head()

In [None]:
def categorize_renovation(year):
    """Label a renovation-year value.

    Returns ``'never_renovated'`` when ``year`` equals 0 and ``'renovated'``
    for every other value (including NaN, which does not compare equal to 0).
    """
    # The former `year > 1900` branch and its `else` returned the same label,
    # so a single condition covers every case.
    return 'never_renovated' if year == 0 else 'renovated'

# Create a new column 'renovation_category' by labelling each
# 'year_renovated' value with categorize_renovation
housing['renovation_category'] = housing['year_renovated'].apply(categorize_renovation)

housing

In [None]:
# One-hot encode 'renovation_category' as integer indicator columns.
# drop_first=True omits the first category level; later cells read a
# 'renovated' column, so presumably that is the one that remains — TODO confirm.
dummies = pd.get_dummies(housing['renovation_category'], drop_first=True, dtype=int)

# Append the indicator column(s) to the working frame
housing = pd.concat([housing, dummies], axis=1)

housing

In [None]:
# Turn the index (set to 'dates' in an earlier cell) back into a regular column
housing.reset_index(inplace=True)

In [None]:
# Column dtypes after restoring the 'dates' column
housing.dtypes

In [None]:
# Build the modelling frame: drop the identifier column together with
# 'year_renovated' and 'renovation_category'
house_df = housing.drop(columns= ['property_id', 'year_renovated','renovation_category'] )
house_df.head(10)

In [None]:
def convert_to_string(dt):
    """Render a datetime as ``YYYY-MM-DD``; null values become an empty string."""
    if pd.isnull(dt):
        return ''
    return dt.strftime('%Y-%m-%d')

# Convert each 'dates' value to a 'YYYY-MM-DD' string (nulls become '').
# NOTE(review): a later cell parses this same column back into datetimes with
# custom_date_conversion, so the two steps largely cancel out — confirm
# whether both are needed.
house_df['dates'] = house_df['dates'].apply(convert_to_string)

# Print the frame to check the converted column
print(house_df)

In [None]:
# (rows, columns) of the modelling frame
house_df.shape

In [None]:
# Column dtypes, non-null counts and memory usage
house_df.info()

In [None]:
# Summary statistics of the modelling frame
house_df.describe()

In [None]:
# Inspect the 'dates' column on its own
print(house_df['dates'])


In [None]:
# Number of null entries in 'dates'
house_df['dates'].isnull().sum()


In [None]:
def custom_date_conversion(date_str):
    """Parse ``date_str`` with ``pd.to_datetime``; return ``pd.NaT`` on ``ValueError``."""
    try:
        parsed = pd.to_datetime(date_str)
    except ValueError:
        # Unparseable input is mapped to the missing-datetime marker
        parsed = pd.NaT
    return parsed

# Parse the 'dates' values back into datetimes; values that raise ValueError
# during parsing become NaT.
# NOTE(review): an earlier cell turned this same column into strings with
# convert_to_string — confirm whether the round trip is intended.
house_df['dates'] = house_df['dates'].apply(custom_date_conversion)

In [None]:
# Rows whose 'dates' value is null after the conversion
rows_with_nat = house_df[house_df['dates'].isnull()]
print(rows_with_nat)


In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Annotated heatmap of the pairwise correlations between house_df columns
plt.figure(figsize=(12, 6))
sns.heatmap(
    house_df.corr(),
    annot=True,
    fmt='.2f',
    cmap='BrBG',
    linewidths=2,
)

In [None]:
# Scatter each feature against ' price ', one figure per feature.
# A single loop replaces the former copy-pasted one-line cells, in which
# 'zip_code' was plotted twice.
price_scatter_features = [
    'num_bedrooms',
    'num_bathrooms',
    'lot_size',
    'num_floors',
    'is_waterfront',
    'view_rating',
    'condition_rating',
    'year_built',
    'zip_code',
    'latitude',
    'longitude',
    'monthly_price_change',
    'renovated',
    'dates',
]

for feature in price_scatter_features:
    fig, ax = plt.subplots()
    ax.scatter(house_df[feature], house_df[' price '])
    # Label the axes so each figure can be read on its own
    ax.set(xlabel=feature, ylabel='price', title=f'price vs {feature}')
    plt.show()


In [None]:
# Grid of pairwise plots across the columns of house_df
sns.pairplot(house_df)


In [None]:
# Distribution of the ' price ' column
sns.displot(house_df[' price '])


In [None]:
# Count columns by dtype family. The pandas dtype predicates are used rather
# than comparing against the strings 'int' / 'float', which only match one
# specific width (the platform default integer and float64) and would miss
# e.g. int32 or float32 columns.
column_dtypes = house_df.dtypes

object_cols = [col for col, dtype in column_dtypes.items()
               if pd.api.types.is_object_dtype(dtype)]
print("Categorical variables:",len(object_cols))

num_cols = [col for col, dtype in column_dtypes.items()
            if pd.api.types.is_integer_dtype(dtype)]
print("Integer variables:",len(num_cols))

fl_cols = [col for col, dtype in column_dtypes.items()
           if pd.api.types.is_float_dtype(dtype)]
print("Float variables:",len(fl_cols))

In [None]:
# Missing-value count per column of the modelling frame
house_df.isnull().sum()


In [None]:

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
 
# Features: every column except the target and two columns that must not be
# fed to the regression:
#  * 'monthly_price_change' is computed from ' price ' itself (pct_change of
#    the target), so keeping it would leak the answer into the features.
#  * 'dates' holds datetimes, which LinearRegression cannot consume as-is.
X = house_df.drop(columns=[' price ', 'monthly_price_change', 'dates'])

# Target: the cleaned price
Y = house_df[' price ']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.8, test_size=0.2, random_state=0)


In [None]:
# Report the dimensions of each split
for label, split in (
    ("xtrain shape : ", X_train),
    ("xtest shape  : ", X_test),
    ("ytrain shape : ", Y_train),
    ("ytest shape  : ", Y_test),
):
    print(label, split.shape)

In [None]:
# Fitting Multi Linear regression model to training model
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training split
regressor = LinearRegression().fit(X_train, Y_train)

# Predict the target for the held-out rows
y_pred = regressor.predict(X_test)


In [None]:
# Actual (x axis) versus predicted (y axis) values on the held-out set
plt.scatter(Y_test,y_pred)


In [None]:
from sklearn import metrics
import numpy as np
# Error metrics on the held-out set; RMSE is the square root of the MSE
mae = metrics.mean_absolute_error(Y_test, y_pred)
mse = metrics.mean_squared_error(Y_test, y_pred)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))


In [None]:
# Side-by-side table of the actual and predicted values for the held-out rows
results = pd.DataFrame({'Actual': Y_test, 'Predicted': y_pred})
print(results)
