In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Read the raw house-price records from the CSV file into a DataFrame,
# printing a status line before and after the read.
csv_path = 'house price data.csv'
print("Loading dataset...")
dataset = pd.read_csv(csv_path)
print("Dataset loaded successfully.\n")

Loading dataset...
Dataset loaded successfully.



In [5]:
# Preview the top rows returned by head() as a quick sanity check of the load.
# sep/end reproduce the header line, the frame, and the trailing " \n\n".
print("First few rows of the dataset:", dataset.head(), sep="\n", end=" \n\n")

First few rows of the dataset:
                  date      price  bedrooms  bathrooms  sqft_living  sqft_lot  \
0  2014-05-02 00:00:00   313000.0       3.0       1.50         1340      7912   
1  2014-05-02 00:00:00  2384000.0       5.0       2.50         3650      9050   
2  2014-05-02 00:00:00   342000.0       3.0       2.00         1930     11947   
3  2014-05-02 00:00:00   420000.0       3.0       2.25         2000      8030   
4  2014-05-02 00:00:00   550000.0       4.0       2.50         1940     10500   

   floors  waterfront  view  condition  sqft_above  sqft_basement  yr_built  \
0     1.5           0     0          3        1340              0      1955   
1     2.0           0     4          5        3370            280      1921   
2     1.0           0     0          4        1930              0      1966   
3     1.0           0     0          4        1000           1000      1963   
4     1.0           0     0          4        1140            800      1976   

   yr_r

In [7]:
# Data Preprocessing: status line marking the start of the cleaning cells
progress_note = "Preprocessing data..."
print(progress_note)

Preprocessing data...


In [9]:
# Parse the 'date' column into pandas datetimes so the .dt accessor
# can be used on it afterwards.
parsed_dates = pd.to_datetime(dataset['date'])
dataset['date'] = parsed_dates

In [11]:
# Add one integer column per calendar component of the sale date.
# The tuple order fixes the order in which the new columns are appended.
for date_part in ('year', 'month', 'day'):
    dataset[date_part] = getattr(dataset['date'].dt, date_part)

In [13]:
# The raw 'date' column is no longer needed once year/month/day exist
dataset = dataset.drop(columns='date')

In [15]:
# Keep only the rows in which every column holds a non-null value
rows_fully_populated = dataset.notna().all(axis=1)
dataset = dataset[rows_fully_populated]

In [17]:
# Discard rows that exactly repeat an earlier row (the first occurrence is kept)
repeated_rows = dataset.duplicated()
dataset = dataset[~repeated_rows]

In [19]:
# Remove the address-related columns that this notebook does not use as
# model features.
address_columns = ['street', 'city', 'statezip', 'country']
dataset = dataset.drop(columns=address_columns)

In [21]:
# Announce the end of preprocessing and show the head of the cleaned frame
print("Data preprocessing completed.", "Dataset after preprocessing:", sep="\n")
print(dataset.head(), end=" \n\n")

Data preprocessing completed.
Dataset after preprocessing:
       price  bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  \
0   313000.0       3.0       1.50         1340      7912     1.5           0   
1  2384000.0       5.0       2.50         3650      9050     2.0           0   
2   342000.0       3.0       2.00         1930     11947     1.0           0   
3   420000.0       3.0       2.25         2000      8030     1.0           0   
4   550000.0       4.0       2.50         1940     10500     1.0           0   

   view  condition  sqft_above  sqft_basement  yr_built  yr_renovated  year  \
0     0          3        1340              0      1955          2005  2014   
1     4          5        3370            280      1921             0  2014   
2     0          4        1930              0      1966             0  2014   
3     0          4        1000           1000      1963             0  2014   
4     0          4        1140            800      1976          

In [23]:
# List each column's dtype so the reader can confirm that every remaining
# column is numeric. This cell only prints; it does not convert or enforce.
print("Data types of columns:", dataset.dtypes, sep="\n", end=" \n\n")

Data types of columns:
price            float64
bedrooms         float64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
year               int32
month              int32
day                int32
dtype: object 



In [25]:
# 'price' is the regression target; every other remaining column is a feature
y = dataset['price']
X = dataset.drop(columns='price')


In [27]:
# Hold out 20% of the rows for testing; random_state is fixed so the
# partition is the same on every run.
print("Splitting data into training and testing sets...")
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print("Data split completed.\n")

Splitting data into training and testing sets...
Data split completed.



In [29]:
# Report (rows, columns) for the training and testing feature matrices
print(f"Shape of training data: {X_train.shape}")
print(f"Shape of testing data: {X_test.shape} \n")

Shape of training data: (3680, 15)
Shape of testing data: (920, 15) 



In [31]:
# Print the dtype of every training feature so the reader can verify that
# all of them are numeric before fitting the model.
print("Data types of training features:", X_train.dtypes, sep="\n", end=" \n\n")

Data types of training features:
bedrooms         float64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
year               int32
month              int32
day                int32
dtype: object 



In [33]:
# Fit an ordinary least-squares linear regression on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
print("Training the linear regression model...")
model = LinearRegression().fit(X_train, y_train)
print("Model training completed.\n")

Training the linear regression model...
Model training completed.



In [35]:
# In-sample fit quality: predict on the rows the model was trained on and
# report MSE and R^2 for them.
print("Predicting on the training data...")
train_predictions = model.predict(X_train)
train_r2 = r2_score(y_train, train_predictions)
train_mse = mean_squared_error(y_train, train_predictions)
print(
    f"Training Mean Squared Error: {train_mse}\n"
    f"Training R^2 Score: {train_r2}\n"
)

Predicting on the training data...
Training Mean Squared Error: 64732206534.48784
Training R^2 Score: 0.5444149716609681



In [37]:

# Model Evaluation: score the fitted model on the held-out test split and
# report MSE and R^2 for it.
print("Evaluating the model on the test data...")
test_predictions = model.predict(X_test)
test_r2 = r2_score(y_test, test_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
print(
    f"Test Mean Squared Error: {test_mse}\n"
    f"Test R^2 Score: {test_r2}\n"
)

Evaluating the model on the test data...
Test Mean Squared Error: 986454524777.5083
Test R^2 Score: 0.0327420063169509



In [39]:
# Testing with the same data (example)
#
# Reload the raw CSV and run it through the same preprocessing steps, in the
# same order, as the cells that built `dataset`: parse the date, derive
# year/month/day, drop 'date', remove null rows, remove duplicate rows, and
# only then drop the address columns. Filtering rows while the address
# columns are still present matters: null and duplicate detection look at
# every column, so dropping those columns first could select a different
# set of rows than the training pipeline did.
#
# NOTE: this file is the one the model was fitted on, so these predictions
# are not out-of-sample and say nothing about generalisation.
print("Testing with the same data (reloading and preprocessing)...")
new_data = pd.read_csv('house price data.csv')
new_data['date'] = pd.to_datetime(new_data['date'])
for date_part in ('year', 'month', 'day'):
    new_data[date_part] = getattr(new_data['date'].dt, date_part)
new_data = new_data.drop(columns='date')
# Row filtering happens before the address columns are removed (see above)
new_data = new_data.dropna()
new_data = new_data.drop_duplicates()
new_data = new_data.drop(columns=['street', 'city', 'statezip', 'country'])
# Select exactly the training feature columns, in training order; this also
# leaves out the 'price' target.
new_data = new_data[X.columns]
new_predictions = model.predict(new_data)
print("Predictions on the reloaded data:")
print(new_predictions)

Testing with the same data (reloading and preprocessing)...
Predictions on the reloaded data:
[ 354717.69153166 1257180.32212008  487026.06104154 ...  743174.47127322
  412448.39697367  390260.26230737]
