In [3]:
# Step 1: Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [5]:
# Step 2: Build a small synthetic housing dataset to use as a worked example
# 'square_footage', 'bedrooms' and 'bathrooms' are the features; 'price' is the target
data = dict(
    square_footage=[1500, 2000, 2500, 1800, 2200, 3000, 3500, 4000, 1300, 1700],
    bedrooms=[3, 4, 4, 3, 4, 5, 5, 6, 2, 3],
    bathrooms=[2, 3, 3, 2, 3, 4, 4, 5, 1, 2],
    price=[300000, 400000, 500000, 350000, 450000, 600000, 700000, 800000, 250000, 320000],
)

# Wrap the column dictionary in a pandas DataFrame (one row per house)
df = pd.DataFrame(data=data)

# Preview the first five rows as a quick sanity check
print(df.head(n=5))


   square_footage  bedrooms  bathrooms   price
0            1500         3          2  300000
1            2000         4          3  400000
2            2500         4          3  500000
3            1800         3          2  350000
4            2200         4          3  450000


In [7]:
# Step 3: Separate the predictors (independent variables) from the response (dependent variable)
feature_columns = ['square_footage', 'bedrooms', 'bathrooms']
target_column = 'price'

# Predictor table: one column per feature
X = df[feature_columns]

# Response series: the 'price' column
y = df[target_column]

# Show the first rows of each selection to confirm it is what we expect
for label, preview in (("Features (X):\n", X.head()), ("Target (y):\n", y.head())):
    print(label, preview)


Features (X):
    square_footage  bedrooms  bathrooms
0            1500         3          2
1            2000         4          3
2            2500         4          3
3            1800         3          2
4            2200         4          3
Target (y):
 0    300000
1    400000
2    500000
3    350000
4    450000
Name: price, dtype: int64


In [9]:
# Step 4: Hold out part of the data for evaluation
# test_size=0.2 keeps 80% of the rows for training and reserves 20% for testing;
# random_state=42 is passed so the split is the same on every run
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each piece of the split
print("Training set shape (X_train, y_train):", X_train.shape, y_train.shape)
print("Testing set shape (X_test, y_test):", X_test.shape, y_test.shape)


Training set shape (X_train, y_train): (8, 3) (8,)
Testing set shape (X_test, y_test): (2, 3) (2,)


In [11]:
# Step 5: Create the linear regression estimator and fit it on the training split
# (fit() returns the fitted estimator itself, so construction and training can be chained)
model = LinearRegression().fit(X_train, y_train)

# Inspect the learned parameters: one slope per feature plus the y-intercept
learned_slopes = model.coef_
learned_intercept = model.intercept_
print("Coefficients (slopes):", learned_slopes)
print("Intercept (y-intercept):", learned_intercept)


Coefficients (slopes): [  178.38164251 10326.08695652 10326.08695652]
Intercept (y-intercept): -22777.77777777787


In [13]:
# Step 6: Predict prices for the held-out test rows
y_pred = model.predict(X_test)

# Show the predictions next to the true prices for a side-by-side comparison
actual_prices = y_test.values
print("Predicted prices:", y_pred)
print("Actual prices:", actual_prices)


Predicted prices: [240096.61835749 406268.11594203]
Actual prices: [250000 400000]


In [15]:
# Step 7: Score the test-set predictions with Mean Squared Error (MSE) and R-squared (R²)
mse, r2 = (
    mean_squared_error(y_true=y_test, y_pred=y_pred),
    r2_score(y_true=y_test, y_pred=y_pred),
)

# Report both evaluation metrics
print("Mean Squared Error (MSE):", mse)
print("R-squared (R²):", r2)


Mean Squared Error (MSE): 68683122.70998244
R-squared (R²): 0.9877896670737809


In [23]:
# Step 8: Predict the price of a single unseen house
# (2500 sq ft, 3 bedrooms, 2 bathrooms — values listed in the training column order).
# pandas is already imported in the notebook's first cell, so it is not re-imported
# here; the former numpy re-import was unused in this cell.

# Take the column labels straight from the training frame so the new row always
# carries exactly the feature names (and order) the model was fitted on, instead
# of a hand-copied list that could drift out of sync
new_house = pd.DataFrame([[2500, 3, 2]], columns=X_train.columns)

# Make the prediction (predict returns one value per input row)
predicted_price = model.predict(new_house)

# Print the predicted price for the single row
print("Predicted price for the new house:", predicted_price[0])


Predicted price for the new house: 474806.7632850241
