In [3]:
import numpy as np  # Import NumPy for numerical operations (arrays, math functions, etc.)
import pandas as pd  # Import pandas for data manipulation and analysis (tables, DataFrames)
import matplotlib.pyplot as plt  # Import matplotlib for plotting and visualizing data
from sklearn.datasets import fetch_california_housing  # Function to load the California housing dataset (sample data)
from sklearn.model_selection import train_test_split  # Function to split data into training and testing sets
from sklearn.linear_model import LinearRegression  # Linear Regression model for prediction
from sklearn.metrics import mean_squared_error, r2_score  # Metrics to evaluate model performance (error and accuracy)

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Fetch the California housing data as pandas objects (as_frame=True)
# instead of bare NumPy arrays.
data = fetch_california_housing(as_frame=True)

# The bundled frame is the table the rest of the notebook works from;
# both the 'MedInc' feature and the 'MedHouseVal' target are read from it.
df = data["frame"]

# Preview the first 5 rows as the cell's rich output
df.head(n=5)

In [None]:
# Per-column count of missing entries (isna is the same check as isnull)
missing_per_column = df.isna().sum()
print(missing_per_column)

# Summary statistics (count, mean, std, quartiles, ...) for each numeric column
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Single-feature regression: explain 'MedHouseVal' (median house value)
# with 'MedInc' (median income).
feature_columns = ['MedInc']
target_column = 'MedHouseVal'

X = df[feature_columns]  # selecting with a list keeps X a 2-D DataFrame
y = df[target_column]    # selecting with a label gives a 1-D Series

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore every downstream number) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit ordinary least squares on the training split only; the test split
# stays untouched until evaluation.
model = LinearRegression().fit(X_train, y_train)

# Echo the fitted estimator as the cell output
model

In [None]:
# Predict the target for the held-out rows; these predictions feed both the
# metrics and the plot below.
y_pred = model.predict(X=X_test)

In [None]:
# Score the held-out predictions against the true targets.
# MSE is in squared target units (lower is better); R^2 is unitless (1.0 is a perfect fit).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics rounded to two decimals
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

In [None]:
# Plot the held-out observations together with the fitted regression line.
fig, ax = plt.subplots()

# X_test holds a single feature column; flatten it to a 1-D array for plotting.
feature_values = X_test.iloc[:, 0].to_numpy()

ax.scatter(feature_values, y_test, color='blue', label='Actual')

# X_test is in shuffled split order. A line plot joins points in the order
# given, so draw the predictions in ascending feature order: one clean
# left-to-right curve instead of thousands of overlapping back-and-forth
# segments (which would become a scribble for any non-linear model).
order = np.argsort(feature_values)
ax.plot(
    feature_values[order],
    np.asarray(y_pred)[order],
    color='red',
    linewidth=2,
    label='Predicted',
)

ax.set_xlabel('Median Income')
ax.set_ylabel('Median House Value')
ax.set_title('Linear Regression: House Value vs. Income')
ax.legend()
plt.show()