In [2]:
# Install scikit-learn for the running kernel; the %pip magic (rather than !pip)
# targets the kernel's own environment. Restart the kernel afterwards if prompted.
%pip install scikit-learn

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [5]:
# Read the full multi-state dataset, then keep an independent copy of the Alabama rows.
df = pd.read_csv('./every_state.csv')
is_alabama = df['RegionName'] == 'Alabama'
state_data = df.loc[is_alabama].copy()

# Build a timestamp for every row from its Year and Month columns ("<Year>-<Month>").
year_month = state_data['Year'].astype(str) + '-' + state_data['Month'].astype(str)
state_data['Date'] = pd.to_datetime(year_month)


In [None]:
# Fit one linear price trend per state and plot it with a 12-step extrapolation.
#
# LinearRegression needs numeric inputs and cannot consume datetime values directly,
# so each state's rows are ordered by date and numbered 0, 1, 2, ... in a 'Time'
# column; that sequence number is the single regression feature.
# NOTE(review): labelling 'Time' as "months since start" assumes one row per state
# per month with no gaps -- confirm against the dataset.
#
# groupby(sort=False) visits the states in order of first appearance (like unique())
# and leaves out rows whose RegionName is missing.
for state, state_rows in df.groupby('RegionName', sort=False):
    # Work on a copy so the new columns are not written into a slice of df.
    state_data = state_rows.copy()

    # Build the date from the Year and Month columns; passing the whole frame to
    # pd.to_datetime would try to assemble dates from year/month/day columns and fail.
    state_data['Date'] = pd.to_datetime(
        state_data['Year'].astype(str) + '-' + state_data['Month'].astype(str)
    )

    # Chronological order first, then the sequential index used as the feature.
    state_data = state_data.sort_values('Date').reset_index(drop=True)
    state_data['Time'] = range(len(state_data))

    X = state_data[['Time']]
    y = state_data['HousePrice']

    # Split data (fixed random_state keeps the split reproducible across runs)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Creating and training the model using scikit-learn LinearRegression
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions for the 12 steps that follow the last observed row.
    # A DataFrame with the same column name as the training data avoids
    # scikit-learn's "X does not have valid feature names" warning.
    n_observed = len(state_data)
    future_times = pd.DataFrame({'Time': range(n_observed, n_observed + 12)})
    predicted_prices = model.predict(future_times)

    # Plot results: one figure per state, titled with the state actually plotted.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.scatter(state_data['Time'], y, color='blue', label='Actual Prices')
    ax.plot(state_data['Time'], model.predict(X), color='red', label='Regression Line')
    ax.scatter(future_times['Time'], predicted_prices, color='green', label='Future Predictions')
    ax.set_xlabel('Time (Months since start)')
    ax.set_ylabel('House Price')
    ax.set_title(f'House Price Prediction for {state}')
    ax.legend()
    plt.show()
    # Release the figure so a long loop over states does not accumulate open figures.
    plt.close(fig)


