In [33]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import plotly.graph_objects as go


In [5]:
# Load the preprocessed dataset; the first CSV column is used as the row index.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# will not run on another machine as-is -- consider a configurable, relative data path.
df = pd.read_csv('C:/Users/dsimo/Downloads/GreenScape-main/GreenScape/EDA/Simona/scripts/Preprocessed_Dataset.csv',index_col=0)

In [6]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 178528 entries, 2751492 to 455607
Data columns (total 25 columns):
 #   Column                                                                         Non-Null Count   Dtype  
---  ------                                                                         --------------   -----  
 0   Year                                                                           178528 non-null  int64  
 1   date                                                                           178528 non-null  object 
 2   regions                                                                        178528 non-null  object 
 3   Neighborhood                                                                   178528 non-null  object 
 4   green_score                                                                    178528 non-null  float64
 5   livability_score_x                                                             178528 non-null  float64
 6   TotalH

In [7]:
# Drop the object-dtype columns (date, regions, Neighborhood) so the remaining
# frame can be passed straight to the regressors below.
df_numeric = df.select_dtypes(exclude=object)

In [8]:
# Confirm that only int/float columns remain after the dtype filter.
df_numeric.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 178528 entries, 2751492 to 455607
Data columns (total 22 columns):
 #   Column                                                                         Non-Null Count   Dtype  
---  ------                                                                         --------------   -----  
 0   Year                                                                           178528 non-null  int64  
 1   green_score                                                                    178528 non-null  float64
 2   livability_score_x                                                             178528 non-null  float64
 3   TotalHouses                                                                    178528 non-null  int64  
 4   Population                                                                     178528 non-null  int64  
 5   working_population                                                             178528 non-null  int64  
 6   Income

In [40]:
def compare_linear_regression(target_col, df):
    """Fit a LinearRegression baseline and report its hold-out metrics.

    Every column of ``df`` except ``target_col`` is used as a feature. The data
    is split 80/20 with ``random_state=42``, the model is fitted on the larger
    part and scored on the smaller one. The metrics are also printed.

    Parameters
    ----------
    target_col : column label of the value to predict.
    df : DataFrame holding the features and the target column.

    Returns
    -------
    tuple
        ``(mse, r2, y_pred)`` -- mean squared error, R-squared and the
        predictions for the test split.
    """
    features = df.drop(columns=target_col)
    target = df[target_col]

    feat_train, feat_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # fit() returns the estimator itself, so training and prediction can be chained.
    predictions = LinearRegression().fit(feat_train, target_train).predict(feat_test)

    mse = mean_squared_error(target_test, predictions)
    r2 = r2_score(target_test, predictions)

    print("Linear Regression Metrics:")
    for label, value in (("Mean Squared Error:", mse), ("R-squared:", r2)):
        print(label, value)

    return mse, r2, predictions


In [41]:
def compare_random_forest(target_col, df):
    """Fit a default RandomForestRegressor and report its hold-out metrics.

    Uses the same protocol as the linear baseline: all columns except
    ``target_col`` are features, and the data is split 80/20 with
    ``random_state=42``. The forest itself is seeded with ``random_state=42``.
    The metrics are also printed.

    Parameters
    ----------
    target_col : column label of the value to predict.
    df : DataFrame holding the features and the target column.

    Returns
    -------
    tuple
        ``(mse, r2, y_pred)`` -- mean squared error, R-squared and the
        predictions for the test split.
    """
    target = df[target_col]
    features = df.drop(columns=target_col)

    split = train_test_split(features, target, test_size=0.2, random_state=42)
    feat_train, feat_test, target_train, target_test = split

    forest = RandomForestRegressor(random_state=42)
    forest.fit(feat_train, target_train)
    predictions = forest.predict(feat_test)

    mse = mean_squared_error(target_test, predictions)
    r2 = r2_score(target_test, predictions)

    print("Random Forest Metrics:")
    print("Mean Squared Error:", mse)
    print("R-squared:", r2)

    return mse, r2, predictions

In [43]:
# Train both baselines on the numeric frame with 'green_score' as the target.
# Both helpers use the same 80/20 split (random_state=42), so the two prediction
# arrays refer to the same test rows.
lr_mse, lr_r2, y_pred_lr = compare_linear_regression('green_score', df_numeric)
rf_mse, rf_r2, y_pred_rf =compare_random_forest('green_score', df_numeric)

Linear Regression Metrics:
Mean Squared Error: 98.79459637867966
R-squared: 0.1309718494362302
Random Forest Metrics:
Mean Squared Error: 41.59625958234667
R-squared: 0.6341062987224452


In [47]:
def tuning(target_col, df):
    """Tune a RandomForestRegressor with a randomized search and score the winner.

    The data is split 80/20 (``random_state=42``). Ten parameter combinations
    are sampled from the grid below and compared with 5-fold cross-validation
    on the training part, using negative mean squared error as the score. The
    best estimator is then evaluated on the held-out part. The selected
    hyperparameters and the test MSE are printed.

    Parameters
    ----------
    target_col : column label of the value to predict.
    df : DataFrame holding the features and the target column.

    Returns
    -------
    tuple
        ``(mse, r2, y_pred)`` -- test-set mean squared error, R-squared and
        predictions of the best estimator found.
    """
    features = df.drop(columns=target_col)
    target = df[target_col]

    feat_train, feat_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Candidate values the search samples from.
    search_space = dict(
        n_estimators=[100, 200, 300],
        max_depth=[None, 5, 10],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    )

    search = RandomizedSearchCV(
        RandomForestRegressor(random_state=42),
        param_distributions=search_space,
        n_iter=10,
        cv=5,
        scoring='neg_mean_squared_error',
        random_state=42,
    )
    search.fit(feat_train, target_train)

    predictions = search.best_estimator_.predict(feat_test)
    mse = mean_squared_error(target_test, predictions)
    r2 = r2_score(target_test, predictions)

    print("Best Hyperparameters:", search.best_params_)
    print("Best Model MSE:", mse)

    return mse, r2, predictions

In [48]:
# Run the randomized hyperparameter search for the 'green_score' target.
# Only the metrics and test-set predictions are returned; the fitted estimator is not kept.
mse_tuned, r2_tuned, y_pred_tuned = tuning('green_score',df_numeric)

Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': None}
Best Model MSE: 37.96744085994352


In [49]:
# Recreate the hold-out split used inside compare_linear_regression,
# compare_random_forest and tuning (same test_size and random_state), so that
# y_test lines up row-for-row with the prediction arrays they returned.
X = df_numeric.drop('green_score', axis=1)
y = df_numeric['green_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _prediction_scatter(actual, predicted, name, color):
    """Build the actual-vs-predicted marker trace for a single model."""
    return go.Scatter(
        x=actual,
        y=predicted,
        mode='markers',
        name=name,
        marker=dict(
            color=color,
            symbol='circle',
            size=6
        )
    )


# One trace per model: linear baseline, default random forest, tuned random forest.
scatter_linear_reg = _prediction_scatter(y_test, y_pred_lr, 'Linear Regression', 'blue')
scatter_random_for = _prediction_scatter(y_test, y_pred_rf, 'Random Forest', 'green')
scatter_tuned = _prediction_scatter(y_test, y_pred_tuned, 'After RandomizedSearchCV', 'purple')

# Diagonal reference line: points on it are perfect predictions.
# Series.min()/max() are vectorised, unlike the builtins iterating element by element.
actual_range = [y_test.min(), y_test.max()]
diagonal_line = go.Scatter(
    x=actual_range,
    y=actual_range,
    mode='lines',
    name='Perfect Prediction',
    line=dict(
        color='red',
        dash='dash'
    )
)

# Create layout
layout = go.Layout(
    title='Performance Comparison',
    xaxis=dict(title='Actual Values'),
    yaxis=dict(title='Predicted Values'),
    showlegend=True
)

# Create figure
fig = go.Figure(data=[scatter_linear_reg, scatter_random_for, scatter_tuned, diagonal_line], layout=layout)

# Show the figure
fig.show()

In [52]:
def plot_performance_bar(linear_mse, linear_r2, rf_mse, rf_r2, mse_tuned, r2_tuned):
    """Show MSE and R-squared of the three models as a grouped bar chart.

    MSE and R-squared live on very different scales (R-squared is at most 1,
    while MSE is unbounded), so each metric gets its own y-axis: MSE on the
    left, R-squared on the right. On a single shared axis the R-squared bars
    would be almost invisible next to the MSE bars.

    Parameters
    ----------
    linear_mse, linear_r2 : metrics of the linear regression model.
    rf_mse, rf_r2 : metrics of the default random forest.
    mse_tuned, r2_tuned : metrics of the tuned random forest.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.
    """
    models = ['Linear Regression', 'Random Forest', 'After tuning Random Forest']
    mse_scores = [linear_mse, rf_mse, mse_tuned]
    r2_scores = [linear_r2, rf_r2, r2_tuned]

    mse_trace = go.Bar(
        x=models,
        y=mse_scores,
        name='Mean Squared Error (MSE)',
        # Rounded labels keep the bars readable; full-precision floats overflow them.
        text=[f'{score:.3f}' for score in mse_scores],
        textposition='auto',
        offsetgroup='mse'
    )

    r2_trace = go.Bar(
        x=models,
        y=r2_scores,
        name='R-squared (R2)',
        text=[f'{score:.3f}' for score in r2_scores],
        textposition='auto',
        # Secondary axis; the distinct offsetgroup keeps the bars side by side
        # instead of stacked on top of the MSE bars.
        yaxis='y2',
        offsetgroup='r2'
    )

    layout = go.Layout(
        title='Model Performance',
        xaxis=dict(title='Models'),
        yaxis=dict(title='Mean Squared Error (MSE)'),
        yaxis2=dict(title='R-squared (R2)', overlaying='y', side='right'),
        barmode='group',
        # Horizontal legend above the plot so it does not cover the right-hand axis.
        legend=dict(orientation='h', yanchor='bottom', y=1.02),
        showlegend=True
    )

    fig = go.Figure(data=[mse_trace, r2_trace], layout=layout)

    # Show the figure
    fig.show()
# Compare the three models using the metrics computed in the cells above.
plot_performance_bar(lr_mse, lr_r2, rf_mse, rf_r2, mse_tuned, r2_tuned)


In [53]:
import plotly.graph_objects as go

def plot_future_predictions(df, model, n_years=5):
    """Plot model predictions for the years following the last year in ``df``.

    The most recent row of ``df`` (its last row) is replicated once per future
    year and used as the feature vector for each of those years.

    * If ``model`` exposes ``feature_names_in_`` (scikit-learn estimators fitted
      on a DataFrame do), the replicated rows get their ``Year`` set to the
      respective future year and are then reduced to exactly the columns, in
      the order, the model was fitted on.
    * Otherwise the ``Year`` column is dropped and all remaining columns are
      passed to the model.

    Parameters
    ----------
    df : DataFrame with a ``Year`` column plus the feature columns of ``model``.
    model : fitted estimator with a ``predict`` method.
    n_years : number of future years to predict (default 5).

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.
    """
    last_known_year = int(df['Year'].max())

    # A concrete list rather than a range object, to be safe as plotly x data.
    future_years = list(range(last_known_year + 1, last_known_year + 1 + n_years))

    # DataFrame has no .repeat(); repeating position -1 copies the last row n_years times.
    future_features = df.iloc[[-1] * n_years].reset_index(drop=True)

    expected_columns = getattr(model, 'feature_names_in_', None)
    if expected_columns is None:
        # No recorded feature names: pass every column except 'Year'.
        future_features = future_features.drop('Year', axis=1)
    else:
        # Advance the year, then match the fitted feature set and column order.
        future_features = future_features.assign(Year=future_years)[list(expected_columns)]

    future_predictions = model.predict(future_features)

    # Create the line chart trace
    line_trace = go.Scatter(
        x=future_years,
        y=future_predictions,
        mode='lines',
        name='Predicted Values',
        line=dict(color='blue')
    )

    # Create the layout
    layout = go.Layout(
        title='Predicted Values for Future Years',
        xaxis=dict(title='Year'),
        yaxis=dict(title='Predicted Values')
    )

    # Create the figure
    fig = go.Figure(data=[line_trace], layout=layout)

    # Show the figure
    fig.show()


In [54]:
# The original call used an undefined name (plot_future_years) and passed the
# prediction array where a fitted model is required. tuning() returns only
# metrics and predictions, so a forest is refitted here on all rows.
# NOTE(review): the hyperparameters are copied from the "Best Hyperparameters"
# output printed by the tuning cell -- update them if the search is re-run.
future_feature_frame = df_numeric.drop('green_score', axis=1)
future_model = RandomForestRegressor(
    n_estimators=200,
    min_samples_split=5,
    min_samples_leaf=4,
    max_depth=None,
    random_state=42,
)
future_model.fit(future_feature_frame, df_numeric['green_score'])

plot_future_predictions(future_feature_frame, future_model)