In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import statsmodels.api as sm
import scipy.stats as stats

# Load the weather CSV and build the single-feature regression inputs.
# The path is relative, so it is resolved against the kernel's current
# working directory.
csv_path = "vPortalQuestions/Data/ML374_S6_Concept_Weather_Cleaned_Data.csv"
if not os.path.isfile(csv_path):
    # Include the absolute location so a wrong working directory is easy to spot.
    raise FileNotFoundError(
        f"CSV not found at: {csv_path} (resolved to: {os.path.abspath(csv_path)})"
    )

# Columns the rest of the notebook relies on: one predictor and the target.
required_columns = ['global_radiation', 'temperature']

# Load data and fail early, with a readable message, if a needed column is absent
df = pd.read_csv(csv_path)
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"CSV is missing required column(s): {missing_columns}")

# Keep relevant columns and drop rows with a missing value in either of them
df = df[required_columns].copy().dropna()
if df.empty:
    # Nothing can be fitted or plotted from zero rows; stop here rather than
    # letting a later cell fail with a less obvious error.
    raise ValueError(
        f"No complete rows for columns {required_columns} in: {csv_path}"
    )

# X is 2-D (one column) and y is 1-D, both as NumPy arrays
X = df[['global_radiation']].values
y = df['temperature'].values

print(f"Loaded {len(df)} rows. Columns: {list(df.columns)}")


FileNotFoundError: CSV not found at: vPortalQuestions/Data/ML374_S6_Concept_Weather_Cleaned_Data.csv

In [None]:
# Linearity check: observations and a regression fit line drawn on one axes
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=df, x='global_radiation', y='temperature', s=20, alpha=0.6, ax=ax)
# scatter=False: the points are already drawn above, so only add the fit
sns.regplot(
    data=df,
    x='global_radiation',
    y='temperature',
    scatter=False,
    color='gold',
    line_kws={'linewidth': 2},
    ax=ax,
)
ax.set(
    xlabel='global_radiation',
    ylabel='temperature',
    title='Scatter: temperature vs global_radiation (with fit line)',
)
fig.tight_layout()
plt.show()


In [None]:
# Fit the sklearn linear model on (X, y); predictions and residuals are
# computed on the same rows the model was fitted on.
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)
residuals = y - y_pred

# Fit-quality metrics on those same rows
mae = mean_absolute_error(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))
r2 = r2_score(y, y_pred)

print(
    f"Coefficient (slope): {model.coef_[0]:.6f}",
    f"Intercept: {model.intercept_:.6f}",
    f"R2: {r2:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}",
    sep="\n",
)

# Refit with statsmodels OLS to get the statistical summary table
# (p-values, t-stats); add_constant supplies the intercept column.
X2 = sm.add_constant(X)
ols_res = sm.OLS(endog=y, exog=X2).fit()
print(ols_res.summary())


In [None]:
# Residual diagnostics, figure 1: two side-by-side panels
fig, (ax_spread, ax_hist) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: residuals vs fitted values (homoscedasticity check)
ax_spread.scatter(y_pred, residuals, alpha=0.5, s=20)
ax_spread.axhline(0, color='orange', linestyle='--')
ax_spread.set(xlabel='Fitted values', ylabel='Residuals', title='Residuals vs Fitted')

# Right panel: residual histogram (density scale) with a normal pdf overlaid,
# parameterised by the residuals' own mean and standard deviation
sns.histplot(residuals, bins=30, stat='density', color='skyblue', edgecolor='k', ax=ax_hist)
mu, sigma = residuals.mean(), residuals.std()
xs = np.linspace(residuals.min(), residuals.max(), 200)
ax_hist.plot(xs, stats.norm.pdf(xs, mu, sigma), color='orange', lw=2, label='Normal pdf')
# Labels are set after histplot so they replace the ones seaborn applied
ax_hist.set(title='Residuals Distribution', xlabel='Error Value', ylabel='Probability Density')
ax_hist.legend()

fig.tight_layout()
plt.show()

# Residual diagnostics, figure 2: Q-Q plot against the normal distribution
fig_qq, ax_qq = plt.subplots(figsize=(6, 5))
stats.probplot(residuals, dist='norm', plot=ax_qq)
# Set the title after probplot so it replaces the one probplot applied
ax_qq.set_title('Q-Q plot of residuals')
fig_qq.tight_layout()
plt.show()


In [None]:
# Actual vs predicted values; points on the dashed diagonal are exact predictions
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y, y_pred, alpha=0.6, s=20)

# Span the 45-degree reference line over the combined range of both series
mn = min(y.min(), y_pred.min())
mx = max(y.max(), y_pred.max())
ax.plot([mn, mx], [mn, mx], '--', color='orange')

ax.set(xlabel='Actual', ylabel='Predicted', title='Actual vs. Predicted')
fig.tight_layout()
plt.show()
