In [1]:
import pandas as pd
from operations.adjust_dataset import adjust_dataset
from operations.lin import LinearRegressionModel
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import GridSearchCV
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('docs/data/GlobalWeatherRepository.csv')

df = adjust_dataset(df, 'Kabul')
window_size = 7
output_size = 3
# Create lagged features for temperature_celsius
for lag in range(1, window_size + 1):  # Create lags up to the window size
    df[f'temp_lag_{lag}'] = df['temperature_celsius'].shift(lag)

# Function to create sequences for a single feature (temperature_celsius)
def create_temp_sequences(data, window_size, output_size):
    X, y = [], []
    for i in range(len(data) - window_size - output_size + 1):
        X.append(data[i:(i + window_size)])
        y.append(data[i + window_size:i + window_size + output_size])
    return np.array(X), np.array(y)

# Create sequences using only temperature_celsius
temp_data = df['temperature_celsius']
X_seq, y_seq = create_temp_sequences(temp_data, window_size, output_size)


# Sequential Train-Test Split
train_size_seq = int(len(X_seq) * 0.7)
X_train_seq, X_test_seq = X_seq[:train_size_seq], X_seq[train_size_seq:]
y_train_seq, y_test_seq = y_seq[:train_size_seq], y_seq[train_size_seq:]

# Create weights for the training set - higher weights for more recent data
# The weights increase linearly from a starting value (e.g., 1) to an end value (e.g., 10)
weights = np.linspace(1, 100, num=len(X_train_seq))


# Train the model with these weights
model = LinearRegression(copy_X = True, fit_intercept = False, n_jobs = None, positive = True)
model.fit(X_train_seq, y_train_seq, sample_weight=weights)

# Model is now trained on past temperature data to predict future temperatures
# Make predictions on the test set
y_pred_seq = model.predict(X_test_seq)

# Evaluate the model
mse_seq = mean_squared_error(y_test_seq, y_pred_seq)


# Print the evaluation results
print(f"Mean Squared Error (MSE) on Test Data: {mse_seq}")
rnse = np.sqrt(mse_seq)
print(rnse)



Mean Squared Error (MSE) on Test Data: 2.220458029929816
1.4901201394283


Random tree forest search

In [26]:
df = pd.read_csv('docs/data/GlobalWeatherRepository.csv')

# Filtering the DataFrame to keep only the specified cities
cities_to_keep = ['Tokyo', 'Madrid', 'Stockholm', 'Paris']
df_filtered = df[df['location_name'].isin(cities_to_keep)]

# Drop the 'country' column from df_filtered
df_filtered = df_filtered.drop(columns=['country', 'timezone', 'last_updated', 'condition_text', 'wind_direction', 'last_updated_epoch', 'wind_degree',
                                         'air_quality_us-epa-index', 'air_quality_gb-defra-index', 'moon_illumination','sunrise', 'sunset', 'moonset', 'moonrise', 'moon_phase'])
# Now apply one-hot encoding
df_encoded = pd.get_dummies(df_filtered, columns=['location_name'])

# Define the target variables
#targetvar = ['temperature_celsius', 'wind_kph', 'humidity', 'pressure_mb', 'precip_mm', 'cloud']
targetvar = ['temperature_celsius', 'wind_kph', 'pressure_mb', 'precip_mm']



window_size = 7  # For example, using the past 7 days to predict
target_variable = 'precip_mm'

# Split the data into features (X) and target variables (y)
#X = df_encoded.drop(columns=target_variable)
#y = df_encoded[targetvar]

# Create sliding window features
for i in range(window_size):
    df_encoded[f'{target_variable}_lag_{i+1}'] = df_encoded[target_variable].shift(i+1)

# Drop rows with NaN values caused by shifting
df_encoded = df_encoded.dropna()
# Initialize dictionaries to store models and predictions
models = {}
predictions = {}
mae_scores = {}
mse_scores = {}
example_predictions = {}

# Train a model and predict for each day ahead
for day_ahead in range(1, 4):
    # Define X and y for this day ahead
    X = df_encoded[[f'{target_variable}_lag_{i+1}' for i in range(window_size)]]
    y = df_encoded[target_variable].shift(-day_ahead)

    # Adjust X and y to have the same length
    X = X.iloc[:-day_ahead]
    y = y.iloc[:-day_ahead]

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Train the model
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Store the model and make predictions
    models[day_ahead] = rf
    pred = rf.predict(X_test)
    predictions[day_ahead] = rf.predict(X_test)

     # Calculate and store metrics
    mae_scores[day_ahead] = mean_absolute_error(y_test, pred)
    mse_scores[day_ahead] = mean_squared_error(y_test, pred)

    # Store example predictions
    example_predictions[day_ahead] = list(zip(y_test.head(5), pred[:5]))

# Print the results in an easy-to-understand format
for day_ahead in range(1, 4):
    print(f"Day {day_ahead} Ahead Forecast:")
    print(f"  Mean Absolute Error: {mae_scores[day_ahead]:.3f}")
    print(f"  Mean Squared Error: {mse_scores[day_ahead]:.3f}")
    print("  Example Predictions (Actual, Predicted):")
    for actual, predicted in example_predictions[day_ahead]:
        print(f"    Actual: {actual:.2f}, Predicted: {predicted:.2f}")
    print("\n")

Day 1 Ahead Forecast:
  Mean Absolute Error: 0.251
  Mean Squared Error: 0.381
  Example Predictions (Actual, Predicted):
    Actual: 0.08, Predicted: 0.10
    Actual: 0.04, Predicted: 0.03
    Actual: 0.00, Predicted: 0.01
    Actual: 0.00, Predicted: 0.00
    Actual: 0.00, Predicted: 0.04


Day 2 Ahead Forecast:
  Mean Absolute Error: 0.213
  Mean Squared Error: 0.205
  Example Predictions (Actual, Predicted):
    Actual: 0.08, Predicted: 0.03
    Actual: 0.00, Predicted: 0.23
    Actual: 0.08, Predicted: 0.01
    Actual: 0.00, Predicted: 0.03
    Actual: 0.00, Predicted: 0.09


Day 3 Ahead Forecast:
  Mean Absolute Error: 0.197
  Mean Squared Error: 0.195
  Example Predictions (Actual, Predicted):
    Actual: 0.00, Predicted: 0.20
    Actual: 0.00, Predicted: 0.12
    Actual: 0.08, Predicted: 0.01
    Actual: 0.03, Predicted: 0.24
    Actual: 0.00, Predicted: 0.04




In [None]:

# Usage example
y_train_seq = ...  # your y_train data
y_test_seq = ...   # your y_test data
target_variable = 'temperature_celsius'  # your target variable

model = LinearRegressionModel(y_train_seq, y_test_seq, 7, 3, target_variable)
model.build_model()
weights = np.linspace(1, 100, num=len(X_train_seq))
model.train_model(sample_weight=weights)  # weights as defined in your notebook
mse, rmse = model.evaluate()
print(f"Mean Squared Error (MSE) on Test Data: {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")



TypeError: object of type 'ellipsis' has no len()

In [None]:
mse = model.evaluate()
print(mse)

In [None]:
predictions = model.predict()
print(predictions)

In [None]:
df = pd.read_csv('docs/data/GlobalWeatherRepository.csv')

# Filtering the DataFrame to keep only the specified cities
cities_to_keep = ['Tokyo', 'Madrid', 'Stockholm', 'Paris']
df_filtered = df[df['location_name'].isin(cities_to_keep)]

# Drop the 'country' column from df_filtered
df_filtered = df_filtered.drop(columns=['country', 'timezone', 'last_updated', 'condition_text', 'wind_direction', 'last_updated_epoch', 'wind_degree',
                                         'air_quality_us-epa-index', 'air_quality_gb-defra-index', 'moon_illumination','sunrise', 'sunset', 'moonset', 'moonrise', 'moon_phase'])
# Now apply one-hot encoding
df_encoded = pd.get_dummies(df_filtered, columns=['location_name'])

# Define the target variables
#targetvar = ['temperature_celsius', 'wind_kph', 'humidity', 'pressure_mb', 'precip_mm', 'cloud']
targetvar = ['temperature_celsius', 'wind_kph', 'pressure_mb', 'precip_mm']


# Split the data into features (X) and target variables (y)
X = df_encoded.drop(columns=targetvar)
y = df_encoded[targetvar]

models = {}
mse_scores = {}
mae_scores = {}

# Train a model for each target variable
for var in targetvar:
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, df_encoded[var], test_size=0.3, random_state=42)

    # Create and train the model
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Store the model
    models[var] = rf

    # Make predictions
    predictions = rf.predict(X_test)

    # Calculate and store metrics
    mse_scores[var] = mean_squared_error(y_test, predictions)
    mae_scores[var] = mean_absolute_error(y_test, predictions)

# Print the results
for var in targetvar:
    print(f"{var} - Mean Squared Error: {mse_scores[var]}")
    print(f"{var} - Mean Absolute Error: {mae_scores[var]}")


NameError: name 'adjust_dataset' is not defined

In [None]:
param_grid = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'n_jobs': [None, -1],
    'positive': [True, False]
}

#Create the grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5)

#Fit the grid search to the data
grid_search.fit(X_train_seq, y_train_seq)

#Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

#Get the best model
best_model = grid_search.best_estimator_

#Evaluate the model on the test set
y_pred = best_model.predict(X_test_seq)
mse = mean_squared_error(y_test_seq, y_pred)
print("Mean Squared Error on Test Set:", mse)

Best Hyperparameters: {'copy_X': True, 'fit_intercept': False, 'n_jobs': None, 'positive': True}
Mean Squared Error on Test Set: 8.082073973796033
