In [5]:
import pandas as pd
from operations.adjust_dataset import adjust_dataset
from operations.lin import LinearRegressionModel
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import GridSearchCV
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [8]:
df = pd.read_csv('docs/data/GlobalWeatherRepository.csv')

df = adjust_dataset(df, 'Kabul')
window_size = 7
output_size = 3
# Create lagged features for temperature_celsius
for lag in range(1, window_size + 1):  # Create lags up to the window size
    df[f'temp_lag_{lag}'] = df['temperature_celsius'].shift(lag)

# Function to create sequences for a single feature (temperature_celsius)
def create_temp_sequences(data, window_size, output_size):
    X, y = [], []
    for i in range(len(data) - window_size - output_size + 1):
        X.append(data[i:(i + window_size)])
        y.append(data[i + window_size:i + window_size + output_size])
    return np.array(X), np.array(y)

# Create sequences using only temperature_celsius
temp_data = df['temperature_celsius']
X_seq, y_seq = create_temp_sequences(temp_data, window_size, output_size)


# Sequential Train-Test Split
train_size_seq = int(len(X_seq) * 0.7)
X_train_seq, X_test_seq = X_seq[:train_size_seq], X_seq[train_size_seq:]
y_train_seq, y_test_seq = y_seq[:train_size_seq], y_seq[train_size_seq:]

# Create weights for the training set - higher weights for more recent data
# The weights increase linearly from a starting value (e.g., 1) to an end value (e.g., 10)
weights = np.linspace(1, 100, num=len(X_train_seq))


# Train the model with these weights
model = LinearRegression(copy_X = True, fit_intercept = False, n_jobs = None, positive = True)
model.fit(X_train_seq, y_train_seq, sample_weight=weights)

# Model is now trained on past temperature data to predict future temperatures
# Make predictions on the test set
y_pred_seq = model.predict(X_test_seq)

# Evaluate the model
mse_seq = mean_squared_error(y_test_seq, y_pred_seq)


# Print the evaluation results
print(f"Mean Squared Error (MSE) on Test Data: {mse_seq}")
rnse = np.sqrt(mse_seq)
print(rnse)



Mean Squared Error (MSE) on Test Data: 2.4209939056897043
1.555954339204626


Random tree forest search

In [12]:
df = pd.read_csv('GlobalWeatherRepository.csv')
targetvar = ['temperature_celsius', 'wind_kph', 'humidity', 'pressure_mb', 'precip_mm', 'cloud']

# 3. Split the data into features (X) and target variables (y)
X = df.drop(columns=targetvar)
y = df[targetvar]

# 4. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 5. Create and train the Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)  # You can tune these hyperparameters
rf.fit(X_train, y_train)

# 6. Make predictions and evaluate the model
predictions = rf.predict(X_test)

# Evaluate the performance
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")



               country     location_name  latitude  longitude  \
0          Afghanistan             Kabul     34.52      69.18   
1              Albania            Tirana     41.33      19.82   
2              Algeria           Algiers     36.76       3.05   
3              Andorra  Andorra La Vella     42.50       1.52   
4               Angola            Luanda     -8.84      13.23   
5  Antigua and Barbuda      Saint John's     17.12     -61.85   
6            Argentina      Buenos Aires    -34.59     -58.67   
7              Armenia           Yerevan     40.18      44.51   
8            Australia          Canberra    -35.28     149.22   
9              Austria            Vienna     48.20      16.37   

                         timezone  last_updated_epoch      last_updated  \
0                      Asia/Kabul          1693301400  2023-08-29 14:00   
1                   Europe/Tirane          1693301400  2023-08-29 11:30   
2                  Africa/Algiers          1693301400  2023

In [7]:

# Usage example
y_train_seq = ...  # your y_train data
y_test_seq = ...   # your y_test data
target_variable = 'temperature_celsius'  # your target variable

model = LinearRegressionModel(y_train_seq, y_test_seq, 7, 3, target_variable)
model.build_model()
weights = np.linspace(1, 100, num=len(X_train_seq))
model.train_model(sample_weight=weights)  # weights as defined in your notebook
mse, rmse = model.evaluate()
print(f"Mean Squared Error (MSE) on Test Data: {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")



TypeError: object of type 'ellipsis' has no len()

In [None]:
mse = model.evaluate()
print(mse)

In [None]:
predictions = model.predict()
print(predictions)

NameError: name 'adjust_dataset' is not defined

In [None]:
param_grid = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'n_jobs': [None, -1],
    'positive': [True, False]
}

#Create the grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5)

#Fit the grid search to the data
grid_search.fit(X_train_seq, y_train_seq)

#Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

#Get the best model
best_model = grid_search.best_estimator_

#Evaluate the model on the test set
y_pred = best_model.predict(X_test_seq)
mse = mean_squared_error(y_test_seq, y_pred)
print("Mean Squared Error on Test Set:", mse)

Best Hyperparameters: {'copy_X': True, 'fit_intercept': False, 'n_jobs': None, 'positive': True}
Mean Squared Error on Test Set: 8.082073973796033
