In [1]:
import pandas as pd

In [2]:
# Load the merged weather CSV; its "datetime" column becomes the row index.
df = pd.read_csv(
    "full_year-may23-may24.csv",
    index_col="datetime",
)

Merge of the two CSV datasets plus a new one, extending the start of the date range back from November 1st 2023 to August 1st 2023, so the combined data covers August 1st 2023 - May 15th 2024.
Source: https://www.visualcrossing.com/weather/weather-data-services#

In [3]:
# Trim stray leading/trailing whitespace from the city names.
df['name'] = df['name'].str.strip()
# Fraction of missing values in each column (mean of the boolean null mask).
null_pct = df.isna().mean()
# Columns with fewer than 5% missing values are the ones worth keeping.
valid_columns = df.columns[null_pct.lt(.05)]
# Work on an independent copy restricted to those columns.
df = df.loc[:, valid_columns].copy()

In [4]:
# Fix the set (and order) of columns used by the rest of the notebook.
selected_columns = [
    'name',
    'tempmax', 'tempmin', 'temp',
    'feelslikemax', 'feelslikemin', 'feelslike',
    'dew', 'humidity',
    'precip', 'precipprob', 'precipcover',
    'snow', 'snowdepth',
    'windgust', 'windspeed', 'winddir',
    'sealevelpressure', 'cloudcover', 'visibility',
    'sunrise', 'sunset', 'moonphase',
    'conditions', 'description',
]
df = df[selected_columns]

In [5]:
# From here on the city column is called "location" instead of "name".
df = df.rename(columns={'name': 'location'})

In [6]:
# One DataFrame per city, selected by exact match on the "location" column.
beijing_city = df[df["location"] == "Beijing"]
berlin_city = df[df["location"] == "Berlin"]
# NOTE: the capitalised variable name is kept so existing references keep working.
London_city = df[df["location"] == "London"]
mexico_city = df[df["location"] == "Mexico City"]
moscow_city = df[df["location"] == "Moscow"]
# The filter used to match only the misspelling "Ottowa", which silently yields
# an empty frame if the CSV uses the correct spelling. Accept both spellings.
# TODO: confirm which spelling the dataset actually contains and keep only that one.
ottawa_city = df[df["location"].isin(["Ottawa", "Ottowa"])]
paris_city = df[df["location"] == "Paris"]
rome_city = df[df["location"] == "Rome"]
washington_city = df[df["location"] == "Washington DC"]

The workflow:
1) Understand the problem -- find next day's high/low temp; I have data on humidity, temperature, wind speed, etc
2) Identify Key Factors -- What are the independent variables (hint #1) and my dependent variables
3) Hypothesis -- _ significantly impacts 
4) Translate Hypo => Testable Statements -- Null/Alternate Hypothesis (this is an example, I had to do this in school, maybe it will help maybe not)
5) Design the Experiment -- Data collection/cleaning, model selection, evaluation metrics (Research here)
6) Analysis -- train the models, evaluate results (expected temp from your model vs actual temp), testing
7) Success 

- What determines the next day's high and low temperature for each location?
- What do you see from the data at first glance that might impact this?
- How would you need to organize and clean your data?
- Is your model 'universal', meaning does it apply to every location the same (no variance) when it comes to these predictions?
- If not, what factors impact your model?
- What tools or metrics will you need to use to account for these changes?
- Is your model accurate? What causes the inaccuracy, if any?

4) Translate Hypo => Testable Statements -- Null/Alternate Hypothesis (this is an example, I had to do this in school, maybe it will help maybe not)

In [7]:
# Columns fed to the numerical analysis: everything selected earlier except
# location, sunrise, sunset, conditions and description.
numeric_columns = [
    'tempmax', 'tempmin', 'temp',
    'feelslikemax', 'feelslikemin', 'feelslike',
    'dew', 'humidity',
    'precip', 'precipprob', 'precipcover',
    'snow', 'snowdepth',
    'windgust', 'windspeed', 'winddir',
    'sealevelpressure', 'cloudcover', 'visibility',
    'moonphase',
]
df_numerical = df[numeric_columns]

5) Design the Experiment -- Data collection/cleaning, model selection, evaluation metrics (Research here)

- I used a heatmap to visualize the correlation between dew point and temperature

- Check Regression assumptions
- Separate dataframes per location
- R2 and Adjusted R2
  - sns.heatmap Pearson correlation test
- Feature selection
- Hypothesis -- significantly impacts
- Cross validation (Model selection)
- Training and evaluation

The true purpose of regression is to separate the total variance in lowtemp and hightemp into the variance explained by dew and the variance that is still unexplained.

In [8]:
# Rule of thumb for reading the variance inflation factor (VIF):
#   VIF < 5        -> low multicollinearity.
#   5 <= VIF <= 10 -> moderate multicollinearity; investigate further.
#   VIF > 10       -> high multicollinearity; consider removing or combining features.

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

features = ['dew', 'humidity', 'precip', 'precipcover', 'windgust', 'cloudcover', 'visibility']

# Design matrix: the candidate predictors plus an intercept ("const") column.
X = add_constant(df_numerical[features])

# One VIF per design-matrix column (the intercept included), assembled in a
# single constructor call rather than column by column.
design_values = X.values
vif_data = pd.DataFrame({
    'feature': X.columns,
    'VIF': [
        variance_inflation_factor(design_values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})

print(vif_data)

       feature        VIF
0        const  39.538595
1          dew   1.112560
2     humidity   1.833689
3       precip   1.379022
4  precipcover   1.970144
5     windgust   1.143521
6   cloudcover   1.620783
7   visibility   1.026475


In [10]:
# VIF test approved
# R-squared: 0.962 and adj. R-squared: 0.961
# NOTE(review): the cell that produced these R-squared values is not visible
# here -- confirm where they come from.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer

# Fixed seed so the tree ensembles give the same scores on every run.
RANDOM_STATE = 42

features = ['dew', 'humidity', 'precip', 'precipcover', 'windgust', 'cloudcover', 'visibility']
target = 'tempmax'  # column the models are asked to predict

# Predictors and target taken from the numerical frame.
X = df_numerical[features]
y = df_numerical[target]

# Candidate models, all at default hyperparameters. The two tree ensembles are
# stochastic, so they are seeded explicitly.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'Support Vector Regression': SVR()
}

# greater_is_better=False makes the scorer return the negated MSE, so a higher
# score still means a better model; the sign is flipped back when printing.
scoring = make_scorer(mean_squared_error, greater_is_better=False)

# Cross-validate each model
results = {}
for model_name, model in models.items():
    # The scaler sits inside the pipeline, so it is refit on the training part
    # of every fold.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', model)
    ])
    # NOTE(review): an integer cv gives contiguous, unshuffled folds for
    # regressors. If the rows are grouped by location, each fold may be
    # dominated by different cities -- confirm this is intended.
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring=scoring)
    results[model_name] = cv_scores

# Print cross-validation results
for model_name, cv_scores in results.items():
    print(f'{model_name}: Mean MSE = {-cv_scores.mean()}, Std = {cv_scores.std()}')

# Based on the results, choose the best model and perform hyperparameter tuning if necessary


Linear Regression: Mean MSE = 4.341556434273872, Std = 1.0187095568102174
Ridge Regression: Mean MSE = 4.341482332808695, Std = 1.0204625826681022
Lasso Regression: Mean MSE = 7.719589862017342, Std = 2.98075859035973
Random Forest: Mean MSE = 8.913726578641015, Std = 8.591948432174004
Gradient Boosting: Mean MSE = 6.128112089400853, Std = 3.8382074902700443
Support Vector Regression: Mean MSE = 10.061998265957772, Std = 6.018005395235186


- Positive coefficients for dew, precip, precipcover suggest that higher values of these features are associated with a higher tempmax.
- Negative coefficients for humidity, windgust, cloudcover, visibility suggest that higher values of these features are associated with a lower tempmax.

# Hyperparameter tuning to optimize Ridge