In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  # For regression problems
from sklearn.ensemble import RandomForestClassifier  # For classification problems
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [19]:
# Reading the CSV file and skipping bad lines
df = pd.read_csv("aug31.csv")  # Adjust 'sep' if a different delimiter is used

# Display the first few rows
df.head()


Unnamed: 0,YEAR,DOY,RH2M,T2M
0,1985,1,49.56,19.18
1,1985,2,65.06,18.37
2,1985,3,61.88,19.28
3,1985,4,59.75,19.23
4,1985,5,58.88,18.62


In [20]:
# Define features (X) and target (y)
X = df[['YEAR', 'DOY', 'RH2M']]  # Features
y = df['T2M']  # Target variable

In [21]:
# Handle missing values if any
X.fillna(X.mean(), inplace=True)
y.fillna(y.mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.mean(), inplace=True)


In [22]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
# Initialize the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)


In [24]:
# Train the model
model.fit(X_train, y_train)

In [25]:
# Make predictions
y_pred = model.predict(X_test)

In [26]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 136.36106974257063


In [27]:
def predict_next_30_days(model, start_doy, rh2m_value, year=2024):
  

    # Generate DOY values for the next 30 days
    future_doy = np.arange(start_doy, start_doy + 30)

    predictions = []

    # Iterate through each DOY to make predictions
    for doy in future_doy:
        # Create the input data for prediction
        input_data = pd.DataFrame({
            'YEAR': [year],
            'DOY': [doy],
            'RH2M': [rh2m_value]
        })
        
        # Predict the T2M for the given day
        predicted_t2m = model.predict(input_data)[0]
        
        # Append the result to predictions
        predictions.append((doy, predicted_t2m))
    
    return predictions


In [31]:
# Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE scores to positive for interpretation
cv_scores = -cv_scores

# Calculate mean and standard deviation of the scores
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

# Display results
print("Cross-Validation Scores (MSE):", cv_scores)
print(f"Mean CV Score (MSE): {mean_cv_score:.2f}")
print(f"Standard Deviation of CV Scores: {std_cv_score:.2f}")

Cross-Validation Scores (MSE): [   2.07403625    2.52186035    2.10312591    2.32732565 1104.97606127]
Mean CV Score (MSE): 222.80
Standard Deviation of CV Scores: 441.09


In [32]:
# Call the function with the trained model and desired parameters
start_doy = 245
rh2m_value = 65  # Example relative humidity
predictions = predict_next_30_days(model, start_doy, rh2m_value)

# Display the predictions in a table-like format
print(f"{'DOY':<5} {'Predicted T2M':<15}")
print("=" * 20)
for doy, t2m in predictions:
    print(f"{doy:<5} {t2m:<15.2f}")

DOY   Predicted T2M  
245   31.22          
246   31.22          
247   31.22          
248   31.12          
249   31.05          
250   31.05          
251   31.05          
252   31.05          
253   31.03          
254   31.03          
255   31.01          
256   31.01          
257   31.01          
258   30.98          
259   30.98          
260   30.95          
261   30.92          
262   30.92          
263   30.85          
264   30.82          
265   30.76          
266   30.74          
267   30.05          
268   29.48          
269   29.16          
270   29.09          
271   29.08          
272   29.08          
273   29.07          
274   28.81          


Cross-Validation Scores (MSE): [   2.07403625    2.52186035    2.10312591    2.32732565 1104.97606127]
Mean CV Score (MSE): 222.80
Standard Deviation of CV Scores: 441.09
