In [14]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset
file_path = r'D:\00. MSc Data Analytic\CIND820\Data\Crop data\Final_Data.xlsx'
crop_data = pd.read_excel(file_path, sheet_name='Crop')
weather_data = pd.read_excel(file_path, sheet_name='Weather')

# Remove unneeded columns
columns_to_keep = [
    'County', 'Year', 'Month', 'Day',
    'Max Temp (°C)', 'Min Temp (°C)', 'Mean Temp (°C)',
    'Heat Deg Days (°C)', 'Total Rain (mm)', 'Total Snow (cm)',
    'Total Precip (mm)', 'Snow on Grnd (cm)', 'Spd of Max Gust (km/h)'
]
weather_data = weather_data[columns_to_keep]

# Remove rows where 'Max Temp (°C)', 'Min Temp (°C)', and 'Mean Temp (°C)' are all null
weather_data = weather_data.dropna(subset=['Max Temp (°C)', 'Min Temp (°C)', 'Mean Temp (°C)'], how='all')

# Perform regression imputation
imputer_columns = ['Max Temp (°C)', 'Min Temp (°C)', 'Mean Temp (°C)', 'Heat Deg Days (°C)', 'Total Rain (mm)', 'Total Precip (mm)', 'Spd of Max Gust (km/h)']
imputer = IterativeImputer(estimator=LinearRegression(), random_state=0, max_iter=200)
imputed_data = imputer.fit_transform(weather_data[imputer_columns])
imputed_df = pd.DataFrame(imputed_data, columns=imputer_columns)
imputed_weather_data = pd.concat([weather_data[['County', 'Year', 'Month', 'Day', 'Total Snow (cm)', 'Snow on Grnd (cm)']], imputed_df], axis=1)

# Replace negative values with 0 in specified columns
imputed_weather_data[['Total Rain (mm)', 'Total Precip (mm)', 'Spd of Max Gust (km/h)']] = imputed_weather_data[['Total Rain (mm)', 'Total Precip (mm)', 'Spd of Max Gust (km/h)']].clip(lower=0)

In [15]:
# Remove rows with outliers in specified columns
def remove_outliers(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return (column >= lower_bound) & (column <= upper_bound)

columns_to_filter = ['Spd of Max Gust (km/h)', 'Total Precip (mm)', 'Total Rain (mm)', 'Total Snow (cm)', 'Snow on Grnd (cm)']
for col in columns_to_filter:
    imputed_weather_data = imputed_weather_data[remove_outliers(imputed_weather_data[col])]

In [16]:
# Group the data by County and Year and aggregate the data
grouped_df = imputed_weather_data.groupby(['County', 'Year']).agg({
    'Max Temp (°C)': ['mean', 'max', 'min', 'std'],
    'Min Temp (°C)': ['mean', 'max', 'min', 'std'],
    'Mean Temp (°C)': ['mean', 'max', 'min', 'std'],
    'Heat Deg Days (°C)': ['mean', 'max', 'min', 'std'],
    'Total Rain (mm)': ['mean', 'max', 'min', 'std'],
    'Total Snow (cm)': ['mean', 'max', 'min', 'std'],
    'Total Precip (mm)': ['mean', 'max', 'min', 'std'],
    'Snow on Grnd (cm)': ['mean', 'max', 'min', 'std'],
    'Spd of Max Gust (km/h)': ['mean', 'max', 'min', 'std'],
    })

# Flatten the resulting column names by joining the multi-index columns
grouped_df.columns = ['_'.join(col).strip() for col in grouped_df.columns.values]
grouped_df = grouped_df.reset_index()

In [17]:
# Merge the DataFrames on the "County" and "Year" columns, then clear all record with no Production
merged_df = pd.merge(grouped_df, crop_data, on=["County", "Year"])
merged_df["Yield (kg/Acres seeded)"] = merged_df.apply(lambda row: (row["Sum of Production ('000 tonnes)"] * 1000) / row["Sum of Acres seeded"] if row["Sum of Acres seeded"] != 0 else 0, axis=1)
merged_df = merged_df.drop(merged_df[merged_df["Yield (kg/Acres seeded)"] == 0].index)
merged_df = merged_df.drop(["Sum of Production ('000 tonnes)", "Sum of Acres seeded"], axis=1)
merged_df = merged_df.dropna()

In [18]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import VotingRegressor


# Remove 'County', 'Product', 'Year' columns
df = merged_df.drop(['County', 'Product', 'Year'], axis=1)

# Define the attribute sets
set1 = ['Max Temp (°C)_mean', 'Max Temp (°C)_min', 'Min Temp (°C)_max', 'Spd of Max Gust (km/h)_max', 'Spd of Max Gust (km/h)_std']
set2 = ['Min Temp (°C)_mean', 'Mean Temp (°C)_std', 'Total Rain (mm)_min', 'Total Snow (cm)_max', 'Total Snow (cm)_min', 'Total Precip (mm)_min', 'Snow on Grnd (cm)_max', 'Snow on Grnd (cm)_min', 'Spd of Max Gust (km/h)_min']
set3 = set1 + set2

def evaluate_ensemble(attributes, model1, model2):
    X = df[attributes]
    y = df['Yield (kg/Acres seeded)']

    ensemble = VotingRegressor([('model1', model1), ('model2', model2)], n_jobs=-1)
    ensemble_scores = cross_val_score(ensemble, X, y, cv=KFold(n_splits=10), scoring='neg_mean_squared_error')

    return -ensemble_scores.mean()

# Decision Tree model of Set 2
dt_set2 = DecisionTreeRegressor()

# SVM model of Set 3 with specified parameters
svm_set3 = SVR(C=10, gamma='auto', kernel='rbf')

# Evaluate ensemble of Set 2 Decision Tree and Set 3 SVM
ensemble_mse = evaluate_ensemble(set2, dt_set2, svm_set3)
print(f"Ensemble of Set 2 Decision Tree and Set 3 SVM - Mean Squared Error: {ensemble_mse}")


Ensemble of Set 2 Decision Tree and Set 3 SVM - Mean Squared Error: 17333.503876383373
