In [19]:
# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scratchai.preproccessing import one_hot, split_data, StandardScaler
from scratchai.linear_models import LinearRegression
from scratchai.cart import DecisionTreeRegressor
from scratchai.plotting import plot_generalization_curve
from scratchai.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Load the data.
# The path uses a forward slash: "\w" is not a valid string escape sequence, and "/"
# is accepted as a path separator on Windows as well as on POSIX systems.
weather_data_raw = pd.read_csv("data/weatherHistory.csv")
weather_data_raw.head()

In [None]:
# Replace the CSV headers with short, identifier-friendly names.
# The assignment is positional, so the list must hold one name per column, in file order.
renamed_columns = [
    'Date', 'Sum', 'Precip', 'Temp', 'Apparent_Temp', 'Humidity',
    'Wind_Speed', 'Wind_Bearing', 'Visibility', 'Cloud_Cover', 'Pressure', 'Daily_Sum',
]
weather_data_raw.columns = renamed_columns

weather_data_raw.head()

In [None]:
# Keep only the columns used in the rest of the notebook; every other column is dropped.
kept_columns = [
    'Precip',
    'Temp',
    'Apparent_Temp',
    'Humidity',
    'Wind_Speed',
    'Visibility',
    'Pressure',
]
weather_data = weather_data_raw.loc[:, kept_columns]
weather_data.head()

In [None]:
# Inspect data quality before cleaning.

# Per-column count of missing (NaN) entries.
missing_vals = weather_data.isna().sum()
print(f"Number of missing values: \n{missing_vals}")

# Number of rows that exactly repeat an earlier row.
duplc_rows = weather_data.duplicated().sum()
print(f"Number of duplicate rows: {duplc_rows}")

In [5]:
# Remove rows with missing values first, then collapse exact duplicate rows.
rows_without_missing = weather_data.dropna()
weather_data = rows_without_missing.drop_duplicates()

In [None]:
# Basic exploration: pairwise correlation between the numeric columns
# (numeric_only skips non-numeric columns such as Precip).
weather_data.corr(numeric_only = True)

In [None]:
# Summary statistics (pandas describe) of the cleaned frame.
weather_data.describe()

In [None]:
# Draw one histogram per numeric column, each in its own figure.
histogram_columns = ['Temp', 'Apparent_Temp', 'Humidity', 'Wind_Speed', 'Visibility', 'Pressure']

for column in histogram_columns:
    fig, ax = plt.subplots()
    ax.hist(weather_data[column])
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Tally how often each Precip value occurs, listed in sorted label order.
precip_counts = weather_data['Precip'].value_counts().sort_index()
for val, count in precip_counts.items():
    print(f"{val}: {count}")

In [None]:
# Pair plot of the first 5000 rows, coloured by precipitation type.
# NOTE(review): presumably limited to 5000 rows to keep plotting time manageable.
pairplot_sample = weather_data.head(5000)
sns.pairplot(pairplot_sample, hue='Precip', corner=True)
plt.show()

In [8]:
# Handle outliers in the data.
#
# Rows whose Pressure reading is below 900 are removed outright rather than clipped;
# the remaining extremes are capped to fixed [lower, upper] bounds per column.
#
# The filter and the clipping are chained through .assign(), which returns a new
# frame. Assigning with .loc into a boolean-filtered frame is the pattern that
# raises pandas' SettingWithCopyWarning, so it is avoided here.
weather_data = (
    weather_data
    .loc[weather_data['Pressure'] >= 900]
    .assign(
        Temp=lambda df: df['Temp'].clip(lower=-10, upper=40),
        Humidity=lambda df: df['Humidity'].clip(lower=0.2, upper=1),
        Wind_Speed=lambda df: df['Wind_Speed'].clip(lower=0, upper=35),
        Pressure=lambda df: df['Pressure'].clip(lower=990, upper=1050),
    )
)

In [None]:
# Model inputs and the regression target.
label = 'Temp'
input_features = [
    'Humidity',
    'Wind_Speed',
    'Visibility',
    'Pressure',
    'Precip',
]

# Restrict the frame to the model inputs followed by the target column.
processed_data = weather_data[[*input_features, label]]
processed_data.head()

In [None]:
# One-hot encode the categorical Precip column.
categorical_columns = ['Precip']
processed_data = one_hot(processed_data, columns=categorical_columns)
processed_data.head()

In [None]:
# Scale the numeric input features; Precip is excluded because it was one-hot encoded.
# NOTE(review): scaling runs on the full frame before the train/validation/test split.
# If transform() also fits the scaler's statistics, they include validation and test
# rows. Confirm against scratchai's StandardScaler whether that is intended.
numeric_features = [name for name in input_features if name != 'Precip']

scaler = StandardScaler()
processed_data = scaler.transform(processed_data, columns=numeric_features)
processed_data.head()

In [12]:
# Split the processed data for the linear model.
# NOTE(review): presumably split_data returns (first_part, remainder), with the second
# argument being the first part's fraction. TODO confirm against scratchai.
train_data, test_data = split_data(processed_data, 0.7)
linear_reg_test, valid_data = split_data(test_data, 0.5)

# Separate the feature matrix from the target column for the training and validation parts.
X_train = train_data.drop(columns=label).values
y_train = train_data[label].values
X_valid = valid_data.drop(columns=label).values
y_valid = valid_data[label].values

In [13]:
# Build the linear regression model and train it, passing the validation split
# alongside the training split.
linear_regressor = LinearRegression()
linear_regressor.fit(
    X_train,
    y_train,
    # NOTE(review): positional hyper-parameters. Their meaning is defined by the
    # signature of scratchai's LinearRegression.fit. TODO confirm and pass by keyword.
    0.0001,
    50,
    32,
    1,
    X_valid,
    y_valid,
)

In [None]:
# Plot the generalization curve from the recorded training and validation losses
# to make sure the model didn't overfit the data.
# NOTE(review): `training_lossses` (triple "s") is kept as written. Presumably it
# matches the attribute name defined in scratchai; confirm before renaming.
plot_generalization_curve(
    linear_regressor.training_lossses,
    linear_regressor.validation_losses,
    linear_regressor.training_epochs,
)

In [None]:
# evaluate the linear regression model
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        y_true: ground-truth target values.
        y_pred: predicted target values.

    Returns:
        The square root of mean_squared_error(y_true, y_pred).
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def evaluate_model(y_pred, y_true, model_name = ""):
    """Print the root mean squared error and mean absolute error of predictions.

    Note the argument order: predictions come first, ground truth second.

    Args:
        y_pred: predicted target values.
        y_true: ground-truth target values.
        model_name: label printed in front of each metric line.

    Returns:
        None. One line per metric is printed, formatted to two decimals.
    """
    metrics = (
        ('Root mean squared error', rmse),
        ('Mean absolute error', mean_absolute_error),
    )
    for metric, func in metrics:
        print(f"{model_name} {metric}: {func(y_true, y_pred):.2f}")
        
# Score the linear model on its validation split.
y_pred = linear_regressor.predict(X_valid)
evaluate_model(y_pred, y_valid, model_name='Linear Regression')

In [16]:
# Prepare data for the decision tree. It is taken from weather_data directly, i.e.
# without the one-hot encoding and scaling that were applied to processed_data.
# NOTE(review): Precip therefore reaches the tree un-encoded. Presumably scratchai's
# DecisionTreeRegressor handles that column type; TODO confirm.
tree_data = weather_data[[*input_features, label]]

# Split the tree data (the fractions differ from those used for the linear model).
train_data, test_data = split_data(tree_data, 0.6)
tree_test, valid_data = split_data(test_data, 0.5)

# Feature matrices and target vectors for the training and validation parts.
X_train = train_data.loc[:, input_features].values
y_train = train_data[label].values
X_valid = valid_data.loc[:, input_features].values
y_valid = valid_data[label].values

In [None]:
# Build the decision tree regressor and fit it on the tree training split.
tree_params = {'min_samples_split': 500, 'max_depth': 25}
model = DecisionTreeRegressor(**tree_params)
model.fit(X_train, y_train)

In [None]:
# Score the decision tree on its validation split.
y_pred = model.predict(X_valid)
evaluate_model(y_pred, y_valid, model_name='Decision Tree')

In [None]:
# Evaluate both models on their held-out test data.
# Each model is scored on the test part of its own split (linear_reg_test / tree_test),
# so the two test sets are not the same rows.
test_runs = [
    (
        'Linear Regression',
        linear_regressor,
        linear_reg_test.drop(columns=label).values,
        linear_reg_test[label].values,
    ),
    (
        'Decision Tree',
        model,
        tree_test[input_features].values,
        tree_test[label].values,
    ),
]

for run_name, fitted_model, X_test, y_test in test_runs:
    y_pred = fitted_model.predict(X_test)
    evaluate_model(y_pred, y_test, run_name)