In [None]:
import Data_Editing_Helpers as DEH
import Classifier as CLS
import Regressor as RGS
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, r2_score
from sklearn.ensemble import StackingRegressor, StackingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.model_selection import cross_val_score

In [None]:
## Loading ##
# Raw competition frames: `train` carries the target column, `test` is the
# frame that predictions are eventually generated for.
train = pd.read_csv("Data/train.csv")
test = pd.read_csv("Data/test.csv")

In [None]:
y_name = 'rainfall' # Target column the models are trained to predict
x_name = 'id' # Row identifier column: handed to DEH.dropUnusedColumns during wrangling and re-read from the raw CSV to key the submission file

# Model-family switch: True trains the RGS regressors and scores them with r2_score;
# False trains the CLS classifiers and scores them with accuracy_score.
is_regression = True

In [None]:
# Display basic info about datasets.
# DataFrame.info() prints its report directly and returns None, so it is called
# for its printed output only; storing the result would just hold None and add
# two meaningless `None` entries to the displayed tuple below.
train.info()
test.info()

# First few rows of each frame
train_head = train.head()
test_head = test.head()

# Per-column count of missing values
missing_values_train = train.isnull().sum()
missing_values_test = test.isnull().sum()

# Summary statistics of the numeric columns
train_description = train.describe()
test_description = test.describe()

# Last expression of the cell: rendered as the cell output
train_head, test_head, missing_values_train, missing_values_test, train_description, test_description


In [None]:
## Wrangling ##
# Cleaning is delegated to the project-local Data_Editing_Helpers module (DEH).
# The helper bodies are not visible from this notebook, so the per-step notes
# below are assumptions to confirm against DEH, not guarantees.
# Future implementation will remove map_seasons and convert_strings_to_ascii
# NOTE(review): presumably encodes a season-like column in both frames — confirm in DEH.
train, test = DEH.map_seasons(train, test)

# Disabled string-to-ASCII conversion, kept for reference until it is removed:
#train = DEH.convert_strings_to_ascii(train)
#test = DEH.convert_strings_to_ascii(test)
# NOTE(review): presumably drops the x_name (id) column from both frames while keeping y_name in train — confirm.
train, test = DEH.dropUnusedColumns(train, test, y_name, x_name)
# NOTE(review): presumably removes training rows whose y_name value is blank — confirm. Only train is passed, so test is not touched here.
train = DEH.remove_blank_rows(train, y_name)
# Fill missing values in both frames; the third argument (0) is presumably the fill value — verify in DEH.
train, test = DEH.fill_NA(train, test, 0)

In [None]:
from sklearn.preprocessing import StandardScaler

# Derive the same engineered columns on both frames (train first, then test).
for frame in (train, test):
    # Wind direction is converted from degrees to radians and encoded as a
    # sin/cos pair so the angle is represented on the unit circle.
    wind_radians = np.deg2rad(frame['winddirection'])
    frame['winddirection_sin'] = np.sin(wind_radians)
    frame['winddirection_cos'] = np.cos(wind_radians)

    # Spread between the maximum and minimum temperature columns.
    frame['temperature_range'] = frame['maxtemp'] - frame['mintemp']

# Columns to standardise ('temparature' is spelled as it appears in the dataset).
numerical_features = ['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed', 'temperature_range', 'winddirection_sin', 'winddirection_cos']

# Learn mean/variance from the training frame only, then apply the same
# transformation to both frames so no statistics come from the test data.
scaler = StandardScaler()
scaler.fit(train[numerical_features])
train[numerical_features] = scaler.transform(train[numerical_features])
test[numerical_features] = scaler.transform(test[numerical_features])

In [None]:
## Visualizing ##
# Plot the full feature set first; this emits every graph, which can be noisy.
DEH.makeSNS(train)

# Temporary column drop for the current dataset while stacking is being tested.
columns_to_drop = ['day', 'pressure', 'maxtemp', 'temparature', 'mintemp', 'winddirection', 'dewpoint']
test = test.drop(columns=columns_to_drop)
train = train.drop(columns=columns_to_drop)

# Plot again so the reduced feature set can be compared with the plots above.
DEH.makeSNS(train)


In [None]:
# Using 99 as a filler for NA's. Will change to use Random Forest for filling NA's 
# NOTE(review): the wrangling cell already calls DEH.fill_NA(train, test, 0), so
# unless a later step introduces new missing values this second fill is probably
# a no-op and 0 (not 99) is the effective filler — confirm which value is intended.
train, test = DEH.fill_NA(train, test, fill=99)
# Split the labelled training frame into feature/target pairs. Only `train` is
# passed in, so X_test / y_test are a held-out part of the training data, not the
# competition test frame. Split ratio and seed live inside DEH — TODO confirm.
X_train, X_test, y_train, y_test = DEH.traintestslpit(train, y_name)

In [None]:


def evaluate_model(model_func, X_train, y_train, X_test, y_test, model_name, results, is_regression):
    """Build one model, score it on the held-out split, and record the outcome.

    Args:
        model_func: Callable invoked as ``model_func(X_train, y_train)``; it must
            return an object exposing ``predict``.
        X_train, y_train: Data handed to ``model_func``.
        X_test, y_test: Held-out features and targets used for scoring.
        model_name: Label stored with the result and used in the printed line.
        results: List mutated in place; receives one dict with the keys
            ``"model"``, ``"score"`` and ``"model_obj"``.
        is_regression: Truthy selects ``r2_score``; falsy selects ``accuracy_score``.

    Returns:
        None. The score (metric value multiplied by 100) is appended to
        ``results`` and printed.
    """
    fitted = model_func(X_train, y_train)
    held_out_predictions = fitted.predict(X_test)

    # Pick the metric once instead of branching around two near-identical calls.
    metric = r2_score if is_regression else accuracy_score
    score = metric(y_test, held_out_predictions) * 100

    results.append(dict(model=model_name, score=score, model_obj=fitted))
    print(f"{model_name} score: {score:.3f}")

## Training Models ##
results = []

# Each spec is (builder attribute on the helper module, label used in results).
# The list order is the order in which the models are trained and recorded.
if is_regression:
    helper_module = RGS
    model_specs = [
        ("decisiontreeRegressor", "Decision Tree Regressor"),
        ("linearRegressor", "Linear Regressor"),
        ("ridgeRegressor", "Ridge Regressor"),
        ("lassoRegressor", "Lasso Regressor"),
        ("randomForestRegressor", "Random Forest Regressor"),
        ("gradientBoostingRegressor", "Gradient Boosting Regressor"),
        ("catBoostRegressor", "Cat Boost Regressor"),
        ("knnRegressor", "KNN Regressor"),
        ("xgBoostRegressor", "XGBoost Regressor"),
    ]
else:
    helper_module = CLS
    model_specs = [
        ("decisiontreeClassifier", "Decision Tree Classifier"),
        ("knnClassifier", "KNN Classifier"),
        ("adaboostClassifier", "AdaBoost Classifier"),
    ]

for builder_name, label in model_specs:
    # The builder is looked up right before it is used, so a missing helper
    # only fails once every earlier model has already been trained.
    evaluate_model(getattr(helper_module, builder_name), X_train, y_train, X_test, y_test, label, results, is_regression)



In [None]:
# Rank the base models by 5-fold cross-validated score on the training split.
# The scorer must match the task: R^2 for regressors, accuracy for classifiers.
# A hard-coded 'r2' would rank classifiers by a regression metric that disagrees
# with the accuracy score used everywhere else in this notebook.
cv_scoring = 'r2' if is_regression else 'accuracy'
for entry in results:
    # cross_val_score clones the estimator per fold, so the already-fitted
    # model object is only used as a template here.
    entry['cv_score'] = cross_val_score(entry['model_obj'], X_train, y_train, cv=5, scoring=cv_scoring).mean()

# Keep the three best base models (highest mean CV score first).
top_3_models = sorted(results, key=lambda entry: entry['cv_score'], reverse=True)[:3]

# Create the stacking model: the (name, estimator) pairs are the same for both
# task types; only the ensemble class and the final estimator differ.
estimators = [(entry['model'], entry['model_obj']) for entry in top_3_models]
if is_regression:
    stacking_model = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=1.0))
else:
    stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Fit the stacking model
stacking_model.fit(X_train, y_train)

# Evaluate the stacking model on the held-out split, with the same metric and
# x100 scaling that evaluate_model uses for the base models.
stacking_predictions = stacking_model.predict(X_test)
if is_regression:
    stacking_score = r2_score(y_test, stacking_predictions) * 100
else:
    stacking_score = accuracy_score(y_test, stacking_predictions) * 100

print(f"Stacking Model score: {stacking_score:.3f}")

In [None]:
# Make predictions using the stacking model
# `test` here is the wrangled, feature-engineered frame built by the cells above.
# NOTE(review): predict() needs the same feature columns the model was fitted on
# (X_train) — confirm the two frames stay aligned after the column drops.
stacking_predictions_submission = stacking_model.predict(test)


In [None]:
# Report the three models selected for the stack. The value printed is each
# model's held-out 'score' entry, not the 'cv_score' used to rank them.
print("Top 3 Models:")
for entry in top_3_models:
    label, held_out_score = entry['model'], entry['score']
    print(f"{label}: {held_out_score:.3f}")

In [None]:
# Save the predictions to a CSV file for submission.
# The id column was removed during wrangling, so it is re-read from the raw file.
# It goes into its own variable rather than `test`: overwriting `test` would
# replace the processed feature frame with the raw CSV and break any re-run of
# the prediction cell.
submission_ids = pd.read_csv('Data/test.csv', usecols=[x_name])[x_name]

# Fail with a clear message if predictions and ids do not line up row for row.
if len(submission_ids) != len(stacking_predictions_submission):
    raise ValueError(
        f"Expected {len(submission_ids)} predictions (one per test row), "
        f"got {len(stacking_predictions_submission)}"
    )

submission = pd.DataFrame({
    x_name: submission_ids,
    y_name: stacking_predictions_submission
})
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")