# Checking whether an output is either 0 or 1.

In [12]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Data Processing Tools
import numpy as np
import pandas as pd

# Data Display Tools
import matplotlib.pyplot as plt

import feature_generation as feat_gen
import data_processing as dat_proc


In [13]:
train_a = pd.read_parquet('A/train_targets.parquet')
X_train_estimated_a = pd.read_parquet('A/X_train_estimated.parquet')
X_train_observed_a = pd.read_parquet('A/X_train_observed.parquet')
X_test_estimated_a = pd.read_parquet('A/X_test_estimated.parquet')


print(X_test_estimated_a.info())
print(X_test_estimated_a["date_forecast"])




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 47 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   date_calc                       2880 non-null   datetime64[us]
 1   date_forecast                   2880 non-null   datetime64[us]
 2   absolute_humidity_2m:gm3        2880 non-null   float32       
 3   air_density_2m:kgm3             2880 non-null   float32       
 4   ceiling_height_agl:m            2087 non-null   float32       
 5   clear_sky_energy_1h:J           2880 non-null   float32       
 6   clear_sky_rad:W                 2880 non-null   float32       
 7   cloud_base_agl:m                2582 non-null   float32       
 8   dew_or_rime:idx                 2880 non-null   float32       
 9   dew_point_2m:K                  2880 non-null   float32       
 10  diffuse_rad:W                   2880 non-null   float32       
 11  diff

In [14]:
X_train_estimated_a = X_train_estimated_a.drop("date_calc", axis = 1)

X = pd.concat([X_train_observed_a, X_train_estimated_a], axis = 0, ignore_index=True)

# Removing NaN values. If there are missing values treat start and end points as beginning and end of a line.
X = X.interpolate(method='linear')
X = X.bfill()

# Extract necesarry values for feature generation.
timestamps = "date_forecast"
measurements = list(X.columns.values)
measurements.remove(timestamps)

# Probable features that may be used
der_df = feat_gen.difference_df(X, timestamps, measurements)
dder_df = feat_gen.double_derivative_from_df(X, timestamps, measurements)
int_df = feat_gen.daily_accumulated_val_df(X, timestamps, measurements)
dint_df = feat_gen.daily_accumulated_val_squared_df(X, timestamps, measurements)
time_df = feat_gen.time_data_from_df(X, timestamps)


X = pd.concat([X, der_df, dder_df, int_df, dint_df, time_df], axis = "columns")

y = train_a.dropna()

y['pv_measurement'] = y['pv_measurement'].apply(lambda x: 1 if x > 0 else 0)


# Making sure that the two dataframes match in length.
y_BIG, X = dat_proc.data_length_matching(y, X)

# Get our desired output
y = y_BIG["pv_measurement"]
y = y.reset_index(drop = True)

# Removing datetime object column
X = X.reset_index(drop = True)
X = X.drop(timestamps, axis=1)

  dder_df = dder_df.fillna(method="backfill", axis=None)


In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:


# Create a decision tree classifier
clf = DecisionTreeClassifier()

# Define the hyperparameters to search through
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_clf = grid_search.best_estimator_

# Make predictions on the test data using the best model
predictions = best_clf.predict(X_test)

# Evaluate the accuracy of the best model
accuracy = accuracy_score(y_test, predictions)
print(f"Best Model Accuracy: {accuracy}")
print(f"Best Hyperparameters: {grid_search.best_params_}")



In [None]:
X = X_test_estimated_a

# Removing NaN values. If there are missing values treat start and end points as beginning and end of a line.
X = X.interpolate(method='linear')
X = X.bfill()

# Extract necesarry values for feature generation.
timestamps = "date_forecast"
measurements = list(X.columns.values)
measurements.remove("date_forecast")
measurements.remove("date_forecast")

# Probable features that may be used
der_df = feat_gen.difference_df(X, timestamps, measurements)
dder_df = feat_gen.double_derivative_from_df(X, timestamps, measurements)
int_df = feat_gen.daily_accumulated_val_df(X, timestamps, measurements)
dint_df = feat_gen.daily_accumulated_val_squared_df(X, timestamps, measurements)
time_df = feat_gen.time_data_from_df(X, timestamps)


X = pd.concat([X, der_df, dder_df, int_df, dint_df, time_df], axis = "columns")

# Removing datetime object column
X = X.reset_index(drop = True)
X = X.drop("date_calc", axis=1)
X = X.drop("date_forecast", axis=1)

# Data processing pipeline should be completed :/
pred = best_clf.predict(X)



  dder_df = dder_df.fillna(method="backfill", axis=None)


TypeError: datetime64 type does not support cumsum operations