In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load the raw data set from a CSV file; the path is relative to the
# notebook's working directory.
data = pd.read_csv("BC-Data-Set.csv")

In [2]:
# Parse the 'date' strings into timestamps, discard every row that has a
# missing value in any column, and make the timestamp the row index.
data = (
    data
    .assign(date=pd.to_datetime(data['date']))
    .dropna()
    .set_index('date')
)

In [6]:
# Fill the missing hours of days that are almost complete.
#
# A day qualifies when it has between 18 and 23 rows (i.e. at most 6 missing
# hours, assuming one row per hour). Each missing hour is filled with:
#   * the average of the previous and the next hour, when both are present;
#   * otherwise the average of the previous hour and the global column means;
#   * otherwise nothing - the hour is left missing and "not filled" is printed.
# Filled rows are appended to the end of `data`, so the frame is no longer in
# chronological order after this cell.

ONE_HOUR = pd.Timedelta(hours=1)

# Column means are computed once, before any row is added, so that values
# filled earlier in the loop do not drift into the fallback used later.
mean_values = data.mean()

# Group by the plain date array (not a one-element list) so that group keys
# stay scalar dates rather than 1-tuples in newer pandas versions.
test = data.groupby(data.index.date)

# Materialise the groups first: `data` is enlarged inside the loop and should
# not be modified while the groupby is still being iterated.
for group_name, group_data in list(test):
    if 17 < len(group_data) < 24:
        existing_hours = group_data.index.hour.unique()
        # Sorted so that consecutive gaps are always filled front to back;
        # a freshly filled hour then counts as the "previous" hour of the next.
        missing_hours = sorted(set(range(24)) - set(existing_hours))
        day_start = pd.to_datetime(group_data.index.date[0])
        for missing_hour in missing_hours:
            datetime_obj = day_start + pd.Timedelta(hours=missing_hour)
            previous_hour = datetime_obj - ONE_HOUR
            next_hour = datetime_obj + ONE_HOUR

            if next_hour in data.index and previous_hour in data.index:
                data.loc[datetime_obj] = (data.loc[previous_hour] + data.loc[next_hour]) / 2
            elif previous_hour in data.index:
                data.loc[datetime_obj] = (mean_values + data.loc[previous_hour]) / 2
            else:
                print("not filled")
                    

not filled
not filled
not filled
not filled
not filled
not filled


  for group_name, group_data in test:


In [7]:
# Count the number of rows (hourly records) present for each calendar date
hours_per_day = data.groupby(data.index.date).size()

print(hours_per_day)

# Dates that have fewer than 24 records
incomplete_days = hours_per_day[hours_per_day < 24].index

print(incomplete_days)

# Calendar date of every row of `data`, as a DatetimeIndex aligned with the rows
date_index = pd.DatetimeIndex(data.index.date)

# Compare datetime64 against datetime64 instead of against an object-dtype
# index of datetime.date values.
is_incomplete_row = date_index.isin(pd.DatetimeIndex(incomplete_days))

# Rows written with data.loc[new_timestamp] = ... are appended at the end of
# the frame, so sort both subsets to keep them in chronological order.
data_complete_days = data[~is_incomplete_row].sort_index()

data_incomplete_days = data[is_incomplete_row].sort_index()

data_complete_days.to_csv("semi_filled")


2019-01-01    24
2019-01-02    24
2019-01-03    24
2019-01-04    24
2019-01-08    24
              ..
2019-12-02    24
2019-12-03    24
2019-12-04    13
2019-12-09    24
2019-12-10    24
Length: 191, dtype: int64
Index([2019-01-23, 2019-01-24, 2019-02-02, 2019-02-08, 2019-02-10, 2019-02-25,
       2019-02-26, 2019-02-27, 2019-02-28, 2019-03-10, 2019-04-03, 2019-05-01,
       2019-07-01, 2019-07-02, 2019-07-03, 2019-07-10, 2019-07-23, 2019-07-27,
       2019-08-02, 2019-08-04, 2019-08-10, 2019-08-22, 2019-09-04, 2019-10-15,
       2019-10-16, 2019-12-04],
      dtype='object')


In [None]:
# Put the rows in index order and display the frame.
# NOTE(review): rows added with data.loc[new_label] = ... land at the end of
# the frame, so this sort is presumably what restores chronological order.
data=data.sort_index()
data

In [None]:
# Write the current state of `data` (index included) to a CSV file in the
# working directory.
data.to_csv("filled_mean.csv")

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of each column
data.describe()

In [None]:
# Pairwise correlation matrix of all columns; the heat map further down reuses it
correlations = data.corr()

# Column of the matrix holding every variable's correlation with 'BC'
bc_correlations = correlations.loc[:, 'BC']
print(bc_correlations)

In [None]:
# Scatter plots of BC against every other column.
# The predictors are selected by name rather than by position, so the plot
# does not depend on 'BC' being the first column of the frame.
predictor_columns = list(data.columns.drop('BC'))
sns.pairplot(data, x_vars=predictor_columns, y_vars=['BC'])

# Heat map of the full correlation matrix (`correlations` comes from the
# correlation cell above)
plt.figure(figsize=(10, 7))
sns.heatmap(correlations, annot=True, cmap="coolwarm")
plt.show()

In [None]:
# Plot the time series data before outlier handling
data.plot(subplots=True, figsize=(12, 16))
plt.show()

threshold = 5  # threshold kept deliberately high so that only extreme values are replaced

# Snapshot of the numeric columns: z-scores and replacement means are both
# derived from these unmodified values.
numeric_data = data.select_dtypes(include=[np.number])

# |z| per cell, as a plain array whose columns follow numeric_data.columns
z_scores = np.abs(np.asarray(stats.zscore(numeric_data)))
outlier_cells = z_scores > threshold

# Rows containing at least one outlier (kept for inspection)
outliers = outlier_cells.any(axis=1)

# Replace only the offending cells with their column mean. Overwriting the
# whole row would also discard the valid readings of the other columns.
for position, column in enumerate(numeric_data.columns):
    data.loc[outlier_cells[:, position], column] = numeric_data[column].mean()

# Plot again to show the effect of the replacement
data.plot(subplots=True, figsize=(12, 16))
plt.show()

In [None]:
# Summary statistics of each column, recomputed on the current state of `data`
data.describe()

In [None]:
# Daily averages
daily_data = data.resample('D').mean()

# Weekly averages
weekly_data = data.resample('W').mean()

# Visualize the daily and weekly trends using line charts (one panel per column)
daily_data.plot(subplots=True, figsize=(12, 16), title='Daily Averages')
weekly_data.plot(subplots=True, figsize=(12, 16), title='Weekly Averages')

# Visualize the daily and weekly trends using box plots.
# Each box plot gets its own figure and axes: without an explicit `ax`,
# DataFrame.boxplot draws on the current axes, which here would be the last
# panel of the weekly line chart, and the two box plots would overlay each other.
for plot_title, averaged in (('Daily Averages', daily_data),
                             ('Weekly Averages', weekly_data)):
    _, box_ax = plt.subplots(figsize=(12, 7))
    averaged.boxplot(ax=box_ax)
    box_ax.set_title(plot_title)

plt.show()

In [None]:
def inspect_dataframe(df, columns, figsize=(17, 17)):
    """Plot each requested column of ``df`` in its own vertically stacked panel.

    Parameters
    ----------
    df : DataFrame or other object indexable by column name
        Source of the series to plot; ``df[col]`` is passed to ``Axes.plot``.
    columns : iterable
        Column names to plot, one panel per name, top to bottom.
    figsize : tuple, optional
        Figure size in inches; defaults to the original fixed ``(17, 17)``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when
        ``columns`` is empty.
    """
    columns = list(columns)
    if not columns:
        # A grid with zero rows cannot be created; nothing to plot.
        return

    # squeeze=False always returns a 2-D array of axes, so a single column
    # works the same way as several (otherwise a bare Axes is returned and
    # indexing it fails).
    _, axs = plt.subplots(len(columns), 1, sharex=True, figsize=figsize, squeeze=False)
    for ax, col in zip(axs[:, 0], columns):
        ax.plot(df[col])
        ax.set_title(col)
    plt.show()
# Plot every column of `data`, one panel per column
inspect_dataframe(data, data.columns)

In [None]:
# Number of trailing rows held out for testing (24 * 25 = 600 rows).
# NOTE(review): this equals 25 days only if the hourly series has no gaps - confirm.
test_size = 24*25

# Chronological split: everything before the last `test_size` rows is training data
X_train_raw = data.iloc[:-test_size]
X_test_raw = data.iloc[-test_size:]
print(X_train_raw.shape, X_test_raw.shape)

# Min-max normalisation of every column, using statistics of the training
# rows only so that no information from the test rows leaks into the scaling.
X_min = X_train_raw.min()
X_max = X_train_raw.max()

# A column that is constant over the training rows has a zero range; divide by
# 1 instead so that it maps to 0 rather than NaN/inf.
X_range = (X_max - X_min).replace(0, 1)

X_train_raw = (X_train_raw - X_min) / X_range
X_test_raw = (X_test_raw - X_min) / X_range

# The plotted series is the normalised 'BC' column, so label it as such
plt.figure(figsize=(20,5))
plt.plot(X_train_raw.BC, label='Train (BC)')
plt.plot(X_test_raw.BC, label='Test (BC)')
plt.title('Train-Test Split')
plt.legend()
plt.show()

In [None]:
# Wider view of the held-out rows only. The series is the normalised 'BC'
# column, so the legend and title name it accordingly.
plt.figure(figsize=(30,5))
plt.plot(X_test_raw.BC, label='Test (BC)')
plt.title('Test Split')
plt.legend()
plt.show()