In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Load the CSV at `d1_path` into `df`. NOTE(review): `d1_path` is not defined in
# the visible part of this notebook — it must be set before this cell runs.
df = pd.read_csv(d1_path)

In [11]:
# Display the column labels of the loaded frame.
df.columns

Index(['Timestamp', 'GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS',
       'WSgust', 'WSstdev', 'WD', 'WDstdev', 'BP', 'Cleaning', 'Precipitation',
       'TModA', 'TModB', 'Comments'],
      dtype='object')

### Data Quality Check

#### Check for missing values and incorrect entries (e.g., negative readings) in columns like GHI, DNI, and DHI, and check for outliers in columns like ModA, ModB, WS, and WSgust

In [19]:
# Count the missing entries in every column.
null_values = df.isna().sum()
print('Null values: ', null_values)

# Count rows where at least one irradiance reading (GHI, DNI, DHI) is below zero.
irradiance_below_zero = (df[['GHI', 'DNI', 'DHI']] < 0).any(axis=1)
neg_values = df[irradiance_below_zero]
print('Negative values: ', len(neg_values))

Null values:  Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525600
dtype: int64
Negative values:  278722


In [7]:
def clean_data(data):
    """Return a cleaned copy of a measurement DataFrame.

    The cleaning steps are, in order:

    1. Drop rows with a missing value in GHI, DNI, DHI, ModA or ModB.
    2. Fill missing WSgust values with the mean of the remaining WSgust values.
    3. Keep only rows where GHI, DNI, DHI, ModA, ModB and WS are all >= 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns GHI, DNI, DHI, ModA, ModB,
        WS and WSgust.

    Returns
    -------
    pandas.DataFrame
        A new frame with every original column; ``data`` itself is not
        modified.

    Raises
    ------
    KeyError
        If one of the required columns is absent from ``data``.
    """
    required_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB']

    # Operate on the argument (not a notebook-level global) so the function
    # cleans whatever frame the caller passes in.
    df_cleaned = data.dropna(subset=required_cols)

    # Impute WSgust with `assign` so the result stays a DataFrame; assigning
    # the filled Series to `df_cleaned` directly would discard all other columns.
    df_cleaned = df_cleaned.assign(
        WSgust=df_cleaned['WSgust'].fillna(df_cleaned['WSgust'].mean())
    )

    # Remove rows with negative values where they are not appropriate.
    non_negative_cols = required_cols + ['WS']
    keep_rows = (df_cleaned[non_negative_cols] >= 0).all(axis=1)
    df_cleaned = df_cleaned[keep_rows]

    return df_cleaned

In [8]:
def eda(data):
    """Draw exploratory plots for a measurement DataFrame.

    Four figures are shown, in order:

    1. Time series of GHI, DNI, DHI and Tamb against Timestamp.
    2. Correlation heatmap of GHI, DNI, DHI, TModA, TModB, WS, WSgust, WD.
    3. Scatter of RH against Tamb and against GHI.
    4. Histograms of GHI, DNI, DHI, WS and Tamb.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns Timestamp, GHI, DNI, DHI, Tamb, RH,
        TModA, TModB, WS, WSgust and WD.

    Returns
    -------
    None
        The function only renders figures. ``data`` is not modified: the
        Timestamp column is converted to datetime in a local Series.
    """
    # Parse timestamps locally so plotting has no side effect on the caller's
    # frame (and so the function uses its argument, not a notebook global).
    timestamps = pd.to_datetime(data['Timestamp'])

    # Plot time series for GHI, DNI, DHI, and Tamb
    plt.figure(figsize=(12, 6))
    for column in ['GHI', 'DNI', 'DHI', 'Tamb']:
        plt.plot(timestamps, data[column], label=column)
    plt.legend()
    plt.title('Time Series of GHI, DNI, DHI, and Tamb')
    plt.show()

    # Correlation heatmap
    correlation = data[['GHI', 'DNI', 'DHI', 'TModA', 'TModB', 'WS', 'WSgust', 'WD']].corr()
    sns.heatmap(correlation, annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()

    # Scatter plot of RH vs Temperature and GHI
    plt.figure(figsize=(12, 6))
    plt.scatter(data['RH'], data['Tamb'], alpha=0.6, label='Temperature vs RH')
    plt.scatter(data['RH'], data['GHI'], alpha=0.6, label='GHI vs RH')
    plt.legend()
    plt.title('Impact of Relative Humidity on Temperature and Solar Radiation')
    plt.show()

    # Histograms of key variables
    data[['GHI', 'DNI', 'DHI', 'WS', 'Tamb']].hist(bins=30, figsize=(12, 8))
    plt.suptitle('Histograms of GHI, DNI, DHI, WS, and Tamb')
    plt.show()