In [None]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline     
sns.set(color_codes=True)

# Read the CSV into a DataFrame, then display the dtype pandas inferred for each column.
benin_df = pd.read_csv(filepath_or_buffer="../data/benin-malanville.csv")
benin_df.dtypes

1. **Summary statistics**

In [None]:
# Descriptive statistics (count, mean, std, min, median, max), one row per column.
benin_df.describe(percentiles=[0.5]).transpose()

2. **Data Quality Check**

In [None]:
# Number of missing values in each column.
benin_df.isna().sum()

The output above shows that 'Comments' is the only column with missing values.
Its number of missing values *equals* the number of rows in the dataset, which means
the 'Comments' column holds no data for any record. Therefore, the 'Comments' column must be _dropped_.

In [None]:
# Drop 'Comments' column.
# how="all" removes only columns in which every value is missing, so the cell
# no longer depends on a hard-coded non-null threshold tied to the row count.
benin_df = benin_df.dropna(axis=1, how="all")
# Non-null count per remaining column, to confirm the drop.
benin_df.count()

In [None]:
# Work on a copy without the 'Timestamp' column for the per-column checks below.
df_excluded = benin_df.drop(labels=['Timestamp'], axis='columns')
# Count the IQR-based outliers of a single column
def count_outliers(column):
    """Return how many values of ``column`` lie outside the 1.5 * IQR fences.

    Parameters
    ----------
    column : pandas.Series
        Numeric column to inspect.

    Returns
    -------
    int
        Number of values below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.
        Missing values compare as False on both sides and are not counted.
    """
    first_quartile, third_quartile = column.quantile(0.25), column.quantile(0.75)
    spread = third_quartile - first_quartile
    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread
    # One boolean per value; summing the mask counts the True entries.
    is_outlier = (column < fence_low) | (column > fence_high)
    return int(is_outlier.sum())

# Run the outlier counter on each column; the result is one count per column.
outlier_counts = df_excluded.apply(count_outliers)

print("Outlier counts for each column:", outlier_counts, sep="\n")

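The cell above uses the interquartile range (IQR) rule to detect outliers: a value is counted as an outlier when it lies more than 1.5 × IQR below the first quartile or above the third quartile. The output gives the number of outliers in each column.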

In [None]:
# Function to count the number of negative values for a given column
def count_negatives(column):
    """Return the number of strictly negative values in ``column``.

    Parameters
    ----------
    column : pandas.Series or iterable of numbers
        Values to inspect. Non-Series iterables are wrapped in a Series first.

    Returns
    -------
    int
        How many entries are less than zero. Missing values are not counted.
    """
    # Vectorised comparison instead of a Python-level loop over every element.
    values = pd.Series(column)
    return int((values < 0).sum())
    
# Count negative values for every column
negative_counts = df_excluded.apply(count_negatives)
print("Negative value counts for each column:")
print(negative_counts)

The above output shows that almost half of the dataset contains negative values for the irradiance measurements, which is erroneous. Therefore, those rows containing negative values must be removed before doing further analysis.

In [None]:
# Remove every row whose GHI reading is negative in a single vectorised step.
# The mask is written as "not (GHI < 0)" so rows with a missing GHI are kept;
# only the negative rows are dropped.
# .copy() gives an independent frame, as DataFrame.drop() would.
corrected_df = benin_df.loc[~(benin_df["GHI"] < 0)].copy()

# Sanity check: this should print 0.
print(count_negatives(corrected_df['GHI']))

3. **Time Series Analysis**

_GHI across Timestamp_

In [None]:
# NOTE(review): everything between the triple quotes below is one string literal,
# so `plot_df` is never defined or called and no figure is drawn; the cell's
# value is just that string.
# Before re-enabling: the `df` parameter is not used inside `plot_df`, and the
# title passed in the call spells "period" as "perion".
'''def plot_df(df, x, y, title="", xlabel='Timestamp', ylabel='GHI', dpi=100):
    plt.figure(figsize=(15,4), dpi=dpi)
    plt.plot(x, y, color='tab:red')
    plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()
    
plot_df(benin_df, x=benin_df['Timestamp'], y=benin_df['GHI'], title='Global horizontal irradiance across a perion of time')'''