## Importing the Dependencies

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import os
# List the entries of the local "data" directory (shown as the cell output).
os.listdir("data")


## Load Data

In [None]:
import pandas as pd

# Location of the input CSV, relative to the working directory.
raw_csv_path = 'data/benin-malanville.csv'

# Load it into a DataFrame, parsing the 'Timestamp' column as datetimes.
df = pd.read_csv(raw_csv_path, parse_dates=['Timestamp'])

# Preview the first rows.
df.head()

## Summary Statistics & Missing-Value Report

In [None]:
import pandas as pd

# Numeric summary.
# Printed explicitly: a bare expression that is not the last statement of a
# cell is evaluated but never displayed, so the statistics were being dropped.
print(df.describe())

# Missing values: NaN count per column, showing only columns that have any.
missing = df.isna().sum()
print(missing[missing > 0])

# Columns with >5% nulls (NaN count divided by the number of rows).
cols_over_5pct = missing[missing/len(df) > 0.05].index.tolist()
print(cols_over_5pct)


## Outlier Detection & Basic Cleaning

In [None]:
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Columns screened for extreme readings and later imputed.
cols_to_check = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Absolute Z-score of every screened value; NaNs are omitted from the
# mean/std computation (nan_policy='omit').
z_scores = np.abs(stats.zscore(df[cols_to_check], nan_policy='omit'))

# A row counts as an outlier when any screened column has |Z| > 3.
outlier_rows = (z_scores > 3).any(axis=1)
df['Outlier'] = outlier_rows

# Replace missing values in the screened columns with each column's median.
column_medians = df[cols_to_check].median()
df[cols_to_check] = df[cols_to_check].fillna(column_medians)

NameError: name 'df' is not defined

## Export Cleaned Data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Write the cleaned frame to its own file. Saving to
# 'data/benin-malanville.csv' would overwrite the raw input that this notebook
# reads when loading the data, so a re-run would start from already-modified
# data (including the added 'Outlier' column).
df.to_csv('data/benin_clean.csv', index=False)


## Time Series Analysis

In [None]:
import matplotlib.pyplot as plt

# Overlay the GHI, DNI and DHI series against the timestamp on one figure.
plt.figure(figsize=(12, 6))
for component in ('GHI', 'DNI', 'DHI'):
    plt.plot(df['Timestamp'], df[component], label=component)

plt.title('Solar Irradiance Over Time')
plt.xlabel('Timestamp')
plt.ylabel('Solar Irradiance')
plt.legend()
plt.show()



## Cleaning Impact

In [None]:
# Mean ModA / ModB for each value of the 'Outlier' flag, drawn as a bar chart.
module_means = df.groupby('Outlier')[['ModA', 'ModB']].mean()
module_means.plot(kind='bar')
plt.title('Average ModA & ModB Pre/Post-Cleaning')
plt.show()


## Correlation & Relationship Analysis

In [4]:
import seaborn as sns

# Each plot gets its own figure and an explicit plt.show(). seaborn draws on
# the current Axes by default, so without this the two scatter plots would be
# painted on top of the heatmap.

# Heatmap of pairwise correlations between irradiance and module readings
plt.figure(figsize=(8, 6))
sns.heatmap(df[['GHI','DNI','DHI','ModA','ModB']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Scatter plots
# NOTE(review): these plot every row of df; if rendering is slow on the full
# dataset, consider plotting a sample instead — confirm with the data size.
plt.figure()
sns.scatterplot(data=df, x='WS', y='GHI')
plt.title('GHI vs WS')
plt.show()

plt.figure()
sns.scatterplot(data=df, x='RH', y='Tamb')
plt.title('Tamb vs RH')
plt.show()


KeyboardInterrupt: 

## Wind & Distribution Analysis

In [None]:
# Histogram of the GHI column using 50 bins.
df['GHI'].hist(bins=50)
plt.show()

# Wind rose of WS by WD. Needs the third-party windrose package:
# pip install windrose
from windrose import WindroseAxes

wind_direction = df['WD']
wind_speed = df['WS']

ax = WindroseAxes.from_ax()
ax.bar(wind_direction, wind_speed, normed=True, opening=0.8, edgecolor='white')
ax.set_legend()
plt.show()


## Temperature & Bubble Chart

In [None]:
# Bubble chart: GHI against Tamb, with marker area set to twice the RH value.
bubble_sizes = df['RH'] * 2
plt.scatter(df['Tamb'], df['GHI'], s=bubble_sizes, alpha=0.5)
plt.title('GHI vs Tamb (bubble=RH)')
plt.xlabel('Tamb')
plt.ylabel('GHI')
plt.show()
