## 📓 `notebooks/sierra_leone.ipynb`
# Sierra Leone EDA - Interim Submission
This notebook performs initial data profiling for Sierra Leone's solar dataset and outlines the plan for full EDA.

In [None]:
! pip install pandas numpy seaborn matplotlib scipy

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [None]:
# Load data directly from the shared Google Drive file.
# If dataset is not yet downloaded, mention in report how it will be added
# Share link: https://drive.google.com/file/d/1EAZatSm02HRvxpVbhx7ih_pv1hyOizzb/view?usp=sharing
# The file ID must not contain whitespace: a stray space ends up inside the
# URL and makes the HTTP request fail.
file_id = "1EAZatSm02HRvxpVbhx7ih_pv1hyOizzb"
file_url = f"https://drive.google.com/uc?id={file_id}"

df = pd.read_csv(file_url)

# Preview the first rows; a bare last expression renders as a rich table,
# so a separate print() of the same rows is not needed.
df.head()

In [None]:
# Quick profiling of the raw frame:
#   1. info()     -> printed summary of columns, dtypes and non-null counts
#   2. describe() -> summary statistics, shown as this cell's output
df.info()
df.describe()

In [None]:
# Number of missing entries in each column of the raw frame.
df.isnull().sum()

In [None]:
# Outlier detection & cleaning
# Use Z-scores to detect unusual values: a row is kept only when every
# monitored column lies within 3 standard deviations of that column's mean.
z = np.abs(
    stats.zscore(
        df[['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']].to_numpy(dtype=float),
        # Ignore NaNs when computing mean/std; with the default policy a single
        # NaN turns the whole column's scores into NaN and every row is dropped.
        nan_policy='omit',
    )
)

# A NaN score means "no value to judge", not "outlier": keep those rows so the
# missing-value handling later in the notebook can deal with them.
within_limits = (z < 3) | np.isnan(z)

# .copy() gives df_clean its own data, so later column assignments do not
# raise SettingWithCopyWarning or depend on the raw frame.
df_clean = df[within_limits.all(axis=1)].copy()

In [None]:
# Handling missing values
# Fill gaps in numeric columns with that column's median. The result is
# reassigned instead of using inplace=True, which avoids mutating a frame that
# may be a slice of `df` (SettingWithCopyWarning) and keeps the cell safe to re-run.
df_clean = df_clean.fillna(df_clean.median(numeric_only=True))

In [None]:
# List the column labels of the raw frame for reference.
column_labels = df.columns
print(column_labels)


In [None]:
# Parse the Timestamp column to datetimes; values that cannot be parsed become
# NaT (errors='coerce'). Using .assign builds a new frame rather than writing
# into one that may be a slice of `df`, which avoids SettingWithCopyWarning.
df_clean = df_clean.assign(
    Timestamp=pd.to_datetime(df_clean['Timestamp'], errors='coerce')
)


In [None]:
# Order rows by Timestamp, then discard any row that lacks a timestamp or one
# of the three irradiance readings.
required_columns = ['Timestamp', 'GHI', 'DNI', 'DHI']
df_clean = df_clean.sort_values(by='Timestamp').dropna(subset=required_columns)

In [None]:
# Time-series analysis
# Plot GHI, DNI, DHI, and Tamb vs Timestamp.
# Irradiance goes on the left axis; Tamb is in different units, so it is drawn
# on a secondary (right) axis that shares the same time axis.
fig, ax_irradiance = plt.subplots(figsize=(12, 6))
ax_irradiance.plot(df_clean['Timestamp'], df_clean['GHI'], label='GHI')
ax_irradiance.plot(df_clean['Timestamp'], df_clean['DNI'], label='DNI', alpha=0.7)
ax_irradiance.plot(df_clean['Timestamp'], df_clean['DHI'], label='DHI', alpha=0.7)
ax_irradiance.set_title('Sierra Leone Solar Radiation and Temperature Over Time')
ax_irradiance.set_xlabel('Timestamp')
ax_irradiance.set_ylabel('Irradiance (W/m²)')

ax_temperature = ax_irradiance.twinx()
ax_temperature.plot(
    df_clean['Timestamp'], df_clean['Tamb'],
    label='Tamb', color='tab:red', alpha=0.5,
)
ax_temperature.set_ylabel('Tamb (°C)')

# Merge the entries of both axes into a single legend.
irr_handles, irr_labels = ax_irradiance.get_legend_handles_labels()
temp_handles, temp_labels = ax_temperature.get_legend_handles_labels()
ax_irradiance.legend(irr_handles + temp_handles, irr_labels + temp_labels)
plt.show()

In [None]:
# Cleaning impact: mean ModA / ModB reading for each value of the Cleaning
# column, shown as grouped bars.
mean_by_cleaning = df_clean.groupby('Cleaning')[['ModA', 'ModB']].mean()
bar_ax = mean_by_cleaning.plot.bar()
bar_ax.set_title('Effect of Cleaning on Sensor Readings')
plt.show()

In [None]:
# Correlation & relationship analysis: pairwise correlation between the
# irradiance columns and the TModA / TModB columns, drawn as an annotated heatmap.
heatmap_columns = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
corr = df_clean[heatmap_columns].corr()
sns.heatmap(corr, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Scatter plots
# Each relationship gets its own panel: drawing both on the same axes would
# overlay unrelated x/y quantities and make the figure unreadable.
fig, (ax_humidity, ax_wind) = plt.subplots(1, 2, figsize=(12, 5))

sns.scatterplot(x='RH', y='Tamb', data=df_clean, ax=ax_humidity)
ax_humidity.set_title('Tamb vs RH')

sns.scatterplot(x='WS', y='GHI', data=df_clean, ax=ax_wind)
ax_wind.set_title('GHI vs WS')

fig.tight_layout()
plt.show()


In [None]:
# Wind & distribution analysis
# GHI and WS are on different scales, so each histogram gets its own panel
# instead of being overlaid on a single shared axis.
fig, (ax_ghi, ax_ws) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(df_clean['GHI'], kde=True, ax=ax_ghi)
ax_ghi.set_title('Distribution of GHI')

sns.histplot(df_clean['WS'], kde=True, ax=ax_ws)
ax_ws.set_title('Distribution of WS')

fig.tight_layout()
plt.show()

In [None]:
# Temperature & humidity analysis: GHI against RH, with marker colour
# encoding Tamb.
sns.scatterplot(
    data=df_clean,
    x='RH',
    y='GHI',
    hue='Tamb',
)

In [None]:
# Bubble chart: GHI against Tamb, with marker area scaled by RH (RH * 2).
plt.scatter(df_clean['Tamb'], df_clean['GHI'], s=df_clean['RH']*2, alpha=0.5)
# Axis labels use proper Unicode unit symbols (the previous strings were
# mis-encoded and rendered as garbage characters).
plt.xlabel('Temperature (°C)')
plt.ylabel('GHI (W/m²)')
plt.title('GHI vs Temperature (Bubble size = RH)')
plt.show()


In [None]:
# Persist the cleaned frame. The path is defined once so the confirmation
# message always names the file that was actually written.
output_path = Path("../data/sierraleone_clean.csv")
# Create the target directory if it is missing; to_csv does not create it.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(output_path, index=False)
print(f"✅ Cleaned file saved to {output_path}")

### Summary of Findings (Sierra Leone)
- The Sierra Leone dataset revealed clear daily irradiance cycles, though with higher variability due to cloud cover and coastal weather.  
- Outliers were mainly concentrated in WSgust and RH values.  
- After cleaning, data completeness and reliability improved notably.  
- Cleaning operations also enhanced the match between ModA and ModB sensor outputs.  
- Correlation analysis shows a consistent relationship between GHI, DNI, and DHI.  
- The data suggests relative humidity impacts both solar irradiance and temperature, which aligns with expected regional patterns.
