In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [None]:
# Read the raw per-country CSV exports, one DataFrame per country.
# The file names below are placeholders: point them at the actual files.
raw_csv_paths = {
    'benin': '../data/solar_data_benin.csv',
    'sierra_leone': '../data/solar_data_sierra_leone.csv',
    'togo': '../data/solar_data_togo.csv',
}

df_benin = pd.read_csv(raw_csv_paths['benin'])
df_sierra_leone = pd.read_csv(raw_csv_paths['sierra_leone'])
df_togo = pd.read_csv(raw_csv_paths['togo'])


In [None]:
# Sanity-check the load: print the first 5 rows of the Benin, Sierra Leone
# and Togo frames, in that order.
for loaded_frame in (df_benin, df_sierra_leone, df_togo):
    print(loaded_frame.head())


In [None]:
# Report the number of missing values per column for every country.
# Headings after the first begin with a newline so the reports are separated
# by a blank line.
missing_value_reports = (
    ("Benin Dataset Missing Values:", df_benin),
    ("\nSierra Leone Dataset Missing Values:", df_sierra_leone),
    ("\nTogo Dataset Missing Values:", df_togo),
)

for heading, frame in missing_value_reports:
    print(heading)
    print(frame.isnull().sum())


In [None]:
# Print describe() (summary statistics) for every country's frame.
# Headings after the first begin with a newline so the tables are separated
# by a blank line.
summary_reports = (
    ("Benin Dataset Summary Statistics:", df_benin),
    ("\nSierra Leone Dataset Summary Statistics:", df_sierra_leone),
    ("\nTogo Dataset Summary Statistics:", df_togo),
)

for heading, frame in summary_reports:
    print(heading)
    print(frame.describe())


In [None]:
# Plot GHI, DNI, and DHI over time for Benin.
#
# 'Timestamp' is loaded without date parsing, so it is presumably still plain
# strings at this point. Matplotlib treats string x-values as categories (one
# tick per distinct value), which is very slow on a long series and gives an
# unreadable axis. Convert to real datetimes for plotting only; the frame
# itself is left unchanged.
benin_time = pd.to_datetime(df_benin['Timestamp'])

fig, ax = plt.subplots(figsize=(10, 6))
for irradiance_column in ('GHI', 'DNI', 'DHI'):
    ax.plot(benin_time, df_benin[irradiance_column], label=irradiance_column)

ax.set_xlabel('Time')
ax.set_ylabel('Solar Irradiance (W/m²)')
ax.set_title('Solar Radiation over Time (Benin)')
ax.legend()
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()


In [None]:
import seaborn as sns

# Pairwise correlation between the irradiance components, ambient
# temperature (Tamb) and wind speed (WS) for Benin, shown as a heatmap.
benin_corr_columns = ['GHI', 'DNI', 'DHI', 'Tamb', 'WS']
corr = df_benin.loc[:, benin_corr_columns].corr()

benin_heatmap_ax = sns.heatmap(data=corr, annot=True, cmap='coolwarm')
benin_heatmap_ax.set_title('Correlation Matrix (Benin)')
plt.show()


In [None]:
# Fill missing values in the numeric columns with that column's median.
#
# numeric_only=True is required: the frame also carries non-numeric columns
# (e.g. 'Timestamp', loaded without date parsing), and since pandas 2.0
# DataFrame.median() raises TypeError on such columns instead of silently
# skipping them. Non-numeric columns are left untouched by the fill.
benin_medians = df_benin.median(numeric_only=True)

# Rebind instead of inplace=True so re-running the cell is a plain assignment.
df_benin = df_benin.fillna(benin_medians)

# Verify that missing values are filled
print(df_benin.isnull().sum())


In [None]:
# Remove rows where GHI is less than 0 (invalid data). Rows whose GHI is
# missing also fail the `>= 0` comparison and are dropped.
#
# .copy() makes the result an independent frame. Without it, adding a column
# to this filtered subset later (the 'Country' tag) triggers pandas'
# SettingWithCopyWarning.
valid_ghi_mask = df_benin['GHI'] >= 0
df_benin = df_benin[valid_ghi_mask].copy()

# Verify the changes
print(df_benin.head())


In [None]:
# Persist the cleaned Benin frame as CSV; the row index is not written.
cleaned_benin_path = '../data/cleaned_solar_data_benin.csv'
df_benin.to_csv(cleaned_benin_path, index=False)


In [None]:
# Per-column missing-value counts for the Sierra Leone and Togo datasets.
# The Togo heading starts with a newline to leave a blank line between reports.
remaining_missing_reports = (
    ("Sierra Leone Dataset Missing Values:", df_sierra_leone),
    ("\nTogo Dataset Missing Values:", df_togo),
)

for heading, frame in remaining_missing_reports:
    print(heading)
    print(frame.isnull().sum())


In [None]:
# describe() output for the Sierra Leone and Togo datasets.
# The Togo heading starts with a newline to leave a blank line between tables.
remaining_summary_reports = (
    ("Sierra Leone Dataset Summary Statistics:", df_sierra_leone),
    ("\nTogo Dataset Summary Statistics:", df_togo),
)

for heading, frame in remaining_summary_reports:
    print(heading)
    print(frame.describe())


In [None]:
# Plot GHI, DNI, and DHI over time for Sierra Leone and Togo, one figure per
# country.
#
# 'Timestamp' is loaded without date parsing, so it is presumably still plain
# strings. Matplotlib treats string x-values as categories (one tick per
# distinct value), which is very slow on a long series and gives an unreadable
# axis. Convert to datetimes for plotting only; the frames are not modified.
for country_name, country_frame in (
    ('Sierra Leone', df_sierra_leone),
    ('Togo', df_togo),
):
    time_axis = pd.to_datetime(country_frame['Timestamp'])

    fig, ax = plt.subplots(figsize=(10, 6))
    for irradiance_column in ('GHI', 'DNI', 'DHI'):
        ax.plot(time_axis, country_frame[irradiance_column], label=irradiance_column)

    ax.set_xlabel('Time')
    ax.set_ylabel('Solar Irradiance (W/m²)')
    ax.set_title(f'Solar Radiation over Time ({country_name})')
    ax.legend()
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()


In [None]:
import seaborn as sns

# Variables compared in both heatmaps: irradiance components, ambient
# temperature (Tamb) and wind speed (WS).
corr_columns = ['GHI', 'DNI', 'DHI', 'Tamb', 'WS']

# Sierra Leone
corr_sierra_leone = df_sierra_leone.loc[:, corr_columns].corr()
sierra_leone_heatmap_ax = sns.heatmap(data=corr_sierra_leone, annot=True, cmap='coolwarm')
sierra_leone_heatmap_ax.set_title('Correlation Matrix (Sierra Leone)')
plt.show()

# Togo
corr_togo = df_togo.loc[:, corr_columns].corr()
togo_heatmap_ax = sns.heatmap(data=corr_togo, annot=True, cmap='coolwarm')
togo_heatmap_ax.set_title('Correlation Matrix (Togo)')
plt.show()


In [None]:
# Fill missing values in the numeric columns of the Sierra Leone and Togo
# frames with each column's median.
#
# numeric_only=True is required: the frames also carry non-numeric columns
# (e.g. 'Timestamp', loaded without date parsing), and since pandas 2.0
# DataFrame.median() raises TypeError on such columns instead of silently
# skipping them. Non-numeric columns are left untouched by the fill.
# Rebinding (instead of inplace=True) keeps the cell a plain assignment.
df_sierra_leone = df_sierra_leone.fillna(df_sierra_leone.median(numeric_only=True))
df_togo = df_togo.fillna(df_togo.median(numeric_only=True))

# Verify that missing values are filled
print("Sierra Leone Dataset Missing Values After Filling:")
print(df_sierra_leone.isnull().sum())

print("\nTogo Dataset Missing Values After Filling:")
print(df_togo.isnull().sum())


In [None]:
# Remove rows where GHI is less than 0 in Sierra Leone and Togo. Rows whose
# GHI is missing also fail the `>= 0` comparison and are dropped.
#
# .copy() makes each result an independent frame. Without it, adding a column
# to these filtered subsets later (the 'Country' tag) triggers pandas'
# SettingWithCopyWarning.
df_sierra_leone = df_sierra_leone[df_sierra_leone['GHI'] >= 0].copy()
df_togo = df_togo[df_togo['GHI'] >= 0].copy()

# Verify the changes
print("Sierra Leone Dataset After Removing Invalid Data:")
print(df_sierra_leone.head())

print("\nTogo Dataset After Removing Invalid Data:")
print(df_togo.head())


In [None]:
# Persist the cleaned Sierra Leone and Togo frames as CSV (row index omitted).
cleaned_sierra_leone_path = '../data/cleaned_solar_data_sierra_leone.csv'
cleaned_togo_path = '../data/cleaned_solar_data_togo.csv'

df_sierra_leone.to_csv(cleaned_sierra_leone_path, index=False)
df_togo.to_csv(cleaned_togo_path, index=False)


In [None]:
# Tag every row with its country, then stack the three frames into one.
#
# .assign() returns a new frame instead of writing a column into the existing
# object. The per-country frames were produced by boolean filtering on GHI,
# and in-place column assignment on such a subset raises pandas'
# SettingWithCopyWarning.
df_benin = df_benin.assign(Country='Benin')
df_sierra_leone = df_sierra_leone.assign(Country='Sierra Leone')
df_togo = df_togo.assign(Country='Togo')

# ignore_index=True renumbers the rows 0..n-1 so the merged frame has a
# unique index rather than three overlapping per-country indexes.
df_all = pd.concat([df_benin, df_sierra_leone, df_togo], ignore_index=True)

# Display the first few rows of the combined dataset
print(df_all.head())


In [None]:
# Quick overview of the merged frame: leading rows, overall size, and how
# many rows each country contributed.
merged_preview = df_all.head()
print(merged_preview)

merged_row_count = len(df_all)
print("\nTotal rows in the merged dataset:", merged_row_count)

rows_by_country = df_all['Country'].value_counts()
print("\nNumber of rows per country:")
print(rows_by_country)


In [None]:
# Mean GHI per country, printed and then drawn as a bar chart.
grouped_by_country = df_all.groupby('Country')
mean_ghi = grouped_by_country['GHI'].mean()

print("Average GHI by Country:")
print(mean_ghi)

mean_ghi.plot.bar(
    title='Average GHI by Country',
    ylabel='GHI (W/m²)',
    xlabel='Country',
    color=['blue', 'orange', 'green'],
)
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns

# Spread of GHI readings within each country, shown as one box per country.
ghi_box_ax = sns.boxplot(data=df_all, x='Country', y='GHI')
ghi_box_ax.set_title('Distribution of GHI by Country')
ghi_box_ax.set_ylabel('GHI (W/m²)')
ghi_box_ax.set_xlabel('Country')
plt.show()


In [None]:
# Write the merged three-country frame to CSV (row index omitted) and confirm.
merged_csv_path = '../data/final_solar_data.csv'
df_all.to_csv(merged_csv_path, index=False)

print("Final merged dataset saved as 'final_solar_data.csv'.")


In [None]:
# Time series of GHI, DNI, and DHI for the combined dataset.
#
# df_all is the three country frames stacked one after another, so drawing a
# single line through it joins the last sample of one country to the first of
# the next and mixes unrelated sites into one trace. Draw one panel per
# irradiance component with one line per country instead.
#
# 'Timestamp' is presumably still plain strings (it is loaded without date
# parsing); convert it on a small working copy so matplotlib gets a real time
# axis rather than one category per row. df_all itself is not modified.
irradiance_columns = ('GHI', 'DNI', 'DHI')
time_series_frame = df_all[['Country', 'Timestamp', *irradiance_columns]].assign(
    Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'])
)

fig, axes = plt.subplots(len(irradiance_columns), 1, figsize=(12, 9), sharex=True)
for ax, irradiance_column in zip(axes, irradiance_columns):
    for country_name, country_rows in time_series_frame.groupby('Country'):
        ax.plot(country_rows['Timestamp'], country_rows[irradiance_column], label=country_name)
    ax.set_ylabel(f'{irradiance_column} (W/m²)')
    ax.legend(title='Country')

# The panels share the x axis, so only the bottom one is labelled.
axes[-1].set_xlabel('Timestamp')
axes[-1].tick_params(axis='x', rotation=45)
fig.suptitle('Time Series of Solar Radiation (All Countries)')
fig.tight_layout()
plt.show()


In [None]:
# Pairwise correlation of the irradiance components, ambient temperature
# (Tamb) and wind speed (WS) across the merged dataset.
merged_corr_columns = ['GHI', 'DNI', 'DHI', 'Tamb', 'WS']
corr_matrix = df_all.loc[:, merged_corr_columns].corr()

# Render the matrix as an annotated heatmap
merged_heatmap_ax = sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm')
merged_heatmap_ax.set_title('Correlation Matrix (All Countries)')
plt.show()


In [None]:
# Wind analysis: wind speed (WS) against wind direction (WD), coloured by country.
wind_ax = sns.scatterplot(data=df_all, x='WD', y='WS', hue='Country')
wind_ax.set_title('Wind Speed vs. Wind Direction (All Countries)')
wind_ax.set_xlabel('Wind Direction (°)')
wind_ax.set_ylabel('Wind Speed (m/s)')
wind_ax.legend(title='Country')
plt.show()


In [None]:
# Ambient temperature (Tamb) against relative humidity (RH), coloured by country.
climate_ax = sns.scatterplot(data=df_all, x='RH', y='Tamb', hue='Country')
climate_ax.set_title('Temperature vs. Relative Humidity (All Countries)')
climate_ax.set_xlabel('Relative Humidity (%)')
climate_ax.set_ylabel('Ambient Temperature (°C)')
climate_ax.legend(title='Country')
plt.show()


In [None]:
# Distribution of GHI over the merged dataset (30 bins, default colour).
ghi_fig, ghi_ax = plt.subplots()
ghi_ax.hist(df_all['GHI'], bins=30, alpha=0.7, label='GHI')
ghi_ax.set_title('Histogram of GHI')
ghi_ax.set_xlabel('GHI (W/m²)')
ghi_ax.set_ylabel('Frequency')
plt.show()

# Distribution of DNI over the merged dataset (30 bins, orange).
dni_fig, dni_ax = plt.subplots()
dni_ax.hist(df_all['DNI'], bins=30, alpha=0.7, label='DNI', color='orange')
dni_ax.set_title('Histogram of DNI')
dni_ax.set_xlabel('DNI (W/m²)')
dni_ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Save the GHI time series as an image file.
#
# A bare plt.savefig() in a cell of its own saves whatever figure is current
# at that moment. Every earlier plot has already been passed to plt.show(),
# which releases the figure, so the call would write an empty image. The most
# recent plots were also histograms, not a time series. Build the figure in
# this cell and save it through its own handle before showing it.
#
# 'Timestamp' is presumably still plain strings (it is loaded without date
# parsing); convert it on a small working copy for a real time axis. One line
# per country avoids joining unrelated sites into a single trace.
ghi_series_frame = df_all[['Country', 'Timestamp', 'GHI']].assign(
    Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'])
)

fig, ax = plt.subplots(figsize=(12, 6))
for country_name, country_rows in ghi_series_frame.groupby('Country'):
    ax.plot(country_rows['Timestamp'], country_rows['GHI'], label=country_name)

ax.set_xlabel('Timestamp')
ax.set_ylabel('GHI (W/m²)')
ax.set_title('GHI over Time by Country')
ax.legend(title='Country')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()

# Save before show(): after show() the figure may no longer be available.
fig.savefig('../data/ghi_time_series.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Per-country mean of the key variables, printed and exported as CSV.
summary_columns = ['GHI', 'DNI', 'DHI', 'Tamb', 'WS']
overall_means = df_all.groupby('Country')[summary_columns].mean()

print("Average values by country:")
print(overall_means)

# index=True keeps the country names (the group index) as the first CSV column.
summary_csv_path = '../data/summary_statistics.csv'
overall_means.to_csv(summary_csv_path, index=True)
print("Summary statistics saved as 'summary_statistics.csv'")


In [None]:
# Write the merged frame to CSV under the "final cleaned" file name (row
# index omitted). NOTE(review): df_all is also exported earlier as
# 'final_solar_data.csv'; confirm whether both files are needed.
final_cleaned_path = '../data/final_cleaned_solar_data.csv'
df_all.to_csv(final_cleaned_path, index=False)
print("Final merged dataset saved as 'final_cleaned_solar_data.csv'")


In [None]:
# Final check of the merged frame: leading rows, total size, rows per country.
print("First few rows of the merged dataset:")
final_preview = df_all.head()
print(final_preview)

final_row_count = len(df_all)
print("\nTotal rows in the merged dataset:", final_row_count)

final_rows_by_country = df_all['Country'].value_counts()
print("\nNumber of rows per country:")
print(final_rows_by_country)


In [None]:
# Bar chart of the mean GHI per country.
ghi_by_country = df_all.groupby('Country')['GHI']
mean_ghi = ghi_by_country.mean()

mean_ghi.plot.bar(
    title='Average GHI by Country',
    ylabel='GHI (W/m²)',
    xlabel='Country',
    color=['blue', 'orange', 'green'],
)
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns

# One box per country showing the spread of GHI readings.
final_box_ax = sns.boxplot(data=df_all, x='Country', y='GHI')
final_box_ax.set_title('Distribution of GHI by Country')
final_box_ax.set_ylabel('GHI (W/m²)')
final_box_ax.set_xlabel('Country')
plt.show()


In [None]:
# Wind speed (WS) against wind direction (WD), one colour per country.
final_wind_ax = sns.scatterplot(data=df_all, x='WD', y='WS', hue='Country')
final_wind_ax.set_title('Wind Speed vs. Wind Direction (All Countries)')
final_wind_ax.set_xlabel('Wind Direction (°)')
final_wind_ax.set_ylabel('Wind Speed (m/s)')
final_wind_ax.legend(title='Country')
plt.show()
