In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import zscore
from utils import load_and_clean_data, detect_outliers, plot_time_series, create_wind_rose

# Set style
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and 3.8 removed the old names, so 'seaborn' raises OSError on current
# installs. Pick whichever name this Matplotlib actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('husl')

# Load & Inspect Data
# Read the raw CSV and convert the 'Timestamp' column to datetimes in one chain.
df = (
    pd.read_csv('../data/togo-dapaong_qc.csv')
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
)

# Column dtypes and non-null counts (DataFrame.info writes straight to stdout).
print("Dataset Info:")
df.info()

# Descriptive statistics, then the per-column count of missing values.
print("\nSummary Statistics:", df.describe(), sep="\n")

print("\nMissing Values:", df.isna().sum(), sep="\n")

# Handle missing values and detect outliers
numeric_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Each column is processed independently: impute gaps with the column median,
# store its Z-scores in a '<col>_z' helper column, then report how many rows
# lie more than 3 standard deviations from the mean.
for col in numeric_cols:
    z_col = f'{col}_z'
    df[col] = df[col].fillna(df[col].median())
    df[z_col] = zscore(df[col])
    is_outlier = df[z_col].abs() > 3
    outliers = df[is_outlier]
    print(f"\nOutliers in {col}:", len(outliers))

# Time Series Analysis
# Overlay the three radiation columns against the timestamp on one figure.
plt.figure(figsize=(15, 5))
for component in ('GHI', 'DNI', 'DHI'):
    plt.plot(df['Timestamp'], df[component], label=component)
plt.title('Solar Radiation Over Time')
plt.xlabel('Timestamp')
plt.ylabel('Radiation (W/m²)')
plt.legend()
plt.show()

# Daily patterns analysis
# Average each radiation column over all rows that share the same hour of day.
hour_of_day = df['Timestamp'].dt.hour
radiation_cols = ['GHI', 'DNI', 'DHI']
daily_avg = df.groupby(hour_of_day)[radiation_cols].mean()

daily_ax = daily_avg.plot(figsize=(10, 5))
daily_ax.set_title('Average Daily Pattern')
daily_ax.set_xlabel('Hour of Day')
daily_ax.set_ylabel('Radiation (W/m²)')
plt.show()

# Correlation Analysis
# Pairwise correlation of the selected columns, drawn as an annotated heatmap.
corr_features = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'WS']
corr_matrix = df.loc[:, corr_features].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Matrix')
plt.show()

# Scatter plots for relationships
# 2x2 grid; each panel is described by (axes, x column, y column, title).
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
(ax1, ax2), (ax3, ax4) = axes

scatter_panels = [
    (ax1, 'WS', 'GHI', 'Wind Speed vs GHI'),
    (ax2, 'Tamb', 'GHI', 'Temperature vs GHI'),
    (ax3, 'RH', 'GHI', 'Relative Humidity vs GHI'),
    (ax4, 'ModA', 'ModB', 'ModA vs ModB'),
]
for panel_ax, x_col, y_col, panel_title in scatter_panels:
    sns.scatterplot(data=df, x=x_col, y=y_col, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# Wind Analysis
# Wind rose plot: every observation is drawn at (direction, speed) on polar axes.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='polar')
# Matplotlib's polar default puts 0° at East and increases counter-clockwise.
# Wind direction is conventionally a compass bearing (0° = North, clockwise),
# so orient the axes accordingly; otherwise the rose is rotated and mirrored.
# NOTE(review): assumes 'WD' follows that convention — confirm against the
# dataset documentation.
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)
# 'WD' is converted from degrees to radians because polar axes take theta in radians.
ax.scatter(np.radians(df['WD']), df['WS'], alpha=0.5)
plt.title('Wind Rose')
plt.show()

# Wind speed distribution: 30-bin histogram of the 'WS' column.
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='WS', bins=30)
plt.title('Wind Speed Distribution')
plt.show()

# Temperature Analysis
# Box plot of the 'Tamb' column.
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, y='Tamb', ax=box_ax)
box_ax.set_title('Temperature Distribution')
plt.show()

# Temperature vs Humidity
# Scatter of 'Tamb' (x) against 'RH' (y).
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='Tamb', y='RH', ax=scatter_ax)
scatter_ax.set_title('Temperature vs Relative Humidity')
plt.show()

# Monthly patterns
# Tag each row with its calendar month, then average the selected columns per month.
df['Month'] = df['Timestamp'].dt.month
monthly_cols = ['GHI', 'DNI', 'DHI', 'Tamb']
monthly_avg = df.groupby('Month')[monthly_cols].mean()

monthly_ax = monthly_avg.plot(figsize=(12, 6))
monthly_ax.set_title('Monthly Averages')
monthly_ax.set_xlabel('Month')
plt.show()

# Save cleaned data
# Strip every helper column this analysis derived before persisting: the
# '<col>_z' Z-score columns and the 'Month' grouping key. Previously only the
# Z-score columns were removed, so 'Month' leaked into the cleaned CSV.
# Filtering on df.columns keeps this safe when a helper column is absent.
cols_to_drop = [col for col in df.columns if col.endswith('_z') or col == 'Month']
df_clean = df.drop(columns=cols_to_drop)
# index=False: the row index is not written to the CSV.
df_clean.to_csv("../data/togo_clean.csv", index=False)
print("Cleaned data saved successfully")

# Add key findings section
# Headline first, then one line per finding, emitted by a single print call.
key_findings = (
    "1. Data completeness and quality metrics",
    "2. Correlation patterns between variables",
    "3. Daily and seasonal patterns",
    "4. Wind and temperature characteristics",
)
print("\nKey Findings:", *key_findings, sep="\n")

{'cells': [{'cell_type': 'markdown',
   'metadata': {},
   'source': ['# Togo Solar Data Analysis\n',
    '\n',
    '## Data Loading and Cleaning']},
  {'cell_type': 'code',
   'execution_count': 0,
   'metadata': {},
   'source': ['import pandas as pd\n',
    'import numpy as np\n',
    'import matplotlib.pyplot as plt\n',
    'import seaborn as sns\n',
    'from scipy import stats\n',
    'from utils import load_and_clean_data, detect_outliers, plot_time_series, create_wind_rose\n',
    '\n',
    '# Set style\n',
    "plt.style.use('seaborn')\n",
    "sns.set_palette('husl')"]},
  {'cell_type': 'code',
   'execution_count': 0,
   'metadata': {},
   'source': ['# Load data\n',
    "df = pd.read_csv('../data/togo-dapaong_qc.csv')\n",
    "df['Timestamp'] = pd.to_datetime(df['Timestamp'])\n",
    'print(f"Data shape: {df.shape}")\n',
    'df.head()']},
  {'cell_type': 'markdown',
   'metadata': {},
   'source': ['## Data Cleaning and Preprocessing']},
  {'cell_type': 'code',
   'executi