# Task 2: Data Profiling, Cleaning & EDA
**Objective:** Profile, clean, and explore the solar dataset for Benin so it's ready for comparison and region-ranking tasks.

This notebook includes:
- Summary statistics and missing-value report
- Outlier detection and cleaning
- Time series analysis
- Correlation and scatter plots
- Wind and temperature analysis
- Bubble charts

## Importing the dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

## Loading the Data

In [None]:
# Load the raw Benin (Malanville) solar measurements into `df`.
# pandas is already imported as `pd` in the imports cell at the top of the notebook.
# NOTE(review): DATA_PATH is an absolute, machine-specific Windows path; point it
# at your own copy of the CSV (ideally a path relative to the repository) before
# running this notebook on another machine.
DATA_PATH = r"D:\Python\Week_01\data\data\benin-malanville.csv"

df = pd.read_csv(DATA_PATH)

In [None]:
# Report the frame's dimensions, followed by the per-column info() summary
print("Dataset shape:", df.shape)
df.info()

In [None]:
# List every column label in the dataset
column_names = list(df.columns)
print("\nColumn names:", column_names, sep="\n")

In [None]:
# Preview the top of the dataset (first 5 records)
print("\nFirst 5 rows:")
first_rows = df.iloc[:5]
display(first_rows)

In [None]:
# Preview the bottom of the dataset (last 10 records)
print("\nLast 10 rows:")
last_rows = df.iloc[-10:]
display(last_rows)

In [None]:
# Draw 10 rows at random; the fixed random_state keeps the sample reproducible
print("\nRandom sample of 10 rows:")
sampled_rows = df.sample(n=10, random_state=42)
display(sampled_rows)

In [None]:
# Missing-value report. The boolean NA mask and the per-column counts are
# computed once and reused, instead of re-scanning the whole frame for every
# statistic.
na_mask = df.isna()
missing_counts = na_mask.sum()

print("\nMissing values per column:")
print(missing_counts)

# Share of missing entries per column, as a percentage rounded to 2 decimals
print("\nPercentage of missing values:")
print((na_mask.mean() * 100).round(2))

# Keep only the columns that actually contain at least one missing value
missing = missing_counts[missing_counts > 0]
print("\nColumns with missing values:")
print(missing)




In [None]:
# Heatmap of the NA mask: one cell per (row, column) entry, coloured by
# whether the value is missing
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isna(), cbar=False, cmap='viridis', ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

In [None]:
# Default describe() output for the dataset, rendered as a rich table
print("\nSummary statistics for numeric columns:")
numeric_stats = df.describe()
display(numeric_stats)

In [None]:
# Describe the categorical (object / category dtype) columns.
# describe() raises a ValueError when `include` selects no columns, e.g. if this
# cell is re-run after text columns have been converted to other dtypes, so
# check for matching columns first.
cat_cols = df.select_dtypes(include=['object', 'category']).columns
if len(cat_cols) > 0:
    cat_description = df.describe(include=['object', 'category'])
    display(cat_description)
else:
    # Keep `cat_description` defined so later references to it do not fail
    cat_description = pd.DataFrame()
    print("No categorical (object/category) columns to describe.")

In [None]:
# Parse the Timestamp column into datetimes (this overwrites the column in `df`),
# then build the default summary and the all-columns summary.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

numeric_summary = df.describe()
full_summary = df.describe(include='all')

# Transposed so that each column's statistics read across a single row
display(numeric_summary.T)
display(full_summary.T)

# Cell output: the all-columns summary in its original orientation
full_summary

In [None]:
# Flag rows that exactly repeat an earlier row, then report and show them
dup_mask = df.duplicated()
duplicate_rows = df.loc[dup_mask]
print(f"Number of duplicate rows: {len(duplicate_rows)}")
display(duplicate_rows)

## Univariate Analysis

In [None]:
# Univariate analysis of the numeric columns: a transposed summary table,
# followed by a histogram and a boxplot for each column.
numeric_cols = df.select_dtypes(include=np.number).columns
print(df[numeric_cols].describe().T)

for col in numeric_cols:
    # Drop NaNs once and reuse the result for the check and both plots
    values = df[col].dropna()

    # Skip columns that are empty or constant after dropping NaNs:
    # there is no distribution to plot.
    if values.nunique() <= 1:
        continue

    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 4))

    # Histogram (30 bins) with a KDE overlay
    sns.histplot(values, bins=30, kde=True, ax=ax_hist)
    ax_hist.set_title(f'Histogram of {col}')

    # Boxplot
    sns.boxplot(y=values, ax=ax_box)
    ax_box.set_title(f'Boxplot of {col}')

    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure so the loop does not accumulate open figures