## 1️⃣ Import Libraries

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sys
from pathlib import Path

# Make the project root importable so that the `src` package resolves.
# The root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a direct
# sub-directory of the project — confirm for your setup.
project_root = Path.cwd().parent
project_root_str = str(project_root)

# Re-running this cell must not keep appending the same entry to sys.path.
if project_root_str not in sys.path:
    sys.path.append(project_root_str)

from src.data_loader import load_raw_data, get_data_info

print("✓ Libraries imported successfully")

## 2️⃣ Load Dataset

In [None]:
# Read the raw dataset through the project's loader and report its size.
df = load_raw_data()

row_count, column_count = df.shape
print(f"Dataset loaded: {row_count} rows, {column_count} columns")

## 3️⃣ Basic Information

In [None]:
# Preview the first ten rows (rendered as the cell's output)
df.iloc[:10]

In [None]:
# Print a concise summary of the DataFrame: one line per column with its
# non-null count and dtype, plus overall memory usage.
df.info()

In [None]:
# Summarise the dataset with the project's get_data_info helper and print a
# small report.
# NOTE(review): the keys read below ('shape', 'columns', 'date_range',
# 'countries', 'missing_values') are inferred from their use in this cell —
# confirm against src.data_loader.get_data_info.
info = get_data_info(df)

banner = "=" * 60
print(banner)
print("DATASET INFORMATION")
print(banner)
print(f"\nShape: {info['shape']}")
# str() each label so that non-string column names cannot break the join.
print(f"\nColumns: {', '.join(map(str, info['columns']))}")
print(f"\nDate Range: {info['date_range']['min']} to {info['date_range']['max']}")
print(f"\nNumber of Countries: {info['countries']}")
print("\nMissing Values:")

# Only columns that actually have gaps are worth listing.
columns_with_gaps = {
    col: count for col, count in info['missing_values'].items() if count > 0
}
if columns_with_gaps:
    for col, count in columns_with_gaps.items():
        print(f"  {col}: {count}")
else:
    # Without this line a complete dataset leaves a dangling, empty heading.
    print("  None")

## 4️⃣ Descriptive Statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max), rendered as the
# cell's output. With no arguments pandas restricts the summary to numeric
# columns (recent versions also include datetime columns).
df.describe()

In [None]:
# Country column: number of distinct values and the ten most frequent ones
country_column = df['Country']
print("Unique Countries:", country_column.nunique())

print("\nTop 10 Countries by Record Count:")
records_per_country = country_column.value_counts()
print(records_per_country.head(10))

In [None]:
# How many records fall into each season
records_per_season = df['Season'].value_counts()
print("Season Distribution:")
print(records_per_season)

## 5️⃣ Distribution Visualizations

In [None]:
# Histogram of the 'Production_Tons' column, split into 50 bins
production_hist_options = dict(
    x='Production_Tons',
    nbins=50,
    title='Production Distribution',
    labels={'Production_Tons': 'Production (Tons)'},
    color_discrete_sequence=['#2E7D32'],
)
fig = px.histogram(df, **production_hist_options)
fig.show()

In [None]:
# Histogram of the 'Export_Tons' column, split into 50 bins
export_hist_options = dict(
    x='Export_Tons',
    nbins=50,
    title='Export Distribution',
    labels={'Export_Tons': 'Export (Tons)'},
    color_discrete_sequence=['#1976D2'],
)
fig = px.histogram(df, **export_hist_options)
fig.show()

In [None]:
# Histogram of the 'USD_Price' column, split into 50 bins
price_hist_options = dict(
    x='USD_Price',
    nbins=50,
    title='USD Price Distribution',
    labels={'USD_Price': 'Price (USD)'},
    color_discrete_sequence=['#F57C00'],
)
fig = px.histogram(df, **price_hist_options)
fig.show()

## 6️⃣ Time Series Analysis

In [None]:
# Production over time.
# 'Date' is parsed to datetime on a copy before sorting: at this point in the
# notebook the column may still hold strings, and sorting unparsed date
# strings orders them lexicographically rather than chronologically.
# df itself is left untouched; only df_sorted carries the parsed column.
df_sorted = df.assign(Date=pd.to_datetime(df['Date'])).sort_values('Date')

fig = px.line(
    df_sorted,
    x='Date',
    y='Production_Tons',
    title='Production Over Time',
    labels={'Production_Tons': 'Production (Tons)', 'Date': 'Date'}
)
fig.show()

In [None]:
# Mean production per calendar month.
# 'Date' is converted to datetime and stored back on df so that the .dt
# accessor below is available.
df['Date'] = pd.to_datetime(df['Date'])
month_period = df['Date'].dt.to_period('M')
monthly_production = df.groupby(month_period)['Production_Tons'].mean()

# Swap the period index for timestamps so the x-axis is plotted as dates.
monthly_production.index = monthly_production.index.to_timestamp()

monthly_axis_labels = {'x': 'Month', 'y': 'Avg Production (Tons)'}
fig = px.line(
    x=monthly_production.index,
    y=monthly_production.values,
    title='Average Monthly Production',
    labels=monthly_axis_labels,
)
fig.show()

## 7️⃣ Country-wise Analysis

In [None]:
# Rank countries by their mean production and keep the ten highest
mean_production_by_country = df.groupby('Country')['Production_Tons'].mean()
top_countries = mean_production_by_country.sort_values(ascending=False).head(10)

# The bar height and the bar colour both encode the same mean value.
country_axis_labels = {'x': 'Country', 'y': 'Avg Production (Tons)'}
fig = px.bar(
    x=top_countries.index,
    y=top_countries.values,
    title='Top 10 Countries by Average Production',
    labels=country_axis_labels,
    color=top_countries.values,
    color_continuous_scale='Greens',
)
fig.show()

In [None]:
# Production through time for a hand-picked set of countries, one line each
selected_countries = ['Italy', 'Spain', 'Greece', 'Turkey', 'Tunisia']
is_selected = df['Country'].isin(selected_countries)
df_selected = df.loc[is_selected].copy()

# Order rows by date so each country's line is drawn in date order.
selected_by_date = df_selected.sort_values('Date')
fig = px.line(
    selected_by_date,
    x='Date',
    y='Production_Tons',
    color='Country',
    title='Production Trends by Major Producers',
    labels={'Production_Tons': 'Production (Tons)'},
)
fig.show()

## 8️⃣ Correlation Analysis

In [None]:
# Pairwise correlations between the listed columns, drawn as a heatmap.
# NOTE(review): every name in numeric_cols is expected to be a column of df —
# confirm that 'Month' and 'Year' are present in the loaded data.
numeric_cols = ['Production_Tons', 'Export_Tons', 'USD_Price', 'Month', 'Year']
corr_matrix = df.loc[:, numeric_cols].corr()

heatmap_options = dict(
    text_auto='.2f',  # annotate each cell with the value to two decimals
    title='Correlation Heatmap',
    color_continuous_scale='RdBu_r',
    aspect='auto',
)
fig = px.imshow(corr_matrix, **heatmap_options)
fig.show()

In [None]:
# Scatter of export volume against production volume, coloured by country
scatter_axis_labels = {
    'Production_Tons': 'Production (Tons)',
    'Export_Tons': 'Export (Tons)',
}
fig = px.scatter(
    df,
    x='Production_Tons',
    y='Export_Tons',
    color='Country',
    title='Production vs Export Volume',
    labels=scatter_axis_labels,
    opacity=0.6,  # semi-transparent markers
)
fig.show()

## 9️⃣ Seasonal Analysis

In [None]:
# Mean production per season, ordered from highest to lowest
mean_production_by_season = df.groupby('Season')['Production_Tons'].mean()
season_production = mean_production_by_season.sort_values(ascending=False)

# The bar height and the bar colour both encode the same mean value.
season_axis_labels = {'x': 'Season', 'y': 'Avg Production (Tons)'}
fig = px.bar(
    x=season_production.index,
    y=season_production.values,
    title='Average Production by Season',
    labels=season_axis_labels,
    color=season_production.values,
    color_continuous_scale='Viridis',
)
fig.show()

In [None]:
# Box plot of 'USD_Price' for each season, one colour per season
price_box_options = dict(
    x='Season',
    y='USD_Price',
    title='Price Distribution by Season',
    labels={'USD_Price': 'Price (USD)'},
    color='Season',
)
fig = px.box(df, **price_box_options)
fig.show()

## 📝 Key Insights

**From this exploration, we can observe:**

1. **Data Quality**: The dataset contains comprehensive records across multiple countries and years
2. **Production Patterns**: Clear seasonal variations in olive oil production
3. **Country Distribution**: Major producers include Italy, Spain, Greece, Turkey, and Tunisia
4. **Correlations**: Strong relationships between production, exports, and pricing
5. **Temporal Trends**: Long-term trends visible in the time series data

**Next Steps:**
- Preprocessing and feature engineering (Notebook 02)
- Model training and evaluation (Notebook 03)