# Tunisia Olive Oil Dataset Creation

This notebook generates a synthetic dataset for olive oil exports from Tunisia to 42 countries over 180 months (2010-2024).

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import os

np.random.seed(42)
random.seed(42)

print("🚀 Creating CORRECTED dataset (7,560 rows)...")

# 1. Correct dates (2010-2024 = 180 months only)
start_date = datetime(2010, 1, 15)
dates = [start_date + timedelta(days=30*i) for i in range(180)]

# 2. The 42 countries
countries = [
    'Tunisia', 'France', 'Canada', 'USA', 'Italy', 'Germany', 'UK', 'Spain', 
    'Belgium', 'Netherlands', 'Morocco', 'Algeria', 'Libya', 'Japan', 'China',
    'Australia', 'UAE', 'Saudi Arabia', 'Qatar', 'Kuwait', 'Egypt', 'Jordan',
    'Lebanon', 'Turkey', 'Greece', 'Portugal', 'Sweden', 'Norway', 'Denmark',
    'Austria', 'Switzerland', 'Poland', 'Czech Republic', 'Hungary', 'Romania',
    'South Africa', 'Brazil', 'Mexico', 'Argentina', 'Chile', 'Peru', 'Colombia'
]

# 3. Season function
def get_season(month):
    if month in [3, 4]: return 'Ramadan'
    elif month in [6, 7, 8]: return 'Summer'
    elif month in [12, 1, 2]: return 'Winter'
    else: return 'Spring'

# 4. Create the correct data (42 × 180 = 7,560 rows only!)
data = []
for country in countries:
    for date in dates:  # No extra loop!
        # Realistic quantities by country
        if country in ['France', 'USA', 'Canada', 'Italy']:
            export = random.randint(20000, 80000)  # Large countries
        elif country in ['China', 'Germany', 'UK']:
            export = random.randint(15000, 60000)
        else:
            export = random.randint(5000, 40000)  # Small countries
        
        row = {
            'Date': date.strftime('%Y-%m-%d'),
            'Country': country,
            'Production_Tons': random.randint(150000, 350000),
            'Export_Tons': export,
            'USD_Price': round(random.uniform(5.5, 18.5), 2),
            'Month': date.month,
            'Year': date.year,
            'Season': get_season(date.month)
        }
        data.append(row)

df = pd.DataFrame(data).sample(frac=1).reset_index(drop=True)

# 5. Save
os.makedirs('../data/raw', exist_ok=True)
df.to_csv('../data/raw/tunisia_olive_oil_dataset.csv', index=False)

print(f"✅ CORRECTED Dataset: {len(df):,} rows | {df['Country'].nunique()} countries")
print("\n📊 First 5 rows:")
print(df[['Date', 'Country', 'Export_Tons', 'USD_Price', 'Season']].head())
print(f"\n💾 Saved: ../data/raw/tunisia_olive_oil_dataset.csv")
print("\n🎉 NOW RUN 02_eda.ipynb AGAIN!")

🚀 Creating CORRECTED dataset (7,560 rows)...
✅ CORRECTED Dataset: 7,560 rows | 42 countries

📊 First 5 rows:
         Date    Country  Export_Tons  USD_Price  Season
0  2013-06-28  Argentina        29327      10.46  Summer
1  2019-08-26    Romania        37759      13.41  Summer
2  2016-11-09     France        61251       6.20  Spring
3  2022-09-09        USA        52936       7.69  Spring
4  2018-08-01      Libya        27286      16.44  Summer

💾 Saved: ../data/raw/tunisia_olive_oil_dataset.csv

🎉 NOW RUN 02_eda.ipynb AGAIN!
✅ CORRECTED Dataset: 7,560 rows | 42 countries

📊 First 5 rows:
         Date    Country  Export_Tons  USD_Price  Season
0  2013-06-28  Argentina        29327      10.46  Summer
1  2019-08-26    Romania        37759      13.41  Summer
2  2016-11-09     France        61251       6.20  Spring
3  2022-09-09        USA        52936       7.69  Spring
4  2018-08-01      Libya        27286      16.44  Summer

💾 Saved: ../data/raw/tunisia_olive_oil_dataset.csv

🎉 NOW R