In [10]:
!pip install pymongo sqlalchemy kaggle pandas matplotlib plotly graphviz -q

In [14]:
import os
import pandas as pd
import numpy as np
# from google.colab import files
import json

# Upload your kaggle.json file
print("=" * 60)
print("STEP 1: Upload your kaggle.json file")
print("=" * 60)
print("Go to: https://www.kaggle.com/settings/account")
print("Scroll down to 'API' section and click 'Create New Token'")
print("This will download kaggle.json")
print("\nNow upload it here:")

# uploaded = files.upload()

# Setup Kaggle credentials
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
print("\n" + "=" * 60)
print("STEP 2: Downloading dataset from Kaggle...")
print("=" * 60)

!kaggle datasets download -d patelris/crop-yield-prediction-dataset
!unzip -o crop-yield-prediction-dataset.zip

# Load the dataset
df = pd.read_csv('yield_df.csv')

print("\n✓ Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
print(df.head())

STEP 1: Upload your kaggle.json file
Go to: https://www.kaggle.com/settings/account
Scroll down to 'API' section and click 'Create New Token'
This will download kaggle.json

Now upload it here:

STEP 2: Downloading dataset from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/patelris/crop-yield-prediction-dataset
License(s): world-bank
crop-yield-prediction-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  crop-yield-prediction-dataset.zip
  inflating: pesticides.csv          
  inflating: rainfall.csv            
  inflating: temp.csv                
  inflating: yield.csv               
  inflating: yield_df.csv            

✓ Dataset loaded successfully!
Dataset shape: (28242, 8)

Columns: ['Unnamed: 0', 'Area', 'Item', 'Year', 'hg/ha_yield', 'average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp']

First few rows:
   Unnamed: 0     Area         Item  Year  hg/ha_yield  \
0           0  Albania        Maize  1990

In [15]:
# ============================================================================
# PART 3: DATA PREPROCESSING AND SCHEMA DESIGN
# ============================================================================

# Section banner: blank line, rule, title, rule.
rule = "=" * 60
print("\n" + rule, "STEP 3: Data Preprocessing", rule, sep="\n")

# Normalise the column headers step by step:
# trim surrounding whitespace, lower-case, then turn spaces into underscores.
header_index = df.columns.str.strip()
header_index = header_index.str.lower()
df.columns = header_index.str.replace(' ', '_')

# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

# Report the size of the cleaned frame, then its dtypes and summary statistics.
print("\nDataset Info:")
print(f"Total records: {df.shape[0]}")
for heading, summary in (
    ("\nColumns and types:", df.dtypes),
    ("\nBasic statistics:", df.describe()),
):
    print(heading)
    print(summary)


STEP 3: Data Preprocessing

Dataset Info:
Total records: 28242

Columns and types:
unnamed:_0                         int64
area                              object
item                              object
year                               int64
hg/ha_yield                        int64
average_rain_fall_mm_per_year    float64
pesticides_tonnes                float64
avg_temp                         float64
dtype: object

Basic statistics:
         unnamed:_0          year    hg/ha_yield  \
count  28242.000000  28242.000000   28242.000000   
mean   14120.500000   2001.544296   77053.332094   
std     8152.907488      7.051905   84956.612897   
min        0.000000   1990.000000      50.000000   
25%     7060.250000   1995.000000   19919.250000   
50%    14120.500000   2001.000000   38295.000000   
75%    21180.750000   2008.000000  104676.750000   
max    28241.000000   2013.000000  501412.000000   

       average_rain_fall_mm_per_year  pesticides_tonnes      avg_temp  
count         

In [None]:
# ============================================================================
# PART 4: CREATE RELATIONAL DATABASE (SQLite)
# ============================================================================

print("\n" + "=" * 60)
print("STEP 4: Creating Relational Database (SQLite)")
print("=" * 60)

import sqlite3
from datetime import datetime

# Create connection (the database file is created in the working directory
# if it does not exist yet).
conn = sqlite3.connect('crop_yield.db')
cursor = conn.cursor()

# SQLite parses FOREIGN KEY clauses but does NOT enforce them unless
# enforcement is switched on for each connection. Without this pragma the
# constraints declared on yield_records below would be silently ignored.
# It must be issued outside a transaction, hence right after connecting.
cursor.execute("PRAGMA foreign_keys = ON")

# Drop existing tables if they exist so the cell can be re-run from scratch.
# The child table (yield_records) goes first so that the parent tables it
# references can be dropped without violating the foreign keys.
cursor.execute("DROP TABLE IF EXISTS yield_records")
cursor.execute("DROP TABLE IF EXISTS crops")
cursor.execute("DROP TABLE IF EXISTS countries")
cursor.execute("DROP TABLE IF EXISTS audit_log")

# Create Countries Table: one row per distinct country name.
cursor.execute("""
CREATE TABLE countries (
    country_id INTEGER PRIMARY KEY AUTOINCREMENT,
    country_name VARCHAR(100) UNIQUE NOT NULL,
    region VARCHAR(50),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")

# Create Crops Table: one row per distinct crop name.
cursor.execute("""
CREATE TABLE crops (
    crop_id INTEGER PRIMARY KEY AUTOINCREMENT,
    crop_name VARCHAR(100) UNIQUE NOT NULL,
    crop_category VARCHAR(50),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")

# Create Yield Records Table: one measurement per row, referencing a country
# and a crop. CHECK constraints reject years outside 1900-2100 and negative
# yields.
cursor.execute("""
CREATE TABLE yield_records (
    record_id INTEGER PRIMARY KEY AUTOINCREMENT,
    country_id INTEGER NOT NULL,
    crop_id INTEGER NOT NULL,
    year INTEGER NOT NULL,
    avg_temp REAL,
    avg_rainfall_mm REAL,
    pesticides_tonnes REAL,
    yield_hg_per_ha REAL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (country_id) REFERENCES countries(country_id),
    FOREIGN KEY (crop_id) REFERENCES crops(crop_id),
    CHECK (year >= 1900 AND year <= 2100),
    CHECK (yield_hg_per_ha >= 0)
)
""")

# Create Audit Log Table: free-form change log keyed by table name and
# record id, with old/new values stored as text.
cursor.execute("""
CREATE TABLE audit_log (
    log_id INTEGER PRIMARY KEY AUTOINCREMENT,
    table_name VARCHAR(50),
    operation VARCHAR(20),
    record_id INTEGER,
    old_value TEXT,
    new_value TEXT,
    changed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")

print("✓ Tables created successfully!")