<a href="https://colab.research.google.com/github/Harish34272/Adaptive_feed/blob/sub-branch/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from joblib import load
import joblib

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the data and
# output paths configured below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder on the mounted Drive that is globbed for the combined CSV inputs.
BASE_DIR = Path("/content/drive/MyDrive/adaptive-feeding/combined_csv")

# Everything this notebook writes goes below this folder.
OUT_DIR = BASE_DIR.joinpath("preprocessing")
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Name of the price column; it is appended to every animal's feature set.
COST_COL = "Cost/kg"

# Per-animal feature groups handed to preprocess():
#   energy_cols  - energy values; columns whose name contains "MJ" are first
#                  multiplied by MJ_TO_KCAL, then all are log1p + standard scaled
#   percent_cols - percentages, divided by 100
#   mass_cols    - g/kg quantities, standard scaled
ANIMALS = {
    "Fish": {
        "energy_cols": [
            "DE salmonids kcal/kg",
            "DE salmonids MJ/kg",
        ],
        "percent_cols": [
            "ND salmonids %",
            "ED salmonids %",
        ],
        "mass_cols": [],
    },
    "Pig": {
        "energy_cols": [
            "DE growing pig kcal/kg",
            "ME growing pig kcal/kg",
            "NE growing pig kcal/kg",
            "DE adult pig kcal/kg",
            "ME adult pig kcal/kg",
            "NE adult pig kcal/kg",
            "DE growing pig MJ/kg",
        ],
        "percent_cols": [],
        "mass_cols": [],
    },
    "Poultry": {
        "energy_cols": [
            "AMEn cockerel kcal/kg",
            "AMEn broiler kcal/kg",
            "AMEn cockerel MJ/kg",
            "AMEn broiler MJ/kg",
            "AMEn cockerel ground kcal/kg",
            "AMEn cockerel pelleted kcal/kg",
            "AMEn broiler ground kcal/kg",
            "AMEn broiler pelleted kcal/kg",
        ],
        "percent_cols": [],
        "mass_cols": [
            "Avail. P cockerel g/kg",
            "Avail. P broiler g/kg",
        ],
    },
    "Rabbit": {
        "energy_cols": [
            "DE (kcal) kcal/kg",
            "ME rabbit kcal/kg",
            "DE rabbit MJ/kg",
            "ME rabbit MJ/kg",
        ],
        "percent_cols": [
            "ND rabbit %",
            "ED rabbit %",
        ],
        "mass_cols": [],
    },
}


In [None]:
# Seed passed as random_state to every split so reruns give the same partitions.
RNG = 42

# Dataset partition fractions (train / validation / test).
TRAIN_SIZE = 0.8
VAL_SIZE = 0.1
TEST_SIZE = 0.1

# Presumably z-score thresholds for outlier detection / clipping; they are not
# referenced by the cells shown here -- TODO confirm where they are used.
Z_DETECT = 2
Z_CLIP = 2

# Multiplier applied to MJ/kg energy columns to express them in kcal/kg.
MJ_TO_KCAL = 239.005736

In [None]:
# Output root plus one sub-folder per artefact type (data splits and fitted scalers).
dirs = [OUT_DIR] + [OUT_DIR / name for name in ("train", "test", "val", "scalers")]
for out_subdir in dirs:
    out_subdir.mkdir(parents=True, exist_ok=True)

In [None]:
def _log_standard_pipeline():
    """Build an unfitted pipeline that applies log1p and then standard scaling."""
    return Pipeline([
        ('log1p', FunctionTransformer(np.log1p, validate=False)),
        ('Standard Scaler', StandardScaler())
    ])


def preprocess(df:pd.DataFrame , energy_cols , percent_cols , mass_cols , cost_col):
    """Scale the selected feed columns and add missing-value indicator columns.

    Steps, in order:
      1. energy columns whose name contains "MJ" are multiplied by MJ_TO_KCAL;
      2. percent columns are divided by 100;
      3. a 0/1 indicator (1 = value present) is recorded for every selected
         column, then missing values are replaced by 0;
      4. energy columns and the cost column go through log1p + StandardScaler,
         mass columns through StandardScaler only. Percent columns are not
         scaled any further.

    All transformers are fitted on the rows of ``df`` passed in.

    Parameters
    ----------
    df : pd.DataFrame
        Source rows; must contain every column named in the arguments below.
    energy_cols, percent_cols, mass_cols : sequence of str
        Column names per feature group. Any group may be empty.
    cost_col : str
        Name of the cost column.

    Returns
    -------
    out : pd.DataFrame
        The transformed feature columns followed by one ``"<column>_mask"``
        indicator column per feature column.
    scalers : dict
        Fitted pipelines under the keys ``'energy'``, ``'mass'`` and ``'cost'``.
        The ``'energy'`` / ``'mass'`` entry is ``None`` when that group is empty.

    Raises
    ------
    KeyError
        If a requested column is missing from ``df``.
    """
    # Accept any sequence (list, tuple, ...) for the column groups.
    energy_cols = list(energy_cols)
    percent_cols = list(percent_cols)
    mass_cols = list(mass_cols)

    feat = df[energy_cols + percent_cols + mass_cols + [cost_col]].copy()

    # Bring MJ-based energy columns onto the kcal scale.
    mj_cols = [c for c in energy_cols if "MJ" in c]
    if mj_cols:
        feat[mj_cols] = feat[mj_cols] * MJ_TO_KCAL

    # Convert percentages to fractions in [0, 1].
    if percent_cols:
        feat[percent_cols] = feat[percent_cols] / 100.0

    # Record which values were present before the gaps are filled with 0.
    mask = feat.notnull().astype(int)
    mask.columns = [f"{c}_mask" for c in feat.columns]
    feat = feat.fillna(0)

    # StandardScaler cannot be fitted on zero features, so an empty group is
    # skipped and reported as None (same convention for energy and mass).
    energy_pipe = None
    if energy_cols:
        energy_pipe = _log_standard_pipeline()
        feat[energy_cols] = energy_pipe.fit_transform(feat[energy_cols])

    mass_pipe = None
    if mass_cols:
        mass_pipe = Pipeline([('Standard Scaler', StandardScaler())])
        feat[mass_cols] = mass_pipe.fit_transform(feat[mass_cols])

    cost_pipe = _log_standard_pipeline()
    # The pipeline returns a single-column 2-D result; flatten it for the column assignment.
    feat[cost_col] = np.asarray(cost_pipe.fit_transform(feat[[cost_col]])).ravel()

    out = pd.concat([feat, mask], axis=1)
    scalers = {'energy': energy_pipe, 'mass': mass_pipe, 'cost': cost_pipe}
    return out, scalers

In [None]:
# Read every combined CSV once; the per-animal loop below only filters these frames.
csv_paths = sorted(BASE_DIR.glob("*_combined1.csv"))
if not csv_paths:
    raise FileNotFoundError(f"No '*_combined1.csv' files found in {BASE_DIR}")

source_frames = []
for csv_file in csv_paths:
    df = pd.read_csv(csv_file)
    df.columns = df.columns.str.strip()  # header cells may carry stray whitespace
    source_frames.append(df)

all_data = {}     # animal -> processed DataFrame
all_scalers = {}  # animal -> fitted pipelines returned by preprocess()
for animal, specs in ANIMALS.items():
    dfs = [frame[frame["Animal"] == animal] for frame in source_frames]
    full_df = pd.concat(dfs, ignore_index=True)
    if full_df.empty:
        raise ValueError(f"No rows found for animal {animal!r} in {BASE_DIR}")

    # Preprocess features.
    # NOTE(review): the scalers are fitted on all rows of the animal, i.e. before
    # the train/val/test split done later in this notebook -- confirm that is intended.
    X_scaled, scalers = preprocess(
        full_df, specs["energy_cols"], specs["percent_cols"], specs["mass_cols"], COST_COL
    )

    # Keep and persist this animal's fitted transformers right away: `scalers`
    # is rebound on every iteration, so afterwards it only refers to the last animal.
    all_scalers[animal] = scalers
    joblib.dump(scalers, OUT_DIR / "scalers" / f"{animal}_scalers.pkl")

    # Replace raw with scaled for the selected columns. Plain column assignment
    # swaps the columns out instead of writing floats into the existing (possibly
    # integer or object) columns in place, as `.loc[:, cols] = ...` would.
    cols_to_replace = specs["energy_cols"] + specs["percent_cols"] + specs["mass_cols"] + [COST_COL]
    full_df[cols_to_replace] = X_scaled[cols_to_replace]

    # Select final columns.
    # NOTE(review): the "<column>_mask" indicators returned by preprocess() are
    # not part of this selection, so they never reach the saved CSVs.
    cols = ["Animal", "Feed Category", "Feed"] + cols_to_replace
    processed_df = full_df[cols].copy()
    all_data[animal] = processed_df

    # Save processed
    processed_df.to_csv(OUT_DIR / f"{animal}_processed.csv", index=False)
    print(f"Saved processed: {animal} ({len(processed_df)} rows)")


Saved processed: Fish (153 rows)
Saved processed: Pig (125 rows)
Saved processed: Poultry (118 rows)
Saved processed: Rabbit (125 rows)


In [None]:
from sklearn.model_selection import train_test_split

def safe_stratified_split(df, strat_col, test_size, random_state):
    """Split ``df`` in two, stratifying on ``strat_col`` whenever that is feasible.

    A stratified split is used only if all of the following hold:
      * no label in ``strat_col`` is missing,
      * every label occurs at least twice,
      * both resulting parts have at least as many rows as there are labels.
    Otherwise the function falls back to a plain shuffled split, so callers get
    a split instead of a ``ValueError`` from scikit-learn.

    Parameters
    ----------
    df : pd.DataFrame
        Rows to split.
    strat_col : str
        Column holding the class labels to stratify on.
    test_size : float, int or None
        Forwarded to ``train_test_split``: a fraction, an absolute row count,
        or ``None`` for scikit-learn's default of 0.25.
    random_state : int
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    list
        ``[first_part, second_part]`` as returned by ``train_test_split``.
    """
    labels = df[strat_col]
    class_counts = labels.value_counts(dropna=False)
    n_classes = len(class_counts)
    n_rows = len(df)

    # Mirror scikit-learn's sizing: a fractional test_size is rounded up and
    # the remaining rows form the first part.
    requested = 0.25 if test_size is None else test_size
    if isinstance(requested, (float, np.floating)):
        n_second = int(np.ceil(requested * n_rows))
    else:
        n_second = int(requested)
    n_first = n_rows - n_second

    can_stratify = (
        n_classes > 0
        and not labels.isna().any()
        and class_counts.min() >= 2
        and min(n_first, n_second) >= n_classes
    )

    return train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
        stratify=labels if can_stratify else None,
    )


In [None]:
from sklearn.model_selection import train_test_split

# Rows held out from training, and the share of that hold-out that becomes the
# test set. Both follow the configured fractions instead of a hard-coded 0.5.
holdout_size = TEST_SIZE + VAL_SIZE
test_share_of_holdout = TEST_SIZE / holdout_size

for animal, df in all_data.items():
    # First split: train vs. hold-out (validation + test).
    train_df, temp_df = safe_stratified_split(
        df, 'Feed Category', test_size=holdout_size, random_state=RNG
    )
    # Second split: hold-out into validation and test.
    val_df, test_df = safe_stratified_split(
        temp_df, 'Feed Category', test_size=test_share_of_holdout, random_state=RNG
    )

    # Reset indices (new frames rather than in-place edits of the split results).
    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # Every row must land in exactly one partition.
    assert len(train_df) + len(val_df) + len(test_df) == len(df)

    # Save splits
    train_df.to_csv(OUT_DIR / "train" / f"{animal}_train.csv", index=False)
    val_df.to_csv(OUT_DIR / "val" / f"{animal}_val.csv", index=False)
    test_df.to_csv(OUT_DIR / "test" / f"{animal}_test.csv", index=False)

    # Deliberately no joblib.dump(scalers, ...) here: at this point `scalers` is
    # the single binding left over from the last iteration of the preprocessing
    # loop, so writing it under each animal's file name would store one animal's
    # transformers for all of them. Scalers have to be persisted where they are fitted.
    print(f"{animal}: {len(train_df)} train / {len(val_df)} val / {len(test_df)} test")


Fish: 122 train / 15 val / 16 test
Pig: 100 train / 12 val / 13 test
Poultry: 94 train / 12 val / 12 test
Rabbit: 100 train / 12 val / 13 test
