# Auto Notebook for ARFF

In [None]:

# Imports & Setup
import os, re, io, json
import numpy as np
import pandas as pd

# SciPy is optional. HAVE_SCIPY records whether its ARFF reader could be
# imported so later code can decide whether to use it.
try:
    from scipy.io import arff as scipy_arff
except Exception:
    HAVE_SCIPY = False
else:
    HAVE_SCIPY = True

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Input file location and an optional manual override for the target column.
DATA_PATH = r"/mnt/data/air_quality_realtime_large (1).arff"
TARGET_FALLBACK = None  # set a column name here to override

# Echo the configuration so the cell output shows what will be used.
for _label, _value in (("Data path:", DATA_PATH), ("SciPy available:", HAVE_SCIPY)):
    print(_label, _value)


In [None]:
# Print a fixed greeting.
print("Hello from notebook")

In [None]:

def parse_arff_lightweight(text):
    """Parse dense ARFF text into a DataFrame with a small built-in parser.

    Handles:
      - ``%`` comment lines and blank lines (skipped)
      - ``@relation`` (ignored)
      - ``@attribute name type`` where ``name`` may be bare, single-quoted or
        double-quoted (quoted names may contain spaces)
      - ``@data`` followed by comma-separated rows; commas inside quoted
        values do not split the value

    Not handled: sparse ``{index value, ...}`` rows and backslash escapes.

    Args:
        text: Full contents of an ARFF file.

    Returns:
        ``(df, attributes)`` where ``attributes`` is a list of
        ``(name, type_str)`` tuples in declaration order. Columns whose type
        starts with ``numeric``, ``real`` or ``integer`` (case-insensitive)
        are converted with ``pd.to_numeric(errors="coerce")``; all other
        columns keep their text values. An unquoted ``?`` becomes NaN in
        every column, while a quoted ``'?'`` stays the literal string.

    Raises:
        ValueError: If a data row has more values than declared attributes.
            Rows with fewer values are padded with NaN.
    """
    import re

    import numpy as np
    import pandas as pd

    # The name is either quoted (and may then contain whitespace) or a run of
    # non-whitespace characters; everything after it is the type declaration.
    attribute_re = re.compile(
        r"""@attribute\s+(?:'([^']*)'|"([^"]*)"|(\S+))\s+(.+)""",
        re.IGNORECASE,
    )
    numeric_re = re.compile(r"numeric|real|integer", re.IGNORECASE)

    attrs = []
    data_lines = []
    in_data = False
    for ln in (raw.strip() for raw in text.splitlines()):
        if not ln or ln.startswith("%"):
            continue
        low = ln.lower()
        if low.startswith("@relation"):
            continue
        if low.startswith("@attribute"):
            m = attribute_re.match(ln)
            if m:
                # Exactly one of the three name alternatives participates.
                name = next(g for g in m.group(1, 2, 3) if g is not None)
                attrs.append((name, m.group(4).strip()))
            continue
        if low.startswith("@data"):
            in_data = True
            continue
        if in_data:
            data_lines.append(ln)

    colnames = [name for name, _ in attrs]
    width = len(colnames)
    rows = []
    for dl in data_lines:
        # Missing markers become NaN up front so they are never stringified.
        values = [np.nan if v is None else v for v in _split_arff_row(dl)]
        if len(values) > width:
            raise ValueError(
                f"Data row has {len(values)} values but {width} attributes "
                f"were declared: {dl!r}"
            )
        values.extend([np.nan] * (width - len(values)))
        rows.append(values)

    df = pd.DataFrame(rows, columns=colnames)
    for c, (_, t) in zip(df.columns, attrs):
        if numeric_re.match(t):
            df[c] = pd.to_numeric(df[c], errors="coerce")

    return df, attrs


def _split_arff_row(line):
    """Split one ``@data`` line into its field values.

    A single or double quote opens a quoted section only at the start of a
    field, so an apostrophe inside a bare value (``O'Brien``) is plain data.
    Commas inside a quoted section do not end the field.

    Returns a list with one entry per field: ``None`` for an unquoted ``?``,
    otherwise the stripped text with one pair of enclosing quotes removed.
    """
    fields = []
    current = []
    open_quote = None  # quote character of the section being read, if any
    for ch in line:
        if open_quote is not None:
            if ch == open_quote:
                open_quote = None
            current.append(ch)
        elif ch == ",":
            fields.append("".join(current))
            current = []
        else:
            if ch in "'\"" and not "".join(current).strip():
                open_quote = ch
            current.append(ch)
    fields.append("".join(current))

    values = []
    for raw in fields:
        token = raw.strip()
        if token == "?":
            values.append(None)
        elif len(token) >= 2 and token[0] in "'\"" and token[-1] == token[0]:
            values.append(token[1:-1])
        else:
            values.append(token)
    return values


In [None]:

def load_arff_to_df(path):
    """Load an ARFF file into a DataFrame.

    SciPy's ARFF reader is used when it was imported successfully. If it is
    unavailable or raises, the error is printed and the text is parsed with
    ``parse_arff_lightweight`` instead.

    Args:
        path: Location of the ARFF file.

    Returns:
        ``(df, attrs)`` where ``attrs`` is a list of ``(name, type_str)``
        pairs, one per column of ``df``.
    """
    # errors="ignore" drops undecodable bytes instead of failing the load.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()

    def _decode_if_bytes(value):
        # SciPy hands back nominal values as bytes; everything else passes through.
        if isinstance(value, (bytes, bytearray)):
            return value.decode("utf-8")
        return value

    if 'scipy_arff' in globals() and HAVE_SCIPY:
        try:
            data, meta = scipy_arff.loadarff(io.StringIO(text))
            df = pd.DataFrame(data)
            for col in df.columns:
                if df[col].dtype == object:
                    decoded = df[col].apply(_decode_if_bytes)
                    # A missing nominal value arrives as the literal "?"
                    # marker; turn it into NaN so it is treated as missing
                    # rather than as a category of its own.
                    df[col] = decoded.mask(decoded == "?")
            attrs = [(name, str(meta[name])) for name in df.columns]
            return df, attrs
        except Exception as e:
            # Best effort: any SciPy failure falls through to the built-in parser.
            print("SciPy ARFF load failed, falling back. Error:", e)

    return parse_arff_lightweight(text)

# Read the dataset and print a short summary of its shape and column names.
df, ATTRS = load_arff_to_df(DATA_PATH)
print("Shape:", df.shape)
_shown_cols = list(df.columns)[:10]
_more_marker = "..." if df.shape[1] > 10 else ""
print("Columns (first 10):", _shown_cols, _more_marker)
df.head()


In [None]:

# Target detection and basic EDA
# Prefer a column named "class" (case-insensitive); otherwise fall back to the
# last column, or None when the frame has no columns at all.
cols_lower = {str(c).lower(): c for c in df.columns}
if "class" in cols_lower:
    target_col = cols_lower["class"]
else:
    target_col = df.columns[-1] if len(df.columns) else None

# A manual override wins over auto-detection, but only when it names a real
# column. Say so when it does not, instead of silently ignoring it.
if TARGET_FALLBACK:
    if TARGET_FALLBACK in df.columns:
        target_col = TARGET_FALLBACK
    else:
        print(
            f"Warning: TARGET_FALLBACK={TARGET_FALLBACK!r} is not a column; "
            "keeping the auto-detected target."
        )

print("Target column selected:", target_col)

display(df.head(10))
display(df.describe(include="all"))
print("Missing values (top 20):")
print(df.isna().sum().sort_values(ascending=False).head(20))


In [None]:

# Train/test split and baseline model
import numpy as np
import pandas as pd

if target_col is None:
    raise ValueError("Could not determine a target column. Please set `target_col` manually.")

# Rows without a target value can be neither fitted nor scored, so drop them.
missing_target = df[target_col].isna()
if missing_target.any():
    print(f"Dropping {int(missing_target.sum())} rows with a missing target value.")
labelled = df.loc[~missing_target]

X = labelled.drop(columns=[target_col]).copy()
y = labelled[target_col].copy()

# Heuristic: a numeric target with many distinct values is treated as a
# regression problem; everything else is classification.
is_numeric_target = pd.api.types.is_numeric_dtype(y)
n_unique = y.nunique(dropna=True)
task = "classification"
if is_numeric_target and n_unique > 15:
    task = "regression"
print(f"Detected task: {task} (unique target values: {n_unique})")

# Stratified splitting needs at least two samples of every class; fall back to
# a plain random split otherwise instead of failing.
can_stratify = (
    task == "classification"
    and n_unique > 1
    and y.value_counts().min() >= 2
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
    stratify=y if can_stratify else None,
)

num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

# Scale numeric columns (variance only) and one-hot encode the rest;
# categories unseen during fit are ignored at predict time.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=False), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

if task == "classification":
    model = RandomForestClassifier(n_estimators=200, random_state=42)
else:
    model = RandomForestRegressor(n_estimators=200, random_state=42)

pipe = Pipeline(steps=[("prep", preprocess), ("model", model)])
pipe.fit(X_train, y_train)

pred = pipe.predict(X_test)
if task == "classification":
    print("Accuracy:", accuracy_score(y_test, pred))
    print("F1 (macro):", f1_score(y_test, pred, average="macro"))
    print("\nClassification report:\n", classification_report(y_test, pred))
else:
    # Take the square root explicitly: the `squared=False` shortcut of
    # mean_squared_error is not available in recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
    r2 = r2_score(y_test, pred)
    print("RMSE:", rmse)
    print("R^2:", r2)


In [None]:

# Optional: permutation importance
try:
    from sklearn.inspection import permutation_importance
    import pandas as pd

    r = permutation_importance(pipe, X_test, y_test, n_repeats=5, random_state=42, scoring=None)
    # The full pipeline is scored while permuting its raw input columns, so
    # there is one importance per column of X_test, in X_test's column order,
    # not one per one-hot-encoded feature.
    feature_names = list(X_test.columns)
    importances = pd.Series(r.importances_mean, index=feature_names)
    display(importances.sort_values(ascending=False).head(20))
except Exception as e:
    # Best effort: this cell is optional, so report the problem and move on.
    print("Permutation importance skipped:", e)


In [None]:

# Optional export: write the loaded table to CSV without the index column.
clean_csv_path = "/mnt/data/air_quality_from_arff_raw.csv"
df.to_csv(path_or_buf=clean_csv_path, index=False)
print("Saved raw CSV to:", clean_csv_path)


## Next steps
- Set `TARGET_FALLBACK` (or adjust `target_col`) if auto-detection isn't correct.
- Tune model hyperparameters.
- Add domain-specific feature engineering.
- Try cross-validation and additional algorithms.