<a href="https://colab.research.google.com/github/Gunasree4/climate-impact-on-crop-disease/blob/main/Crop_disease_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Install (optional) and imports
!pip install -q pandas matplotlib seaborn scikit-learn openpyxl

import os, sys, math, zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             roc_auc_score, roc_curve, mean_squared_error, r2_score)
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

import joblib
from IPython.display import HTML, display
# Apply seaborn's "whitegrid" style once, here, so the plots produced later share it.
sns.set(style="whitegrid")


In [None]:
# Cell 2: Helper functions for EDA, preprocessing and modelling

# Directory that receives every artifact written by this notebook (summary CSV,
# plots, models, reports). The path is absolute and rooted at /content.
RESULTS_DIR = "/content/project_results"
# exist_ok=True makes re-running this cell harmless when the directory already exists.
os.makedirs(RESULTS_DIR, exist_ok=True)

def save_fig(fig, name):
    """Save ``fig`` under ``RESULTS_DIR`` as ``name``, close it, and return the path.

    Parameters
    ----------
    fig : matplotlib figure
        The figure to write to disk.
    name : str
        File name (including extension) relative to ``RESULTS_DIR``.

    Returns
    -------
    str
        Full path of the written file.
    """
    destination = os.path.join(RESULTS_DIR, name)
    # 'tight' trims surplus whitespace around the axes in the saved image.
    fig.savefig(destination, bbox_inches='tight')
    # Close the figure after saving so it is not kept open by pyplot.
    plt.close(fig)
    return destination

def quick_stats(df):
    """Compute per-column summary statistics and persist them as CSV.

    The summary is ``df.describe(include='all')`` transposed so that each row
    describes one column of ``df``. It is written to
    ``RESULTS_DIR/data_summary.csv`` and also returned.
    """
    summary = df.describe(include='all').T
    csv_path = os.path.join(RESULTS_DIR, "data_summary.csv")
    summary.to_csv(csv_path)
    return summary

def detect_target(df):
    """Guess which column of ``df`` is the prediction target.

    Heuristics, applied in order:

    1. The first entry of a fixed list of common target names that is a
       column of ``df`` (exact, case-sensitive match).
    2. Scanning columns from right to left, the first column whose non-null
       values are all 0/1.
    3. The last column.

    Columns that contain no non-null values are skipped in step 2. Without
    that guard an entirely empty column trivially satisfies "all values are
    0/1" and would be picked as the target.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    The label of the chosen column.

    Raises
    ------
    IndexError
        If ``df`` has no columns (from the last-column fallback).
    """
    # Heuristic: common names first, else last column
    common_names = ['target','label','y','outcome','class','disease_risk','Disease_Risk']
    for name in common_names:
        if name in df.columns:
            return name
    # if dataset has a boolean/int column with only 0/1, prefer it
    for col in df.columns[::-1]:
        values = df[col].dropna()
        if values.empty:
            # An all-missing column carries no label information.
            continue
        if values.isin([0,1]).all() and values.nunique()<=2:
            return col
    # else choose last column
    return df.columns[-1]

def task_type_from_target(series):
    """Decide whether a target column describes classification or regression.

    Rules:

    * object, string or categorical dtype -> ``'classification'``;
    * otherwise, at most 20 distinct non-null values -> ``'classification'``;
    * otherwise -> ``'regression'``.

    Dtype checks go through ``pandas.api.types`` so that pandas' dedicated
    string dtype is recognised as text as well; comparing the dtype with
    ``'object'`` alone misses it and would send a text target with more than
    20 labels down the regression path.

    Parameters
    ----------
    series : pandas.Series
        The target column.

    Returns
    -------
    str
        ``'classification'`` or ``'regression'``.
    """
    dtype = series.dtype
    is_text_like = (
        isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )
    if is_text_like:
        return 'classification'
    # Numeric target: treat a small number of distinct values as class labels.
    nunique = series.dropna().nunique()
    if nunique <= 20:
        return 'classification'
    return 'regression'

def plot_missing(df):
    """Bar-plot the fraction of missing values per column and save it.

    Columns are ordered from most to least missing. The figure is written to
    ``missing_values.png`` through ``save_fig`` and the resulting path is
    returned.
    """
    missing_fraction = df.isnull().mean().sort_values(ascending=False)
    # Height grows with the number of columns, clamped to the range [3, 12].
    height = max(3, min(12, len(missing_fraction)/2))
    fig, ax = plt.subplots(figsize=(8, height))
    missing_fraction.plot.bar(ax=ax)
    ax.set_ylabel("Fraction missing")
    ax.set_title("Missing values by column")
    return save_fig(fig, "missing_values.png")

def plot_corr(df, numeric_cols):
    """Save an annotated correlation heatmap of ``numeric_cols``.

    Returns the path of ``correlation_matrix.png``, or ``None`` when fewer
    than two numeric columns are given (no heatmap is drawn in that case).
    """
    enough_columns = len(numeric_cols) >= 2
    if not enough_columns:
        return None
    fig, ax = plt.subplots(figsize=(8,6))
    correlations = df[numeric_cols].corr()
    sns.heatmap(correlations, annot=True, fmt=".2f", ax=ax, cmap="YlGnBu")
    ax.set_title("Correlation matrix (numeric)")
    return save_fig(fig, "correlation_matrix.png")

def value_counts_plots(df):
    """Save one bar chart of the 20 most frequent values per categorical column.

    Categorical columns are those with ``object`` or ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list of str
        Paths of the saved ``vc_<column>.png`` files, in column order.
        Columns without any non-null value are skipped, so the list may be
        shorter than the number of categorical columns.
    """
    cats = df.select_dtypes(include=['object','category']).columns.tolist()
    paths = []
    for c in cats:
        vc = df[c].value_counts().nlargest(20)
        if vc.empty:
            # All values missing: nothing to draw, so do not create a figure.
            continue
        fig, ax = plt.subplots(figsize=(6,4))
        vc.plot.bar(ax=ax)
        ax.set_title(f"Value counts: {c}")
        # A path separator inside the column name would point the file into a
        # sub-directory that does not exist; keep the file in RESULTS_DIR.
        safe_name = str(c)
        for sep in ('/', '\\'):
            safe_name = safe_name.replace(sep, '_')
        paths.append(save_fig(fig, f"vc_{safe_name}.png"))
    return paths

def feature_importance_plot(model, feature_names, prefix="feature_importance"):
    """Plot the (up to) 30 most important features of a fitted model.

    Importance is read from ``model.feature_importances_`` when present,
    otherwise from the absolute value of ``model.coef_``. When ``coef_`` has
    more than one dimension (one row of coefficients per class or output),
    the absolute coefficients are averaged over every axis but the last, so
    there is exactly one score per feature. For a single row this equals the
    flattened coefficients.

    Parameters
    ----------
    model : fitted estimator
    feature_names : sequence of str
        Names aligned with the model's input features.
    prefix : str
        File name stem; the plot is saved as ``<prefix>.png``.

    Returns
    -------
    str or None
        Path of the saved plot, or ``None`` when the model exposes neither
        attribute or has no importances to show.
    """
    if hasattr(model, "feature_importances_"):
        imp = np.asarray(model.feature_importances_).ravel()
    elif hasattr(model, "coef_"):
        imp = np.abs(np.asarray(model.coef_))
        if imp.ndim > 1:
            # Flattening here would yield n_classes * n_features values and
            # index past the end of feature_names.
            imp = imp.mean(axis=tuple(range(imp.ndim - 1)))
    else:
        return None
    if imp.size == 0:
        # Nothing to plot; a zero-height figure cannot be created.
        return None
    idx = np.argsort(imp)[::-1][:30]
    names = [feature_names[i] for i in idx]
    vals = imp[idx]
    fig, ax = plt.subplots(figsize=(8, min(12, len(names)/1.5)))
    sns.barplot(x=vals, y=names, ax=ax)
    ax.set_title("Feature importance")
    path = save_fig(fig, f"{prefix}.png")
    return path


In [None]:
# Cell 3: Main automation function that accepts a dataframe and (optionally) target column
def run_automation(df, target_col=None, save_results=True, test_size=0.2, random_state=42):
    """Run EDA, train baseline models on ``df`` and write a report.

    Steps: write summary statistics and EDA plots, pick the target column and
    task type, build a preprocessing pipeline (median imputation + scaling for
    numeric columns, most-frequent imputation + one-hot encoding for
    object/category columns; every other column is dropped), evaluate three
    models with 5-fold cross-validation and a hold-out split, save each fitted
    pipeline with joblib, and write ``run_report.json`` and ``conclusion.txt``
    into ``RESULTS_DIR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is copied, not modified.
    target_col : label, optional
        Target column. When ``None`` it is chosen by ``detect_target``.
    save_results : bool
        Accepted for interface compatibility. It is currently not consulted:
        artifacts are always written.
    test_size : float
        Fraction of rows held out for the final evaluation.
    random_state : int
        Seed for the split, the CV folds and the tree ensembles.

    Returns
    -------
    dict
        Keys: ``summary_path``, ``target_col``, ``task``, ``missing_plot``,
        ``corr_plot``, ``vc_plots``, ``results`` (per-model metrics) and
        ``best_model`` (highest accuracy for classification, lowest RMSE for
        regression).

    Notes
    -----
    Rows whose target value is missing are excluded from training and
    evaluation (they are still part of the EDA plots).
    """
    report = {}
    df0 = df.copy()
    # Called for its side effect: writes data_summary.csv into RESULTS_DIR.
    quick_stats(df0)
    report['summary_path'] = os.path.join(RESULTS_DIR, "data_summary.csv")

    # detect target if not provided
    if target_col is None:
        target_col = detect_target(df0)
    report['target_col'] = target_col
    y = df0[target_col]
    X = df0.drop(columns=[target_col])

    task = task_type_from_target(y)
    report['task'] = task

    # Basic EDA plots
    report['missing_plot'] = plot_missing(df0)
    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    report['corr_plot'] = plot_corr(df0, numeric_cols)
    report['vc_plots'] = value_counts_plots(df0)

    # A row without a target cannot be used for supervised training or scoring.
    # Forward-filling the target only for stratification would still leave the
    # NaN values in y and make the estimators fail during fit.
    labelled = y.notna()
    n_unlabelled = int((~labelled).sum())
    if n_unlabelled:
        print(f"Dropping {n_unlabelled} row(s) with a missing target value.")
        X = X.loc[labelled]
        y = y.loc[labelled]

    # Preprocessing: simple imputer + encoding + scaling
    num_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()
    # The dense-output switch was renamed from `sparse` to `sparse_output` in
    # scikit-learn; try the current name first and fall back for old releases.
    try:
        onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)
    cat_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', onehot)
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, numeric_cols),
        ('cat', cat_transformer, cat_cols)
    ], remainder='drop')

    # Choose models
    if task == 'classification':
        models = {
            'LogisticRegression': LogisticRegression(max_iter=500),
            'RandomForest': RandomForestClassifier(n_estimators=200, random_state=random_state),
            'GradientBoosting': GradientBoostingClassifier(n_estimators=150, random_state=random_state)
        }
        scoring = 'accuracy'
    else:
        models = {
            'LinearRegression': LinearRegression(),
            'RandomForest': RandomForestRegressor(n_estimators=200, random_state=random_state),
            'GradientBoosting': GradientBoostingRegressor(n_estimators=150, random_state=random_state)
        }
        scoring = 'neg_root_mean_squared_error'

    # Split. Stratify only when every class has at least two rows; otherwise
    # train_test_split refuses to stratify and raises.
    stratify = None
    if task == 'classification' and y.value_counts().min() >= 2:
        stratify = y
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=stratify)

    # The fold generator does not depend on the model, so build it once.
    if task == 'classification':
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    else:
        cv = KFold(n_splits=5, shuffle=True, random_state=random_state)

    results = {}
    feature_names_after_preproc = None

    for name, model in models.items():
        pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
        # cross-validation (best effort: a failure is reported, not fatal)
        try:
            scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
            mean_score = scores.mean()
        except Exception as e:
            print(f"[{name}] cross-validation failed: {e}")
            mean_score = None
        # fit on full train set
        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)
        # metrics
        if task == 'classification':
            acc = accuracy_score(y_test, preds)
            auc = None
            try:
                probas = pipe.predict_proba(X_test)
                if probas.shape[1] == 2:
                    auc = roc_auc_score(y_test, probas[:, 1])
                else:
                    # Multiclass: one-vs-rest AUC over the full probability matrix.
                    auc = roc_auc_score(y_test, probas, multi_class='ovr',
                                        labels=pipe.named_steps['model'].classes_)
            except Exception:
                # AUC is optional (e.g. a class is absent from the test split).
                auc = None
            creport = classification_report(y_test, preds, output_dict=True)
            results[name] = {'cv_score': mean_score, 'accuracy': acc, 'auc': auc, 'report': creport}
        else:
            # Take the square root explicitly; the `squared=False` shortcut is
            # not available in every scikit-learn release.
            rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
            r2 = r2_score(y_test, preds)
            results[name] = {'cv_score': mean_score, 'rmse': rmse, 'r2': r2}

        # feature names extraction (after preprocessor)
        # get feature names if cat onehot used
        try:
            preproc = pipe.named_steps['preprocessor']
            cat_ohe = []
            if len(cat_cols)>0:
                ohe = preproc.named_transformers_['cat'].named_steps['onehot']
                cat_ohe = list(ohe.get_feature_names_out(cat_cols))
            feature_names_after_preproc = list(numeric_cols) + cat_ohe
            # feature importance if available
            model_obj = pipe.named_steps['model']
            fi_path = feature_importance_plot(model_obj, feature_names_after_preproc, prefix=f"fi_{name}")
            if fi_path:
                results[name]['feature_importance_plot'] = fi_path
        except Exception as e:
            # The plot is a nice-to-have; report the problem and carry on.
            print(f"[{name}] feature importance plot skipped: {e}")

        # save model
        joblib.dump(pipe, os.path.join(RESULTS_DIR, f"model_{name}.joblib"))

    # choose best
    if task == 'classification':
        best = max(results.items(), key=lambda kv: kv[1].get('accuracy', -999))
    else:
        # minimize rmse
        best = min(results.items(), key=lambda kv: kv[1].get('rmse', float('inf')))
    report['results'] = results
    report['best_model'] = best[0]
    # save results summary
    pd.Series(report).to_json(os.path.join(RESULTS_DIR, "run_report.json"))

    # write a human-readable conclusion
    with open(os.path.join(RESULTS_DIR, "conclusion.txt"), "w", encoding="utf-8") as f:
        f.write("AUTO ANALYSIS SUMMARY\n")
        f.write(f"Detected target column: {target_col}\n")
        f.write(f"Detected task type: {task}\n")
        f.write(f"Models evaluated: {', '.join(results.keys())}\n")
        f.write(f"Best model: {best[0]}\n\n")
        f.write("Model metrics summary:\n")
        for m, r in results.items():
            f.write(f"\n--- {m} ---\n")
            for k,v in r.items():
                f.write(f"{k}: {v}\n")
    return report


In [None]:
# Cell 4: Upload or load file
# Reads the first uploaded file into `df`. When nothing is uploaded, `df` is
# NOT defined by this cell and has to be loaded manually (e.g. from Drive).
from google.colab import files
print("Option A: upload file from your computer (small files).")
print("Option B: mount Google Drive and give path (for larger files).")

# Try to upload
uploaded = files.upload()
if len(uploaded) == 0:
    # fallback: ask user to mount drive manually
    print("No files uploaded — please mount Google Drive and set file path manually.")
else:
    # take the first uploaded file
    fname = list(uploaded.keys())[0]
    print("Uploaded:", fname)
    # Extension without the leading dot, lower-cased ('' when there is none).
    ext = os.path.splitext(fname)[1].lstrip('.').lower()
    if ext in ['csv','txt']:
        df = pd.read_csv(fname)
    elif ext in ['xlsx','xls']:
        df = pd.read_excel(fname)
    else:
        # Unknown extension: try CSV read
        try:
            df = pd.read_csv(fname)
        except Exception as e:
            # Chain the original error so the real parsing problem stays visible.
            raise RuntimeError("Could not read the uploaded file. Please upload a CSV/XLSX.") from e
    print("Data loaded. Shape:", df.shape)
    display(df.head())


In [1]:
# Mount Google Drive at /content/drive. This is "Option B" from the upload cell:
# it lets larger files be read by path instead of uploaded through the browser.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Cell 5: Run automation on the loaded dataframe
# Leave target_col as None to let the notebook pick the target column itself,
# or set it to a column name (string) to force a specific target.
target_col = None  # <-- change if you want to force target column name (string)
report = run_automation(df, target_col=target_col)

# Summarise the outcome of the run, one line per fact.
print(
    f"Automation finished. Results saved to: {RESULTS_DIR}",
    f"Detected task: {report['task']}",
    f"Detected target column: {report['target_col']}",
    f"Best model: {report['best_model']}",
    sep="\n",
)


In [None]:
# Cell 6: show main files and create zip for download
!ls -lah /content/project_results || true

# Create a zip for easy download
zipf = "/content/project_results.zip"
with zipfile.ZipFile(zipf, 'w', zipfile.ZIP_DEFLATED) as z:
    for root, dirs, files in os.walk(RESULTS_DIR):
        for file in files:
            z.write(os.path.join(root, file), arcname=os.path.join(os.path.relpath(root, RESULTS_DIR), file))

print("Results zipped:", zipf)
from google.colab import files
files.download(zipf)


total 8.0K
drwxr-xr-x 2 root root 4.0K Nov  7 05:28 .
drwxr-xr-x 1 root root 4.0K Nov  7 05:30 ..
Results zipped: /content/project_results.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>