**1) Install and upload your dataset**

In [1]:
!pip -q install gradio openpyxl

from google.colab import files
uploaded = files.upload()  # upload your movies_metadata.xlsx or .csv
uploaded_names = list(uploaded.keys())
print("Uploaded:", uploaded_names)

# Use the first uploaded file as dataset
DATA_PATH = f"/content/{uploaded_names[0]}"
print("DATA_PATH:", DATA_PATH)

Saving movies_metadata.xlsx to movies_metadata.xlsx
Uploaded: ['movies_metadata.xlsx']
DATA_PATH: /content/movies_metadata.xlsx


2) **Defination of full pipeline (preprocessing + model + train/load + predict)**

In [2]:
import os, math, ast
from typing import Dict, Any, Optional, List
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

MODEL_PATH = "/content/movie_rating_model.pt"

def safe_parse_date(d):
    try:
        return pd.to_datetime(d)
    except Exception:
        return pd.NaT

def extract_first_genre(g):
    if pd.isna(g):
        return "Unknown"
    if isinstance(g, list):
        if len(g) > 0:
            first = g[0]
            if isinstance(first, dict) and 'name' in first:
                return str(first['name'])
        return "Unknown"
    if isinstance(g, str):
        s = g.strip()
        if s.startswith('[') and s.endswith(']'):
            try:
                items = ast.literal_eval(s)
                if isinstance(items, list) and len(items) > 0:
                    first = items[0]
                    if isinstance(first, dict) and 'name' in first:
                        return str(first['name'])
            except Exception:
                pass
        if '"name"' in s or "'name'" in s:
            import re
            m = re.search(r'"name"\s*:\s*"([^"]+)"', s)
            if not m:
                m = re.search(r"'name'\s*:\s*'([^']+)'", s)
            if m:
                return m.group(1)
        parts = s.split('|')
        return parts[0].strip() if parts else "Unknown"
    return "Unknown"

def clip_rating(x: float) -> float:
    return max(0.0, min(10.0, float(x)))

def float_or_default(s, default):
    s = str(s).strip()
    if s == "" or s.lower() == "na":
        return float(default)
    try:
        return float(s)
    except:
        return float(default)

class MoviePreprocessor:
    def __init__(self):
        self.numeric_feats = [
            "budget", "revenue", "runtime", "popularity",
            "release_year", "release_month", "release_dayofweek"
        ]
        self.categorical_feature_name = "main_genre_top"
        try:
            self.ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            self.ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
        self.scaler = StandardScaler()

        self.top_genres: Optional[List[str]] = None
        self.genre_cols: Optional[List[str]] = None
        self.numeric_medians: Optional[Dict[str, float]] = None
        self.ohe_categories_: Optional[List[List[str]]] = None

    def _prepare_work_df(self, df: pd.DataFrame) -> pd.DataFrame:
        work = pd.DataFrame()
        for c in ["budget", "revenue", "runtime", "popularity"]:
            work[c] = pd.to_numeric(df[c], errors="coerce") if c in df.columns else np.nan

        if "genres" in df.columns:
            work["main_genre"] = df["genres"].apply(extract_first_genre)
        else:
            work["main_genre"] = "Unknown"

        if "release_date" in df.columns:
            parsed = df["release_date"].apply(safe_parse_date)
        else:
            parsed = pd.to_datetime(pd.Series([pd.NaT] * len(df)))
        work["release_date_parsed"] = parsed
        work["release_year"] = work["release_date_parsed"].dt.year.fillna(0).astype(int)
        work["release_month"] = work["release_date_parsed"].dt.month.fillna(0).astype(int)
        work["release_dayofweek"] = work["release_date_parsed"].dt.dayofweek.fillna(-1).astype(int)
        return work

    def fit(self, df: pd.DataFrame, y_col: str = "vote_average"):
        work = self._prepare_work_df(df)
        self.numeric_medians = {}
        for c in ["budget", "revenue", "runtime", "popularity"]:
            med = work[c].median()
            work[c] = work[c].fillna(med)
            self.numeric_medians[c] = float(med)

        self.top_genres = work["main_genre"].value_counts().nlargest(10).index.tolist()
        work[self.categorical_feature_name] = work["main_genre"].apply(
            lambda x: x if x in self.top_genres else "Other"
        )

        genre_arr = self.ohe.fit_transform(work[[self.categorical_feature_name]])
        self.ohe_categories_ = [list(c) for c in self.ohe.categories_]
        self.genre_cols = [f"genre_{c}" for c in self.ohe_categories_[0]]

        self.scaler.fit(work[self.numeric_feats])

    def transform_df(self, df: pd.DataFrame) -> pd.DataFrame:
        if self.top_genres is None or self.genre_cols is None or self.numeric_medians is None:
            raise RuntimeError("Preprocessor not fitted.")
        work = self._prepare_work_df(df)
        for c in ["budget", "revenue", "runtime", "popularity"]:
            med = self.numeric_medians[c]
            work[c] = work[c].fillna(med)
        work[self.categorical_feature_name] = work["main_genre"].apply(
            lambda x: x if x in self.top_genres else "Other"
        )
        genre_arr = self.ohe.transform(work[[self.categorical_feature_name]])
        X_num_scaled = self.scaler.transform(work[self.numeric_feats])
        X = pd.DataFrame(X_num_scaled, columns=self.numeric_feats, index=work.index)
        genre_df = pd.DataFrame(genre_arr, columns=self.genre_cols, index=work.index)
        X = pd.concat([X, genre_df], axis=1)
        return X

    def transform_single_input(self, features: Dict[str, Any]) -> pd.DataFrame:
        budget = features.get("budget", np.nan)
        revenue = features.get("revenue", np.nan)
        runtime = features.get("runtime", np.nan)
        popularity = features.get("popularity", np.nan)
        genres_raw = features.get("main_genre", features.get("genres", "Unknown"))
        genres_str = genres_raw if isinstance(genres_raw, str) else str(genres_raw)
        release_date = features.get("release_date", "")
        raw = pd.DataFrame({
            "budget": [budget],
            "revenue": [revenue],
            "runtime": [runtime],
            "popularity": [popularity],
            "genres": [genres_str],
            "release_date": [release_date],
        })
        return self.transform_df(raw)

    def get_state(self) -> Dict[str, Any]:
        return {
            "numeric_feats": self.numeric_feats,
            "categorical_feature_name": self.categorical_feature_name,
            "top_genres": self.top_genres,
            "genre_cols": self.genre_cols,
            "numeric_medians": self.numeric_medians,
            "ohe_categories": self.ohe_categories_,
            "scaler_mean": getattr(self.scaler, "mean_", None),
            "scaler_var": getattr(self.scaler, "var_", None),
        }

    def load_state(self, state: Dict[str, Any]):
        self.numeric_feats = state["numeric_feats"]
        self.categorical_feature_name = state["categorical_feature_name"]
        self.top_genres = state["top_genres"]
        self.genre_cols = state["genre_cols"]
        self.numeric_medians = state["numeric_medians"]
        self.ohe_categories_ = state["ohe_categories"]
        try:
            self.ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False, categories=self.ohe_categories_)
        except TypeError:
            self.ohe = OneHotEncoder(handle_unknown="ignore", sparse=False, categories=self.ohe_categories_)
        dummy = pd.DataFrame({self.categorical_feature_name: [self.ohe_categories_[0][0]]})
        self.ohe.fit(dummy[[self.categorical_feature_name]])
        self.ohe.categories_ = tuple([np.array(self.ohe_categories_[0], dtype=object)])

        self.scaler = StandardScaler()
        if state["scaler_mean"] is not None and state["scaler_var"] is not None:
            self.scaler.mean_ = np.array(state["scaler_mean"], dtype=float)
            self.scaler.var_ = np.array(state["scaler_var"], dtype=float)
            self.scaler.scale_ = np.sqrt(self.scaler.var_)
            self.scaler.n_features_in_ = len(self.numeric_feats)

class MovieRegressor(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int = 128, dropout: float = 0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1),
        )
    def forward(self, x):
        return self.net(x)

def load_dataset(path: str) -> pd.DataFrame:
    if path.lower().endswith(".xlsx"):
        try:
            return pd.read_excel(path, engine="openpyxl")
        except Exception:
            return pd.read_excel(path)
    elif path.lower().endswith(".csv"):
        return pd.read_csv(path)
    else:
        raise ValueError("Unsupported dataset format. Upload .xlsx or .csv")

def train_torch_model(X_train: np.ndarray, y_train: np.ndarray,
                      X_val: np.ndarray, y_val: np.ndarray,
                      input_dim: int, epochs: int = 20, batch_size: int = 64, lr: float = 1e-3,
                      device: Optional[torch.device] = None):
    device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MovieRegressor(input_dim=input_dim, hidden_dim=128, dropout=0.2).to(device)
    train_ds = TensorDataset(torch.from_numpy(X_train.astype(np.float32)),
                             torch.from_numpy(y_train.astype(np.float32).reshape(-1,1)))
    val_ds = TensorDataset(torch.from_numpy(X_val.astype(np.float32)),
                           torch.from_numpy(y_val.astype(np.float32).reshape(-1,1)))
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_val = float("inf")
    best_state = None
    for epoch in range(1, epochs+1):
        model.train()
        tr_sum = 0.0; ntr = 0
        for xb, yb in train_loader:
            xb = xb.to(device); yb = yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            tr_sum += loss.item() * xb.size(0); ntr += xb.size(0)
        tr_loss = tr_sum / max(1, ntr)

        model.eval()
        vl_sum = 0.0; nvl = 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device); yb = yb.to(device)
                preds = model(xb)
                loss = criterion(preds, yb)
                vl_sum += loss.item() * xb.size(0); nvl += xb.size(0)
        vl_loss = vl_sum / max(1, nvl)

        if vl_loss < best_val:
            best_val = vl_loss
            best_state = model.state_dict()

        if epoch == 1 or epoch % 5 == 0 or epoch == epochs:
            print(f"Epoch {epoch}/{epochs} - Train Loss: {tr_loss:.4f} - Val Loss: {vl_loss:.4f}")

    if best_state is not None:
        model.load_state_dict(best_state)
    return model, device

def evaluate_model(model, X, y, device):
    model.eval()
    with torch.no_grad():
        xb = torch.from_numpy(X.astype(np.float32)).to(device)
        preds = model(xb).cpu().numpy().ravel()
    y_true = y.astype(float).ravel()
    mse = mean_squared_error(y_true, preds)
    mae = mean_absolute_error(y_true, preds)
    rmse = math.sqrt(mse)
    r2 = r2_score(y_true, preds)
    return dict(mse=mse, mae=mae, rmse=rmse, r2=r2)

def save_artifacts(path, model, preproc, input_dim: int, hidden_dim=128, dropout=0.2):
    artifact = {
        "model_state_dict": model.state_dict(),
        "input_dim": input_dim,
        "hidden_dim": hidden_dim,
        "dropout": dropout,
        "preproc_state": preproc.get_state(),
    }
    torch.save(artifact, path)
    print("Saved model to", path)

def load_artifacts(path, device=None):
    device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
    art = torch.load(path, map_location=device)
    model = MovieRegressor(art["input_dim"], art.get("hidden_dim", 128), art.get("dropout", 0.2)).to(device)
    model.load_state_dict(art["model_state_dict"])
    model.eval()
    preproc = MoviePreprocessor()
    preproc.load_state(art["preproc_state"])
    return model, preproc, device

def train_from_dataset(data_path: str, epochs=20, batch_size=64, lr=1e-3):
    print("Loading dataset:", data_path)
    df = load_dataset(data_path)
    y_col = "vote_average"
    if y_col not in df.columns:
        alts = [c for c in df.columns if "vote" in c.lower() or "rating" in c.lower()]
        if not alts:
            raise ValueError("Target 'vote_average' not found in dataset.")
        y_col = alts[0]
        print("Using alternative target column:", y_col)

    preproc = MoviePreprocessor()
    preproc.fit(df, y_col=y_col)
    X_all = preproc.transform_df(df)
    y_all = pd.to_numeric(df[y_col], errors="coerce").fillna(df[y_col].median()).values.astype(np.float32)

    X_train, X_temp, y_train, y_temp = train_test_split(X_all, y_all, test_size=0.30, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)

    model, device = train_torch_model(X_train.values, y_train, X_val.values, y_val,
                                      input_dim=X_all.shape[1], epochs=epochs, batch_size=batch_size, lr=lr)
    metrics = evaluate_model(model, X_test.values, y_test, device)
    print(f"Test -> MSE: {metrics['mse']:.4f}  MAE: {metrics['mae']:.4f}  RMSE: {metrics['rmse']:.4f}  R2: {metrics['r2']:.4f}")
    save_artifacts(MODEL_PATH, model, preproc, input_dim=X_all.shape[1])
    return model, preproc, device

def predict_single(model, preproc, device, features: Dict[str, Any]) -> float:
    Xdf = preproc.transform_single_input(features)
    x = torch.from_numpy(Xdf.values.astype(np.float32)).to(device)
    with torch.no_grad():
        pred = model(x).cpu().numpy().ravel()[0]
    return clip_rating(pred)

# Convenience function for a one-line "grade"
def predict_from_values(budget, revenue, runtime, popularity, main_genre, release_date):
    features = {
        "budget": float_or_default(budget, 0.0),
        "revenue": float_or_default(revenue, 0.0),
        "runtime": float_or_default(runtime, 100.0),
        "popularity": float_or_default(popularity, 10.0),
        "main_genre": str(main_genre),
        "release_date": str(release_date),
    }
    rating = predict_single(model, preproc, device, features)
    print(f"Predicted rating: {rating:.2f} / 10.00")

# Try to load an existing model, else train
if os.path.exists(MODEL_PATH):
    try:
        model, preproc, device = load_artifacts(MODEL_PATH)
        print("Loaded existing model from", MODEL_PATH)
    except Exception as e:
        print("Failed to load model; training instead:", e)
        model, preproc, device = train_from_dataset(DATA_PATH, epochs=20, batch_size=64, lr=1e-3)
else:
    model, preproc, device = train_from_dataset(DATA_PATH, epochs=20, batch_size=64, lr=1e-3)

# Show known genres (optional)
if preproc.ohe_categories_:
    print("Genres recognized:", preproc.ohe_categories_[0])

Loading dataset: /content/movies_metadata.xlsx
Epoch 1/20 - Train Loss: 13.1716 - Val Loss: 3.3259
Epoch 5/20 - Train Loss: 3.3791 - Val Loss: 2.9325
Epoch 10/20 - Train Loss: 3.1002 - Val Loss: 2.9248
Epoch 15/20 - Train Loss: 2.9432 - Val Loss: 2.7928
Epoch 20/20 - Train Loss: 3.0046 - Val Loss: 2.8438
Test -> MSE: 2.8919  MAE: 1.1685  RMSE: 1.7006  R2: 0.2137
Saved model to /content/movie_rating_model.pt
Genres recognized: ['Action', 'Adventure', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Horror', 'Other', 'Romance', 'Thriller', '[]']


2) **Exactly what to enter to receive a “grade”. Immediately you change the features (genre/date... as needed) and Run, you will receive a grade.**

In [5]:
predict_from_values(150000000, 500000000, 120, 50, "Documentary", "2019-07-16")

Predicted rating: 7.09 / 10.00


In [6]:
predict_from_values(150000000, 500000000, 120, 50, "Action", "2019-07-16")

Predicted rating: 6.84 / 10.00


**3) Buttons/sliders/dropdown UI(User Interface)**

What to click:

Adjust values with the sliders/inputs
Click “Predict”

The predicted rating appears in the output box

In [7]:
import gradio as gr

genre_options = preproc.ohe_categories_[0] if preproc.ohe_categories_ else (preproc.top_genres or []) + ["Other"]
genre_options = list(dict.fromkeys(genre_options))  # dedupe, keep order

def predict_ui(budget, revenue, runtime, popularity, main_genre, release_date):
    features = {
        "budget": budget,
        "revenue": revenue,
        "runtime": runtime,
        "popularity": popularity,
        "main_genre": main_genre,
        "release_date": release_date,
    }
    rating = predict_single(model, preproc, device, features)
    return f"{rating:.2f}"

with gr.Blocks() as demo:
    gr.Markdown("## Movie Rating Agent")
    with gr.Row():
        budget_in = gr.Number(label="Budget (USD)", value=float(preproc.numeric_medians.get("budget", 0.0)))
        revenue_in = gr.Number(label="Revenue (USD)", value=float(preproc.numeric_medians.get("revenue", 0.0)))
    with gr.Row():
        runtime_in = gr.Slider(40, 240, value=float(preproc.numeric_medians.get("runtime", 100.0)), step=1, label="Runtime (minutes)")
        popularity_in = gr.Slider(0, 200, value=float(preproc.numeric_medians.get("popularity", 10.0)), step=0.5, label="Popularity")
    with gr.Row():
        genre_in = gr.Dropdown(choices=genre_options, value=( "Drama" if "Drama" in genre_options else genre_options[0] ),
                               label="Main Genre")
        date_in = gr.Textbox(value="2019-07-16", label="Release Date (YYYY-MM-DD)")
    predict_btn = gr.Button("Predict")
    out = gr.Textbox(label="Predicted rating (0–10)")

    predict_btn.click(predict_ui, inputs=[budget_in, revenue_in, runtime_in, popularity_in, genre_in, date_in], outputs=[out])

    gr.Markdown("Try an example:")
    gr.Examples(
        examples=[
            [150000000, 500000000, 120, 50, "Action", "2019-07-16"],
            [30000000, 80000000, 95, 18, "Comedy", "2014-05-10"],
            [5000000, 12000000, 105, 12, "Drama", "2021-11-05"],
        ],
        inputs=[budget_in, revenue_in, runtime_in, popularity_in, genre_in, date_in],
        outputs=[out],
        fn=predict_ui
    )

demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e9f0a0560f832dc35c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


