<a href="https://colab.research.google.com/github/LoopMint/angelatancapstone-app/blob/main/CP_Read_CSV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Use Gradio to create a web app that displays CSV data in an interactive user interface**

In [5]:
!pip install --upgrade gradio



# **Use the ReportLab library to export results to PDF**

In [1]:
!pip install reportlab

Collecting reportlab
  Downloading reportlab-4.4.2-py3-none-any.whl.metadata (1.8 kB)
Downloading reportlab-4.4.2-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/2.0 MB[0m [31m13.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.2


# **Prompt user to upload the QS Ranking CSV File**

In [3]:
from google.colab import files

uploaded = files.upload()   # triggers a file-picker dialog in the browser

if not uploaded:
    # The dialog was cancelled: say so explicitly, because in that case the
    # loop below never runs and `file_path` is not assigned by this cell.
    print("⚠️ No file uploaded.")

# Record the uploaded file name dynamically.
# If several files are uploaded, `file_path` ends up holding the last one.
for file_path in uploaded:
    print(f"File uploaded: {file_path}")


Saving QSRanking.csv to QSRanking.csv
File uploaded: QSRanking.csv


# **Import libraries**

*   pandas as pd → for loading CSV
*   gradio as gr → for building the interactive web app & dashboard.
*   tensorflow / keras → for building and training deep learning models (classification, regression, Siamese).
*   numpy as np → for numerical operations.
*   sklearn.preprocessing.StandardScaler → scales your features so they have mean 0 and std 1
*   sklearn.model_selection.train_test_split → splits data into training and validation sets.
*   matplotlib.pyplot as plt → creates charts and plots.
*   io.BytesIO + base64 → converts plots/images so they can be displayed in Gradio.
*   reportlab → generates PDF reports (e.g., for pairwise ranking analysis).
*   tempfile → creates temporary files for storing charts and PDFs before download.
*   os → interacts with the operating system (here: creating folders and deleting files).

In [1]:
import pandas as pd
import gradio as gr
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from io import BytesIO
import base64
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import os
import tempfile

# --- Load dataset ---
# Use the file name recorded by the upload cell when it has been run in this
# session; otherwise fall back to the default Colab location.
file_path = globals().get('file_path') or '/content/QSRanking.csv'

try:
    df = pd.read_csv(file_path, encoding='latin1')
except FileNotFoundError:
    # Leave `df` as None so the rest of the notebook can skip training and
    # refuse to start the app instead of failing with a NameError.
    print(f"❌ Error: File '{file_path}' not found.")
    df = None
else:
    print(f"✅ Loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns.")

    # Drop the previous year's rank column; it is not needed for the analysis.
    if 'RANK_2024' in df.columns:
        df = df.drop(columns=['RANK_2024'])
        print("✅ Dropped column 'RANK_2024'.")

if df is not None:
    if 'RANK_2025' in df.columns:
        # Non-numeric rank entries are coerced to NaN and then replaced by
        # 1402, the same value run_model uses as the maximum rank.
        df['RANK_2025'] = pd.to_numeric(df['RANK_2025'], errors='coerce').fillna(1402)
        # Binary classification target: 1 when the rank is 100 or better.
        df['Top100'] = np.where(df['RANK_2025'] <= 100, 1, 0)
    else:
        raise ValueError("'RANK_2025' column is required.")

    feature_cols = ['Academic_Reputation_Score', 'Employer_Reputation_Score',
                    'Citations_per_Faculty_Score', 'Faculty_Student_Score',
                    'International_Faculty_Score']
    # Keep only rows that have every feature, a name and a label.
    df = df.dropna(subset=feature_cols + ['Institution_Name', 'Top100'])
    print(f"✅ Cleaned dataset shape: {df.shape}")

    X = df[feature_cols].values
    y_class = df['Top100'].values
    y_reg = df['RANK_2025'].values

    # NOTE(review): the scaler is fitted on all rows, including the ones that
    # end up in the validation split below, so validation metrics are slightly
    # optimistic. The same fitted scaler is reused later for app predictions.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # One split for features and both targets, so the classification and
    # regression labels are guaranteed to line up with X_train / X_val rather
    # than relying on two separate calls sharing the same random_state.
    X_train, X_val, y_train_c, y_val_c, y_train_r, y_val_r = train_test_split(
        X_scaled, y_class, y_reg, test_size=0.2, random_state=42
    )

    # --- Classification model ---
    model_class = tf.keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=(X.shape[1],)),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation='sigmoid')  # probability of being Top 100
    ])
    model_class.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model_class.fit(X_train, y_train_c, epochs=20, validation_data=(X_val, y_val_c), verbose=1)

    # --- Regression model ---
    model_reg = tf.keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=(X.shape[1],)),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.Dense(1)  # linear output: predicted RANK_2025
    ])
    model_reg.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    model_reg.fit(X_train, y_train_r, epochs=20, validation_data=(X_val, y_val_r), verbose=1)

    # --- Siamese model ---
    def build_siamese_model(input_dim):
        """Build a two-input network that scores one feature vector against another.

        Both inputs pass through the same Dense(64) -> Dense(32) encoder (shared
        weights); the element-wise difference of the two encodings feeds a single
        sigmoid unit.

        Args:
            input_dim: Number of features in each input vector.

        Returns:
            An uncompiled Keras ``Model`` taking ``[input_a, input_b]``.
        """
        shared_encoder = tf.keras.Sequential([
            layers.Dense(64, activation='relu', input_shape=(input_dim,)),
            layers.Dense(32, activation='relu')
        ])

        left_in = Input(shape=(input_dim,))
        right_in = Input(shape=(input_dim,))

        # The same encoder instance is applied to both sides.
        embedding_gap = layers.Subtract()(
            [shared_encoder(left_in), shared_encoder(right_in)]
        )
        score = layers.Dense(1, activation='sigmoid')(embedding_gap)
        return Model(inputs=[left_in, right_in], outputs=score)

    # Pairwise ranker: one input per university, same feature width as X.
    model_rank = build_siamese_model(input_dim=X.shape[1])
    model_rank.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    def create_pairs(X, y_rank):
        """Build every unordered pair (i, j) with i < j for pairwise training.

        Args:
            X: Sequence/array of feature rows, one per item.
            y_rank: Sequence/array of ranks aligned with ``X`` (lower is better).

        Returns:
            Tuple ``(pairs_a, pairs_b, labels)`` where ``pairs_a[k]`` and
            ``pairs_b[k]`` are the rows of the k-th pair and ``labels[k]`` is 1
            when the first item has the strictly lower rank, else 0 (ties give 0).
            Pairs are ordered (0,1), (0,2), ..., (1,2), ...
        """
        X = np.asarray(X)
        y_rank = np.asarray(y_rank)

        # Upper-triangle indices (k=1 skips the diagonal) enumerate i < j in
        # row-major order, replacing a Python double loop with array indexing.
        idx_a, idx_b = np.triu_indices(len(X), k=1)
        labels = (y_rank[idx_a] < y_rank[idx_b]).astype(int)

        # NOTE(review): only (i, j) with i < j is generated, never (j, i). If
        # the rows arrive sorted by rank, almost every label will be 1 —
        # confirm the input order or consider adding the mirrored pairs.
        return X[idx_a], X[idx_b], labels

    # Expand the scaled features into (A, B, label) pairs and fit the ranker.
    X_a, X_b, y_rank_pair = create_pairs(X_scaled, y_reg)
    model_rank.fit(
        x=[X_a, X_b],
        y=y_rank_pair,
        batch_size=256,
        epochs=5,
        verbose=1,
    )

    # --- Functions ---
    def run_model(univ_input, mode):
        """Run the classifier or regressor for the first matching university.

        Args:
            univ_input: Free-text search string; matched case-insensitively as a
                literal substring of ``Institution_Name``.
            mode: ``"Classification"`` selects the Top-100 classifier; any other
                value selects the rank regressor.

        Returns:
            A human-readable result string, or the "not found" message when no
            institution matches (blank input is treated as no match).
        """
        query = (univ_input or "").strip().lower()
        inst_names = df['Institution_Name'].str.lower()
        # regex=False: treat the user's text literally, so characters such as
        # "(" or "." neither raise a regex error nor act as wildcards.
        # A blank query would match every row, so treat it as no match.
        matched = df[inst_names.str.contains(query, regex=False)] if query else df.iloc[:0]
        if matched.empty:
            return f"❌ No university found matching '{univ_input}'."

        row = matched.iloc[0]  # several matches: the first one wins
        inst_name = row['Institution_Name']
        rank = row['RANK_2025']
        # A row taken from a mixed-type frame is object-dtype; cast to float
        # before handing it to the scaler.
        X_input = scaler.transform(row[feature_cols].to_numpy(dtype=float).reshape(1, -1))

        if mode == "Classification":
            prob = model_class.predict(X_input)[0][0] * 100
            expected = "HIGH" if rank <= 100 else "LOW"
            return (
                f"✅ Classification Prediction for {inst_name}\n"
                f"Actual RANK_2025: {rank} (Expected: {expected})\n"
                f"Predicted Top 100 Probability: {prob:.1f}%"
            )

        pred_rank = model_reg.predict(X_input)[0][0]
        max_rank = 1402  # same value used to fill missing ranks during cleaning
        percentage = (pred_rank / max_rank) * 100
        return (
            f"✅ Regression Prediction for {inst_name}\n"
            f"Actual RANK_2025: {rank}\n"
            f"Predicted RANK_2025: {pred_rank:.1f} ({percentage:.1f}% of max rank)"
        )

    def run_siamese_model(univ_a, univ_b):
        """Compare two universities with the pairwise model, chart them, save a PDF.

        Args:
            univ_a: Search text for "University A" (case-insensitive literal
                substring of ``Institution_Name``; first match is used).
            univ_b: Search text for "University B", matched the same way.

        Returns:
            Tuple ``(message, html)``. ``message`` is the textual result and
            ``html`` an ``<img>`` tag with the feature-comparison chart embedded
            as base64 PNG. When either university cannot be found, returns the
            "not found" message and ``None``.

        Side effects:
            Writes ``/content/reports/ranking_report.pdf`` (overwriting any
            previous report).
        """
        def _first_match(query):
            """Return the first row whose name contains ``query`` literally, else None."""
            needle = (query or "").strip().lower()
            if not needle:
                # A blank query would match every row; treat it as not found.
                return None
            # regex=False: user text is not a pattern ("(" must not raise).
            hits = df[df['Institution_Name'].str.lower().str.contains(needle, regex=False)]
            return None if hits.empty else hits.iloc[0]

        row_a = _first_match(univ_a)
        row_b = _first_match(univ_b)
        if row_a is None or row_b is None:
            return "❌ One or both universities not found.", None

        name_a = row_a['Institution_Name']
        name_b = row_b['Institution_Name']
        # Rows from a mixed-type frame are object-dtype; cast once to float for
        # both the scaler and the chart.
        vals1 = row_a[feature_cols].to_numpy(dtype=float)
        vals2 = row_b[feature_cols].to_numpy(dtype=float)

        X1 = scaler.transform(vals1.reshape(1, -1))
        X2 = scaler.transform(vals2.reshape(1, -1))

        # Model output is the probability that A has the better (lower) rank.
        prob = model_rank.predict([X1, X2])[0][0] * 100
        better = name_a if prob >= 50 else name_b

        # Chart
        x = np.arange(len(feature_cols))
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.barh(x - 0.2, vals1, height=0.4, label=name_a)
            ax.barh(x + 0.2, vals2, height=0.4, label=name_b)
            ax.set_yticks(x)
            ax.set_yticklabels(feature_cols)
            ax.set_xlabel("Score")
            ax.set_title("Feature Comparison")
            ax.legend()

            buf = BytesIO()
            fig.savefig(buf, format='png')
        finally:
            # Always release the figure, even if drawing or saving fails.
            plt.close(fig)
        png_bytes = buf.getvalue()
        img_b64 = base64.b64encode(png_bytes).decode('utf-8')

        # Save PDF
        os.makedirs("/content/reports", exist_ok=True)
        pdf_path = "/content/reports/ranking_report.pdf"

        # Write the PNG bytes straight to a uniquely named temp file (no
        # base64 round trip, no fixed path shared between requests).
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_img:
            tmp_img.write(png_bytes)
            tmp_img_path = tmp_img.name
        try:
            c = canvas.Canvas(pdf_path, pagesize=letter)
            c.setFont("Helvetica", 12)
            text = c.beginText(40, 750)
            text.textLine("Pairwise Ranking Prediction")
            text.textLine(f"Predicted better ranked: {better}")
            text.textLine(f"Probability University A better: {prob:.1f}%")
            c.drawText(text)

            # Give the image an explicit 500x300 box below the text. With only
            # `width` set, ReportLab takes the box height from the image's pixel
            # height and centres the scaled image inside it, which moves the
            # chart up over the text lines.
            c.drawImage(tmp_img_path, 40, 400, width=500, height=300,
                        preserveAspectRatio=True, anchor='n')
            c.save()
        finally:
            # Remove the temp chart whether or not the PDF was written.
            os.remove(tmp_img_path)

        return (
            f"Predicted better ranked: {better}\n"
            f"Probability University A better: {prob:.1f}%\n"
            f"✅ PDF saved at: {pdf_path}\n"
            f"👉 Download via Colab file browser.",
            f"<img src='data:image/png;base64,{img_b64}'/>"
        )

    def show_top10_chart():
        """Render the ten rows with the smallest RANK_2025 as an inline PNG chart.

        Returns:
            An HTML ``<img>`` tag whose source is the chart encoded as a base64
            PNG data URI.
        """
        best_ten = df.nsmallest(10, 'RANK_2025')

        figure, axes = plt.subplots(figsize=(10, 8))
        axes.barh(best_ten['Institution_Name'], best_ten['RANK_2025'], color='blue')
        axes.set_title("Top 10 Universities by RANK_2025")
        axes.set_xlabel("RANK_2025")
        axes.invert_yaxis()  # first row of the selection at the top of the chart

        png_buffer = BytesIO()
        figure.savefig(png_buffer, format="png")
        plt.close(figure)

        encoded = base64.b64encode(png_buffer.getvalue()).decode('utf-8')
        return f"<img src='data:image/png;base64,{encoded}'/>"

    # --- Gradio app ---
    with gr.Blocks() as demo:
        gr.Markdown("## 🌟 QS Ranking AI Dashboard")
        gr.Dataframe(value=df, interactive=False)

        gr.Markdown("### Classification / Regression")
        univ_input = gr.Textbox(label="University name")
        model_choice = gr.Dropdown(choices=["Classification", "Regression"])
        out1 = gr.Textbox(label="Result")
        gr.Button("Run Model").click(run_model, inputs=[univ_input, model_choice], outputs=out1)

        gr.Markdown("### Pairwise Ranking + PDF")
        u1_input = gr.Textbox(label="University A")
        u2_input = gr.Textbox(label="University B")
        out2 = gr.Textbox(label="Ranking Result")
        out3 = gr.HTML()
        gr.Button("Run Pairwise + Save PDF").click(run_siamese_model, inputs=[u1_input, u2_input], outputs=[out2, out3])

        gr.Markdown("### Top 10 Universities Chart")
        out_chart = gr.HTML()
        gr.Button("Show Top 10 Chart").click(show_top10_chart, outputs=out_chart)

    demo.launch(share=True)

else:
    print("❌ Data not loaded. App will not start.")


ModuleNotFoundError: No module named 'reportlab'