<a href="https://colab.research.google.com/github/LoopMint/angelatancapstone-app/blob/main/CP_Read_CSV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Use a Gradio app to display the data in a nice table
!pip install gradio



In [14]:
!pip install reportlab

Collecting reportlab
  Downloading reportlab-4.4.2-py3-none-any.whl.metadata (1.8 kB)
Downloading reportlab-4.4.2-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/2.0 MB[0m [31m66.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.2


In [5]:
!pip install google-generativeai



In [6]:
from google.colab import files

# Ask for a file upload; the returned object is iterated by key below.
uploaded = files.upload()

# Get the uploaded file name dynamically.
# Each key is printed as an uploaded file name. When several files are uploaded,
# file_path ends up holding the last key iterated; when nothing is uploaded the
# loop body never runs and file_path is not assigned by this cell.
for fn in uploaded.keys():
    file_path = fn
    print(f"File uploaded: {file_path}")


Saving QSRanking.csv to QSRanking.csv
File uploaded: QSRanking.csv


In [15]:
!pip install reportlab

import pandas as pd
import gradio as gr
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from io import BytesIO
import base64
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import tempfile
import os

# --- Load dataset ---
# The CSV location is fixed here; a missing file is reported and leaves df as None
# so the rest of the cell can skip training and the app.
file_path = '/content/QSRanking.csv'

try:
    df = pd.read_csv(file_path, encoding='latin1')
except FileNotFoundError:
    print(f"❌ Error: File '{file_path}' not found.")
    df = None
else:
    n_rows, n_cols = df.shape
    print(f"✅ Loaded dataset with {n_rows} rows and {n_cols} columns.")
    # The 2024 rank column is discarded when present.
    if 'RANK_2024' in df.columns:
        df = df.drop(columns=['RANK_2024'])
        print("✅ Dropped column 'RANK_2024'.")

if df is not None:
    # RANK_2025 is the ranking target used below; stop immediately if it is missing.
    if 'RANK_2025' not in df.columns:
        raise ValueError("'RANK_2025' column is required.")

    # errors='coerce' turns unparseable rank entries into NaN; those are then set to 1402.
    rank_2025 = pd.to_numeric(df['RANK_2025'], errors='coerce')
    df['RANK_2025'] = rank_2025.fillna(1402)
    # Top100 is 1 when the numeric rank is at most 100, otherwise 0.
    df['Top100'] = (df['RANK_2025'] <= 100).astype(int)

    # Score columns fed to the ranking model.
    feature_cols = [
        'Academic_Reputation_Score',
        'Employer_Reputation_Score',
        'Citations_per_Faculty_Score',
        'Faculty_Student_Score',
        'International_Faculty_Score',
    ]
    # Keep only rows that have every feature, a name, and the Top100 flag.
    required_cols = feature_cols + ['Institution_Name', 'Top100']
    df = df.dropna(subset=required_cols)
    print(f"✅ Dropped NaNs. Final shape: {df.shape}")

    X = df[feature_cols].to_numpy()
    y_reg = df['RANK_2025'].to_numpy()

    # Standardise the features; the fitted scaler is reused at prediction time.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # --- Siamese Ranking Model ---
    def build_siamese_model(input_dim):
        """Build the pairwise (Siamese) comparison network.

        Both inputs go through the same two-layer encoder (shared weights); the
        element-wise difference of the two encodings feeds a single sigmoid unit.

        Args:
            input_dim: Number of features in each input vector.

        Returns:
            An uncompiled Keras ``Model`` taking ``[input_a, input_b]`` and
            producing one value in (0, 1) per pair.
        """
        # Declare the input with an explicit Input layer instead of passing
        # input_shape= to the first Dense layer, which Keras reports as deprecated
        # usage for Sequential models.
        base = tf.keras.Sequential([
            Input(shape=(input_dim,)),
            layers.Dense(64, activation='relu'),
            layers.Dense(32, activation='relu')
        ])
        input_a = Input(shape=(input_dim,))
        input_b = Input(shape=(input_dim,))
        # The same `base` instance encodes both sides, so its weights are shared.
        encoded_a = base(input_a)
        encoded_b = base(input_b)
        diff = layers.Subtract()([encoded_a, encoded_b])
        out = layers.Dense(1, activation='sigmoid')(diff)
        return Model([input_a, input_b], out)

    # One network input per feature column; trained as a binary classifier on pair labels.
    n_features = X.shape[1]
    model_rank = build_siamese_model(n_features)
    model_rank.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    def create_pairs(X, y_rank):
        """Build every row pair (i, j) with i < j for pairwise training.

        Args:
            X: Feature rows, one per item; converted with ``np.asarray``.
            y_rank: One rank per row of ``X`` (a smaller number is a better rank).

        Returns:
            ``(pairs_a, pairs_b, labels)``: three arrays of length n*(n-1)/2 in the
            order (0,1), (0,2), ..., (n-2,n-1). ``labels[k]`` is 1 when the first
            row's rank is strictly smaller than the second's, otherwise 0 (so
            tied ranks are labelled 0). With fewer than two rows the arrays are
            empty but keep the feature dimension of ``X``.
        """
        X = np.asarray(X)
        y_rank = np.asarray(y_rank)
        # Indices of the strict upper triangle, in row-major order: the same
        # (i, j) sequence as a nested "for i / for j in range(i + 1, n)" loop,
        # without appending n*(n-1)/2 row arrays to Python lists.
        idx_a, idx_b = np.triu_indices(len(X), k=1)
        labels = (y_rank[idx_a] < y_rank[idx_b]).astype(int)
        return X[idx_a], X[idx_b], labels

    # Expand the scaled rows into all i < j pairs, then train on the pair labels.
    # NOTE(review): pairs are only built with i < j, so if the CSV rows are already
    # sorted by rank most labels will be 1 — confirm, or shuffle/mirror the pairs.
    X_a, X_b, y_rank_pair = create_pairs(X_scaled, y_reg)
    model_rank.fit(
        x=[X_a, X_b],
        y=y_rank_pair,
        batch_size=256,
        epochs=10,
        verbose=1,
    )

    # --- Run model + generate PDF ---
    def run_siamese_model(univ_a, univ_b):
        """Compare two universities with the trained pairwise model and build a PDF report.

        Args:
            univ_a: Text typed for University A. It is matched as a case-insensitive,
                literal substring of 'Institution_Name'; the first matching row is used.
            univ_b: Same, for University B.

        Returns:
            A ``(text, html)`` tuple. On success ``text`` is the analysis summary and
            ``html`` embeds the comparison chart plus a link to the generated PDF.
            When an input is blank or has no match, ``text`` is an error message
            and ``html`` is ``None``.
        """
        query_a = (univ_a or "").strip()
        query_b = (univ_b or "").strip()
        if not query_a or not query_b:
            return "❌ Please provide both University A and University B.", None

        # regex=False: the typed text is matched literally, so characters such as
        # "(" or "." neither raise re.error nor match more than intended.
        names_lower = df['Institution_Name'].str.lower()
        u1 = df[names_lower.str.contains(query_a.lower(), regex=False)]
        u2 = df[names_lower.str.contains(query_b.lower(), regex=False)]
        if u1.empty or u2.empty:
            return "❌ One or both universities not found.", None

        row_a, row_b = u1.iloc[0], u2.iloc[0]
        name_a, name_b = row_a['Institution_Name'], row_b['Institution_Name']

        # Cast explicitly to float: a single row taken from a frame that also holds
        # text columns typically comes back as an object array.
        vals1 = row_a[feature_cols].to_numpy(dtype=float)
        vals2 = row_b[feature_cols].to_numpy(dtype=float)

        X1 = scaler.transform(vals1.reshape(1, -1))
        X2 = scaler.transform(vals2.reshape(1, -1))

        # Training labels were 1 when the first item of a pair had the smaller rank
        # number, so prob >= 0.5 is read as "University A is better ranked".
        prob = model_rank.predict([X1, X2])[0][0]
        better = name_a if prob >= 0.5 else name_b

        # The three features with the largest absolute score gap, largest first.
        diffs = np.abs(vals1 - vals2)
        top_features = [feature_cols[i] for i in np.argsort(diffs)[-3:][::-1]]

        # Plot
        x = np.arange(len(feature_cols))
        buf = BytesIO()
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.barh(x - 0.2, vals1, height=0.4, label=name_a)
            ax.barh(x + 0.2, vals2, height=0.4, label=name_b)
            ax.set_yticks(x)
            ax.set_yticklabels(feature_cols)
            ax.set_xlabel("Score")
            ax.set_title("Feature Comparison")
            ax.legend()
            fig.savefig(buf, format='png')
        finally:
            # Always release the figure, even if drawing or saving fails.
            plt.close(fig)

        png_bytes = buf.getvalue()
        img_b64 = base64.b64encode(png_bytes).decode('utf-8')

        result_text = (
            f"✅ Pairwise Ranking Analysis\n\n"
            f"The model predicts {better} is better ranked between:\n"
            f"- University A: {name_a}\n"
            f"- University B: {name_b}\n\n"
            f"Probability University A better: {prob:.2f}\n\n"
            f"The largest feature differences were: {', '.join(top_features)}"
        )

        # delete=False keeps the files on disk after the handles are closed; the
        # with-blocks make sure the handles themselves are not leaked.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_img:
            tmp_img.write(png_bytes)
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_pdf:
            pdf_path = tmp_pdf.name

        # Create PDF
        try:
            c = canvas.Canvas(pdf_path, pagesize=letter)
            page_height = letter[1]
            c.setFont("Helvetica", 12)
            text = c.beginText(40, page_height - 40)
            # NOTE(review): the leading "✅" may not be available in the standard
            # Helvetica font — check how it renders in the PDF.
            for line in result_text.split('\n'):
                text.textLine(line)
            c.drawText(text)
            c.drawImage(tmp_img.name, 40, 100, width=500, preserveAspectRatio=True)
            c.save()
        finally:
            # The chart PNG is only needed while the PDF is being written.
            if os.path.exists(tmp_img.name):
                os.remove(tmp_img.name)

        # NOTE(review): whether the app serves 'file/<path>' for a temp-dir file
        # depends on the Gradio version and its allowed paths — verify the link.
        download_html = f"<a href='file/{pdf_path}' target='_blank' download='ranking_report.pdf'>📄 Download PDF Report</a>"

        return result_text, f"<img src='data:image/png;base64,{img_b64}'/><br>{download_html}"

    # --- Gradio app ---
    # Build the UI. Components are created in this order inside the Blocks
    # context: a heading and the DataFrame, then the pairwise-ranking controls.
    with gr.Blocks(theme=gr.themes.Default()) as demo:
        gr.Markdown("## 🌟 QS Ranking AI Dashboard")
        # The whole cleaned DataFrame is passed as the table value, non-interactive.
        gr.Dataframe(value=df, interactive=False)

        gr.Markdown("### 🏆 Pairwise Ranking + PDF Report")
        univ_input_a = gr.Textbox(label="Enter first university name (University A)")
        univ_input_b = gr.Textbox(label="Enter second university name (University B)")
        rank_btn = gr.Button("Run Pairwise Ranking + Download Report")
        rank_output = gr.Textbox(label="Ranking Result (Long-form Analysis)", lines=10)
        rank_plot = gr.HTML()
        # On click, the two textbox values are passed to run_siamese_model; its two
        # return values go to the result textbox and the HTML area, in that order.
        rank_btn.click(fn=run_siamese_model, inputs=[univ_input_a, univ_input_b], outputs=[rank_output, rank_plot])

    # NOTE(review): share=True presumably requests a public share link — confirm
    # that exposing the dataset and the model publicly is intended.
    demo.launch(share=True)

else:
    print("❌ Data not loaded. App will not start.")


✅ Loaded dataset with 1503 rows and 28 columns.
✅ Dropped column 'RANK_2024'.
✅ Dropped NaNs. Final shape: (1403, 28)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m3842/3842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.9047 - loss: 0.2299
Epoch 2/10
[1m3842/3842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 3ms/step - accuracy: 0.9606 - loss: 0.1303
Epoch 3/10
[1m3842/3842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.9698 - loss: 0.1106
Epoch 4/10
[1m3842/3842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.9750 - loss: 0.0975
Epoch 5/10
[1m3842/3842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 3ms/step - accuracy: 0.9772 - loss: 0.0907
Epoch 6/10
[1m3842/3842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.9790 - loss: 0.0850
Epoch 7/10
[1m3842/3842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.9805 - loss: 0.0799
Epoch 8/10
[1m3842/3842[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.9816 - loss: 0.0758
Epoch 9/10
[1m3