<a href="https://colab.research.google.com/github/LoopMint/angelatancapstone-app/blob/main/CP_Read_CSV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Use Gradio to create a web app that displays CSV data in an interactive user interface**

In [16]:
#use the latest version
!pip install --upgrade gradio



# **Use the ReportLab library to export reports as downloadable PDFs**

In [17]:
!pip install reportlab



# **Prompt user to upload the QS Ranking CSV File**

In [18]:
from google.colab import files

# Triggers a file picker dialog in the browser; the result is keyed by saved file name.
uploaded = files.upload()

# Stop here with a clear message if nothing was uploaded (e.g. the dialog was dismissed);
# otherwise `file_path` would be left undefined for the code that reads the CSV.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and select the QS ranking CSV.")

# Get the uploaded file name dynamically (Colab may save it under a new name such as
# "QSRanking (1).csv"). If several files are selected, the last one wins.
for fn in uploaded:
    file_path = fn
    print(f"File uploaded: {file_path}")


Saving QSRanking.csv to QSRanking (1).csv
File uploaded: QSRanking (1).csv


# **Import libraries**

*   pandas as pd → loads the CSV file into a DataFrame.
*   gradio as gr → builds the interactive web app and dashboard.
*   tensorflow / keras → builds and trains the deep learning models (classification, regression, Siamese).
*   numpy as np → numerical operations.
*   sklearn.preprocessing.StandardScaler → scales the features to mean = 0 and standard deviation = 1. This preprocessing step keeps the deep learning models accurate and stable; without it, training is biased towards features with large numeric values.
*   sklearn.model_selection.train_test_split → splits the data into training and validation sets.
*   matplotlib.pyplot as plt → creates charts and plots.
*   io.BytesIO + base64 → converts plots/images so they can be displayed in Gradio.
*   reportlab → generates PDF reports (e.g., for the pairwise ranking analysis).
*   tempfile → creates temporary files for storing charts and PDFs before download.
*   os → interacts with the operating system (directories and file removal).

In [26]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from io import BytesIO
import base64
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import os
import tempfile

# Load and prepare dataset
file_path = 'QSRanking.csv'

# NOTE(review): `st` (Streamlit) is not imported by this cell's import block, so evaluating
# the decorator below raises NameError when the cell is run in the notebook.
@st.cache_data
def load_data():
    """Read the CSV at the module-level `file_path` and derive the modelling columns.

    Returns:
        tuple: `(df, feature_cols)` — the prepared DataFrame and the list of the five
        score columns used as model inputs.
    """
    feature_cols = [
        'Academic_Reputation_Score',
        'Employer_Reputation_Score',
        'Citations_per_Faculty_Score',
        'Faculty_Student_Score',
        'International_Faculty_Score',
    ]

    # errors='ignore' drops RANK_2024 only when the column is present.
    frame = pd.read_csv(file_path, encoding='latin1').drop(columns=['RANK_2024'], errors='ignore')

    # Ranks that cannot be parsed as numbers become NaN and are then set to 1402.
    numeric_rank = pd.to_numeric(frame['RANK_2025'], errors='coerce')
    frame['RANK_2025'] = numeric_rank.fillna(1402)

    # Binary target: 1 for a rank of 100 or better, otherwise 0.
    frame['Top100'] = np.where(frame['RANK_2025'] <= 100, 1, 0)

    required = feature_cols + ['Institution_Name', 'Top100']
    frame = frame.dropna(subset=required)
    return frame, feature_cols

df, feature_cols = load_data()
X = df[feature_cols].values
y_class = df['Top100'].values
y_reg = df['RANK_2025'].values

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling repeat
# from one run to the next.
tf.keras.utils.set_random_seed(42)

# NOTE(review): the scaler is fit on every row before the train/validation split below,
# so validation rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# One split shared by both targets: with the same random_state the row assignment is the
# same as splitting each target separately, so a second split is not needed.
X_train, X_val, y_train_c, y_val_c, y_train_r, y_val_r = train_test_split(
    X_scaled, y_class, y_reg, test_size=0.2, random_state=42)

# Train classification model
# The input shape is declared with an explicit Input layer rather than `input_shape=` on
# the first Dense layer, which recent Keras versions warn about inside Sequential.
model_class = tf.keras.Sequential([
    Input(shape=(X.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
model_class.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_class.fit(X_train, y_train_c, epochs=20, validation_data=(X_val, y_val_c), verbose=0)

# Train regression model (same architecture, linear output unit)
model_reg = tf.keras.Sequential([
    Input(shape=(X.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
])
model_reg.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model_reg.fit(X_train, y_train_r, epochs=20, validation_data=(X_val, y_val_r), verbose=0)


# Evaluate model
# Imported here because this cell's import block does not bring in the metric helpers.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = model_reg.predict(X_val)
# Compare against the regression validation targets (`y_val_r`); no `y_val` is defined.
mae = mean_absolute_error(y_val_r, y_pred)
mse = mean_squared_error(y_val_r, y_pred)
r2 = r2_score(y_val_r, y_pred)

# Show evaluation metrics
# NOTE(review): `st` (Streamlit) is not imported in this cell, so the calls below raise
# NameError when run inside the notebook.
st.header("📊 Regression Model Evaluation")
st.write("Model trained to predict QS RANK_2025 based on academic metrics.")

metrics_df = pd.DataFrame({
    'Metric': ['Mean Absolute Error (MAE)', 'Mean Squared Error (MSE)', 'R² Score'],
    'Value': [mae, mse, r2]
})
st.table(metrics_df)


# Siamese model
def build_siamese_model(input_dim):
    """Build a two-input model whose inputs share one encoder.

    Both inputs pass through the same Dense(64) -> Dense(32) encoder; the element-wise
    difference of the two encodings feeds a single sigmoid unit.

    Args:
        input_dim: Number of features in each input vector.

    Returns:
        An uncompiled Keras `Model` taking `[input_a, input_b]` and producing one value
        in (0, 1) per pair.
    """
    # The input shape is declared with an explicit Input layer rather than `input_shape=`
    # on the first Dense layer, which recent Keras versions warn about inside Sequential.
    base = tf.keras.Sequential([
        Input(shape=(input_dim,)),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu')
    ])
    input_a = Input(shape=(input_dim,))
    input_b = Input(shape=(input_dim,))
    # Calling the same `base` object twice shares its weights between the two branches.
    encoded_a = base(input_a)
    encoded_b = base(input_b)
    diff = layers.Subtract()([encoded_a, encoded_b])
    out = layers.Dense(1, activation='sigmoid')(diff)
    return Model([input_a, input_b], out)

# Instantiate the pairwise model for the number of feature columns in X and compile it
# with binary cross-entropy loss, tracking accuracy.
model_rank = build_siamese_model(X.shape[1])
model_rank.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

def create_pairs(X, y_rank):
    """Build every index pair (i, j) with i < j for pairwise training.

    Args:
        X: 2-D array-like of feature rows.
        y_rank: 1-D array-like of ranks aligned with the rows of `X`.

    Returns:
        tuple of three numpy arrays `(pairs_a, pairs_b, labels)`: row i, row j, and a
        label that is 1 when `y_rank[i] < y_rank[j]` and 0 otherwise (ties give 0).
        With fewer than two rows, all three arrays are empty.
    """
    X = np.asarray(X)
    y_rank = np.asarray(y_rank)
    # triu_indices(k=1) lists (i, j) with i < j in row-major order — the same order as a
    # nested `for i ... for j in range(i + 1, n)` loop — without a Python-level loop.
    idx_a, idx_b = np.triu_indices(len(X), k=1)
    labels = (y_rank[idx_a] < y_rank[idx_b]).astype(int)
    return X[idx_a], X[idx_b], labels

# Build every (i, j) row pair with i < j from the scaled features and train the pairwise
# model on them; the number of pairs grows quadratically with the number of rows.
# NOTE(review): only i < j orderings are generated. If the CSV rows happen to be sorted by
# rank, nearly every label would be 1 — TODO confirm the label balance.
X_a, X_b, y_rank_pair = create_pairs(X_scaled, y_reg)
model_rank.fit([X_a, X_b], y_rank_pair, epochs=5, batch_size=256, verbose=0)

# Streamlit UI
# NOTE(review): `st` (Streamlit) is not imported in this cell, so these calls raise
# NameError when run inside the notebook.
st.title("🌟 QS Ranking AI Dashboard")
st.write("Data source: QS World University Rankings 2025")
st.dataframe(df)

st.header("🔍 University Prediction")
univ_input = st.text_input("Enter university name:")
model_type = st.radio("Select model type:", ["Classification", "Regression"])
if st.button("Predict"):
    query = univ_input.strip().lower()
    if query:
        # regex=False: match the typed text literally, so names containing characters
        # such as "(" or "+" neither raise nor match unintended rows.
        matched = df[df['Institution_Name'].str.lower().str.contains(query, regex=False)]
    else:
        # An empty query would otherwise match every row and silently pick the first one.
        matched = df.iloc[0:0]
    if matched.empty:
        st.error("University not found.")
    else:
        # When several names contain the query, the first matching row is used.
        inst = matched.iloc[0]
        X_input = scaler.transform(inst[feature_cols].to_numpy(dtype=float).reshape(1, -1))
        if model_type == "Classification":
            prob = model_class.predict(X_input)[0][0] * 100
            expected = "HIGH" if inst['RANK_2025'] <= 100 else "LOW"
            st.success(f"🔢 RANK_2025: {inst['RANK_2025']} → Expected: {expected}")
            st.info(f"📈 Predicted Top 100 Probability: {prob:.1f}%")
        else:
            pred_rank = model_reg.predict(X_input)[0][0]
            # Predicted rank as a percentage of 1402, the value used for unparseable ranks.
            percentage = (pred_rank / 1402) * 100
            st.success(f"🔢 Actual RANK_2025: {inst['RANK_2025']}")
            st.info(f"📈 Predicted RANK_2025: {pred_rank:.1f} ({percentage:.1f}%)")

st.header("🤝 Pairwise Ranking")
col1, col2 = st.columns(2)
univ_a = col1.text_input("University A")
univ_b = col2.text_input("University B")

if st.button("Compare and Generate PDF"):
    names_lower = df['Institution_Name'].str.lower()
    query_a = univ_a.strip().lower()
    query_b = univ_b.strip().lower()
    # regex=False: match the typed text literally. An empty query is treated as "not
    # found" instead of matching every row.
    u1 = df[names_lower.str.contains(query_a, regex=False)] if query_a else df.iloc[0:0]
    u2 = df[names_lower.str.contains(query_b, regex=False)] if query_b else df.iloc[0:0]
    if u1.empty or u2.empty:
        st.error("One or both universities not found.")
    else:
        # When several names contain a query, the first matching row is used.
        row_a = u1.iloc[0]
        row_b = u2.iloc[0]
        vals1 = row_a[feature_cols].to_numpy(dtype=float)
        vals2 = row_b[feature_cols].to_numpy(dtype=float)

        X1 = scaler.transform(vals1.reshape(1, -1))
        X2 = scaler.transform(vals2.reshape(1, -1))
        prob = model_rank.predict([X1, X2])[0][0] * 100
        better = row_a['Institution_Name'] if prob >= 50 else row_b['Institution_Name']

        x = np.arange(len(feature_cols))

        # Grouped horizontal bars: one pair of bars per feature.
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(x - 0.2, vals1, height=0.4, label=row_a['Institution_Name'])
        ax.barh(x + 0.2, vals2, height=0.4, label=row_b['Institution_Name'])
        ax.set_yticks(x)
        ax.set_yticklabels(feature_cols)
        ax.set_title("Feature Comparison")
        ax.legend()
        st.pyplot(fig)

        # Save PDF
        os.makedirs("reports", exist_ok=True)
        pdf_path = "reports/ranking_report.pdf"
        tmp_img_path = "reports/tmp_chart.png"
        c = canvas.Canvas(pdf_path, pagesize=letter)
        c.setFont("Helvetica", 12)
        text = c.beginText(40, 750)
        text.textLine("Pairwise Ranking Prediction")
        text.textLine(f"Predicted better ranked: {better}")
        text.textLine(f"Probability University A better: {prob:.1f}%")
        c.drawText(text)
        try:
            # The chart is written to a temporary PNG because drawImage is given a path.
            fig.savefig(tmp_img_path, format='png')
            c.drawImage(tmp_img_path, 40, 300, width=500, preserveAspectRatio=True)
            c.save()
        finally:
            # Remove the temporary chart even if building the PDF fails, and release
            # the figure.
            if os.path.exists(tmp_img_path):
                os.remove(tmp_img_path)
            plt.close(fig)
        st.success("✅ PDF saved.")
        with open(pdf_path, "rb") as f:
            st.download_button("📄 Download PDF Report", f, file_name="ranking_report.pdf")

# Horizontal bar chart of the ten rows with the smallest RANK_2025 values.
st.header("🏆 Top 10 Universities")
top10 = df.nsmallest(10, 'RANK_2025')
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(
    top10['Institution_Name'],
    top10['RANK_2025'],
    color='green',
)
ax.set(xlabel="RANK_2025", title="Top 10 Universities by RANK_2025")
# Flip the y axis so the first row is drawn at the top.
ax.invert_yaxis()
st.pyplot(fig)

NameError: name 'st' is not defined

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from io import BytesIO
import base64
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import os
import tempfile
import gradio as gr

# Load and prepare dataset
# Colab may save an upload under a new name (e.g. "QSRanking (1).csv"), so take the name
# from the upload cell's `uploaded` mapping when it exists instead of hard-coding it.
# The previous literal is kept as the fallback when no upload mapping is present.
_uploaded_names = list(globals().get('uploaded', {}))
file_path = _uploaded_names[-1] if _uploaded_names else 'QSRanking (1).csv'

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling repeat
# from one run to the next.
tf.keras.utils.set_random_seed(42)

# Data loading and model training run once at cell level, outside the Gradio callbacks,
# to avoid reloading and retraining models on each interaction.
df = pd.read_csv(file_path, encoding='latin1')
if 'RANK_2024' in df.columns:
    df = df.drop(columns=['RANK_2024'])
# Ranks that cannot be parsed as numbers become NaN and are then set to 1402.
df['RANK_2025'] = pd.to_numeric(df['RANK_2025'], errors='coerce').fillna(1402)
# Binary target: 1 for a rank of 100 or better, otherwise 0.
df['Top100'] = np.where(df['RANK_2025'] <= 100, 1, 0)
feature_cols = ['Academic_Reputation_Score', 'Employer_Reputation_Score',
                'Citations_per_Faculty_Score', 'Faculty_Student_Score',
                'International_Faculty_Score']
df = df.dropna(subset=feature_cols + ['Institution_Name', 'Top100'])

X = df[feature_cols].values
y_class = df['Top100'].values
y_reg = df['RANK_2025'].values

# NOTE(review): the scaler is fit on every row before the train/validation split below,
# so validation rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# One split shared by both targets: with the same random_state the row assignment is the
# same as splitting each target separately.
X_train, X_val, y_train_c, y_val_c, y_train_r, y_val_r = train_test_split(
    X_scaled, y_class, y_reg, test_size=0.2, random_state=42)
# Aliases kept so the regression-specific names remain defined.
X_train_r, X_val_r = X_train, X_val


def _build_mlp(input_dim, output_activation=None):
    """Return the Dense(64) -> Dropout(0.3) -> Dense(32) -> Dense(1) network used by both models.

    The input shape is declared with an explicit Input layer rather than `input_shape=` on
    the first Dense layer, which recent Keras versions warn about inside Sequential.
    """
    return tf.keras.Sequential([
        Input(shape=(input_dim,)),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation=output_activation)
    ])


# Train classification model (sigmoid output, binary cross-entropy)
model_class = _build_mlp(X.shape[1], output_activation='sigmoid')
model_class.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_class.fit(X_train, y_train_c, epochs=20, validation_data=(X_val, y_val_c), verbose=0)

# Train regression model (linear output, mean squared error)
model_reg = _build_mlp(X.shape[1])
model_reg.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model_reg.fit(X_train_r, y_train_r, epochs=20, validation_data=(X_val_r, y_val_r), verbose=0)

# Siamese model
def build_siamese_model(input_dim):
    """Build a two-input model whose inputs share one encoder.

    Both inputs pass through the same Dense(64) -> Dense(32) encoder; the element-wise
    difference of the two encodings feeds a single sigmoid unit.

    Args:
        input_dim: Number of features in each input vector.

    Returns:
        An uncompiled Keras `Model` taking `[input_a, input_b]` and producing one value
        in (0, 1) per pair.
    """
    # The input shape is declared with an explicit Input layer rather than `input_shape=`
    # on the first Dense layer, which recent Keras versions warn about inside Sequential.
    base = tf.keras.Sequential([
        Input(shape=(input_dim,)),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu')
    ])
    input_a = Input(shape=(input_dim,))
    input_b = Input(shape=(input_dim,))
    # Calling the same `base` object twice shares its weights between the two branches.
    encoded_a = base(input_a)
    encoded_b = base(input_b)
    diff = layers.Subtract()([encoded_a, encoded_b])
    out = layers.Dense(1, activation='sigmoid')(diff)
    return Model([input_a, input_b], out)

# Instantiate the pairwise model for the number of feature columns in X and compile it
# with binary cross-entropy loss, tracking accuracy.
model_rank = build_siamese_model(X.shape[1])
model_rank.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

def create_pairs(X, y_rank):
    """Build every index pair (i, j) with i < j for pairwise training.

    Args:
        X: 2-D array-like of feature rows.
        y_rank: 1-D array-like of ranks aligned with the rows of `X`.

    Returns:
        tuple of three numpy arrays `(pairs_a, pairs_b, labels)`: row i, row j, and a
        label that is 1 when `y_rank[i] < y_rank[j]` and 0 otherwise (ties give 0).
        With fewer than two rows, all three arrays are empty.
    """
    X = np.asarray(X)
    y_rank = np.asarray(y_rank)
    # triu_indices(k=1) lists (i, j) with i < j in row-major order — the same order as a
    # nested `for i ... for j in range(i + 1, n)` loop — without a Python-level loop.
    idx_a, idx_b = np.triu_indices(len(X), k=1)
    labels = (y_rank[idx_a] < y_rank[idx_b]).astype(int)
    return X[idx_a], X[idx_b], labels

# Build every (i, j) row pair with i < j from the scaled features and train the pairwise
# model on them; the number of pairs grows quadratically with the number of rows.
# NOTE(review): only i < j orderings are generated. If the CSV rows happen to be sorted by
# rank, nearly every label would be 1 — TODO confirm the label balance.
X_a, X_b, y_rank_pair = create_pairs(X_scaled, y_reg)
model_rank.fit([X_a, X_b], y_rank_pair, epochs=5, batch_size=256, verbose=0)


# Gradio Interface Functions

def show_dataframe():
    """Return the module-level `df` DataFrame unchanged."""
    return df

def predict_university(univ_input, model_type):
    """Look up a university by (partial) name and run the selected model on it.

    Args:
        univ_input: Text typed by the user. It is matched case-insensitively as a literal
            substring of `Institution_Name`; the first matching row is used.
        model_type: "Classification" selects the top-100 classifier; any other value
            selects the regression model.

    Returns:
        A 3-tuple of strings `(actual-rank line, prediction line, "")`. When the input is
        empty or nothing matches, `("University not found.", "", "")`.
    """
    query = (univ_input or "").strip().lower()
    if not query:
        # An empty query would otherwise match every row and silently pick the first one.
        return "University not found.", "", ""

    # regex=False: treat the typed text literally, so names containing characters such as
    # "(" or "+" neither raise nor match unintended rows.
    matched = df[df['Institution_Name'].str.lower().str.contains(query, regex=False)]
    if matched.empty:
        return "University not found.", "", ""

    inst = matched.iloc[0]
    X_input = scaler.transform(inst[feature_cols].to_numpy(dtype=float).reshape(1, -1))

    if model_type == "Classification":
        # verbose=0 keeps Keras' progress bar out of the cell output on every click.
        prob = model_class.predict(X_input, verbose=0)[0][0] * 100
        expected = "HIGH" if inst['RANK_2025'] <= 100 else "LOW"
        return f"🔢 RANK_2025: {inst['RANK_2025']} → Expected: {expected}", f"📈 Predicted Top 100 Probability: {prob:.1f}%", ""

    pred_rank = model_reg.predict(X_input, verbose=0)[0][0]
    # Predicted rank expressed as a percentage of 1402.
    percentage = (pred_rank / 1402) * 100
    return f"🔢 Actual RANK_2025: {inst['RANK_2025']}", f"📈 Predicted RANK_2025: {pred_rank:.1f} ({percentage:.1f}%)", ""

def compare_universities(univ_a, univ_b):
    """Compare two universities with the pairwise model and write a short PDF report.

    Args:
        univ_a: Text identifying university A; matched case-insensitively as a literal
            substring of `Institution_Name` (first matching row is used).
        univ_b: Text identifying university B; matched the same way.

    Returns:
        A 3-tuple `(img_html, message, pdf_path)`: an HTML `<img>` tag holding the
        base64-encoded feature chart, a status message, and the path of the written PDF.
        When either input is empty or has no match,
        `(None, "One or both universities not found.", None)`.
    """
    names_lower = df['Institution_Name'].str.lower()

    def _first_match(text):
        """Return the first row whose name contains `text` literally, or None."""
        query = (text or "").strip().lower()
        if not query:
            # An empty query would otherwise match every row.
            return None
        # regex=False: characters such as "(" or "+" are matched literally.
        hits = df[names_lower.str.contains(query, regex=False)]
        return None if hits.empty else hits.iloc[0]

    row_a = _first_match(univ_a)
    row_b = _first_match(univ_b)
    if row_a is None or row_b is None:
        return None, "One or both universities not found.", None

    vals1 = row_a[feature_cols].to_numpy(dtype=float)
    vals2 = row_b[feature_cols].to_numpy(dtype=float)

    X1 = scaler.transform(vals1.reshape(1, -1))
    X2 = scaler.transform(vals2.reshape(1, -1))
    # verbose=0 keeps Keras' progress bar out of the cell output on every click.
    prob = model_rank.predict([X1, X2], verbose=0)[0][0] * 100
    better = row_a['Institution_Name'] if prob >= 50 else row_b['Institution_Name']

    # Grouped horizontal bars: one pair of bars per feature.
    x = np.arange(len(feature_cols))
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(x - 0.2, vals1, height=0.4, label=row_a['Institution_Name'])
    ax.barh(x + 0.2, vals2, height=0.4, label=row_b['Institution_Name'])
    ax.set_yticks(x)
    ax.set_yticklabels(feature_cols)
    ax.set_title("Feature Comparison")
    ax.legend()
    fig.tight_layout()  # Adjust layout to prevent labels overlapping
    img_buf = BytesIO()
    fig.savefig(img_buf, format='png')
    plt.close(fig)  # Close the figure to free memory
    img_base64 = base64.b64encode(img_buf.getvalue()).decode('utf-8')
    img_html = f'<img src="data:image/png;base64,{img_base64}"/>'

    # Generate PDF (text only; the chart is shown in the UI and is not embedded here).
    os.makedirs("reports", exist_ok=True)
    pdf_path = "reports/ranking_report.pdf"
    c = canvas.Canvas(pdf_path, pagesize=letter)
    c.setFont("Helvetica", 12)
    text = c.beginText(40, 750)
    text.textLine("Pairwise Ranking Prediction")
    text.textLine(f"Predicted better ranked: {better}")
    text.textLine(f"Probability University A better: {prob:.1f}%")
    c.drawText(text)
    c.save()

    return img_html, f"✅ PDF report generated: {pdf_path}", pdf_path


def show_top10():
    """Render the ten rows with the smallest RANK_2025 as an inline bar chart.

    Returns:
        str: an HTML `<img>` tag whose source is the base64-encoded PNG of the chart.
    """
    best = df.nsmallest(10, 'RANK_2025')

    figure, axis = plt.subplots(figsize=(10, 8))
    axis.barh(best['Institution_Name'], best['RANK_2025'], color='green')
    axis.set(xlabel="RANK_2025", title="Top 10 Universities by RANK_2025")
    # Flip the y axis so the first row is drawn at the top.
    axis.invert_yaxis()
    figure.tight_layout()

    # Serialise the figure to PNG in memory, then release it.
    png = BytesIO()
    figure.savefig(png, format='png')
    plt.close(figure)

    encoded = base64.b64encode(png.getvalue()).decode('utf-8')
    return f'<img src="data:image/png;base64,{encoded}"/>'

# Gradio Interface Layout
with gr.Blocks() as demo:
    gr.Markdown("## 🌟 QS Ranking AI Dashboard")
    gr.Markdown("Data source: QS World University Rankings 2025")

    with gr.Tab("Data Overview"):
        gr.Markdown("### Raw Data")
        dataframe_output = gr.Dataframe(value=df, label="QS Ranking Data")

    with gr.Tab("University Prediction"):
        gr.Markdown("### 🔍 University Prediction")
        univ_input = gr.Textbox(label="Enter university name:")
        # Pre-select a model so the callback never receives None for this choice.
        model_type = gr.Radio(["Classification", "Regression"], value="Classification",
                              label="Select model type:")
        predict_button = gr.Button("Predict")
        prediction_output1 = gr.Textbox(label="Prediction Result 1")
        prediction_output2 = gr.Textbox(label="Prediction Result 2")


    with gr.Tab("Pairwise Ranking"):
        gr.Markdown("### 🤝 Pairwise Ranking")
        with gr.Row():
            univ_a = gr.Textbox(label="University A")
            univ_b = gr.Textbox(label="University B")
        compare_button = gr.Button("Compare and Generate PDF")
        comparison_image = gr.HTML(label="Feature Comparison Plot")
        comparison_message = gr.Textbox(label="Comparison Result")
        pdf_download = gr.File(label="Download PDF Report")


    with gr.Tab("Top 10 Universities"):
        gr.Markdown("### 🏆 Top 10 Universities")
        top10_image = gr.HTML(label="Top 10 Universities Plot")


    def _predict_for_ui(name, kind):
        """Adapter: keep only as many return values as there are output textboxes.

        `predict_university` returns more values than the two components wired below;
        slicing keeps the number of returned values equal to the number of outputs.
        """
        return predict_university(name, kind)[:2]

    # Define interactions
    predict_button.click(
        _predict_for_ui,
        inputs=[univ_input, model_type],
        outputs=[prediction_output1, prediction_output2]
    )

    compare_button.click(
        compare_universities,
        inputs=[univ_a, univ_b],
        outputs=[comparison_image, comparison_message, pdf_download]
    )

    # Draw the top-10 chart once when the page loads.
    demo.load(
        show_top10,
        outputs=top10_image
    )

demo.launch(debug=True)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
