In [None]:
#Importing libraries
import numpy as np
import pandas as pd
import torch
from flask import Flask, request, jsonify
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

In [None]:
# Load the universities CSV into a DataFrame. The path is relative, so the
# file must sit in the notebook's working directory. Later cells read the
# 'description', 'university', 'world_rank' and 'country' columns from it.
df = pd.read_csv(filepath_or_buffer='All The Universities of Pakistan.csv')

In [None]:
#Preprocessing the description column: missing values become empty strings,
#text is lower-cased, and every character that is neither a word character
#nor whitespace is removed.
df['description'] = (
    df['description']
    .fillna('')
    .str.lower()
    # regex=True is required: recent pandas versions treat the pattern as a
    # literal string by default, which would leave punctuation untouched.
    # The raw string avoids the invalid-escape warning for \w and \s.
    .str.replace(r'[^\w\s]', '', regex=True)
)

In [None]:
# Load the pretrained 'bert-base-uncased' checkpoint: the encoder model and
# its matching tokenizer. Both names are used by get_bert_embeddings below.
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
#Function for BERT embeddings
def get_bert_embeddings(text):
    """Embed text by mean-pooling the model's last hidden state.

    Uses the module-level ``tokenizer`` and ``model``. Input is truncated to
    at most 512 tokens.

    Args:
        text: A string (or a list of strings) to embed.

    Returns:
        A 2-D NumPy array with one row per input text; a single string
        yields a single row.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Inference only: skip autograd bookkeeping so no graph is kept per call.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Weight each position by the attention mask so padding tokens (present
    # when several texts of different lengths are batched) do not dilute the
    # average. For a single unpadded text the mask is all ones, which makes
    # this identical to a plain mean over the token dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return (token_sum / token_count).detach().cpu().numpy()

# Embed every description, one row at a time, and keep the resulting arrays
# in a new 'bert_embedding' column for later similarity lookups.
df['bert_embedding'] = (
    df['description']
    .map(get_bert_embeddings)
)

In [None]:
#Function that recommends universities
def recommend_universities(user_interest, num_recommendations=5):
    """Rank universities by similarity between a user's interest and their descriptions.

    The interest text is embedded with ``get_bert_embeddings`` and compared,
    via cosine similarity, against the precomputed ``df['bert_embedding']``
    column. The module-level ``df`` is not modified.

    Args:
        user_interest: Free-text description of what the user is looking for.
        num_recommendations: Maximum number of rows to return. Values below
            zero are treated as zero.

    Returns:
        A new DataFrame, best match first, with the columns 'university',
        'description', 'world_rank', 'country', 'similarity_score' and
        'explanation'.
    """
    user_embedding = get_bert_embeddings(user_interest)
    university_embeddings = np.vstack(df['bert_embedding'].values)
    similarities = cosine_similarity(user_embedding, university_embeddings).flatten()

    # A negative count would otherwise slice "all but the last k" rows.
    top_n = max(num_recommendations, 0)
    similar_indices = similarities.argsort()[::-1][:top_n]

    # Copy the selected rows so the extra columns are added to an independent
    # frame rather than to a slice of df (avoids SettingWithCopy problems).
    recommendations = df.iloc[similar_indices].copy()
    recommendations['similarity_score'] = similarities[similar_indices]

    # Tokenize the interest once instead of once per row.
    interest_words = set(user_interest.lower().split())

    def explain(description):
        # Sorted so the explanation text is deterministic across runs.
        shared = sorted(interest_words & set(description.split()))
        if not shared:
            # Avoid a dangling "Matches your interest in " with nothing after it.
            return 'No direct keyword overlap; matched on overall semantic similarity'
        return 'Matches your interest in ' + ', '.join(shared)

    recommendations['explanation'] = recommendations['description'].apply(explain)
    return recommendations[['university', 'description', 'world_rank', 'country', 'similarity_score', 'explanation']]