In [6]:
!pip install sentence_transformers
!pip install tensorflow scikit-learn
!pip install umap

Collecting umap
  Downloading umap-0.1.1.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap
  Building wheel for umap (setup.py) ... [?25l[?25hdone
  Created wheel for umap: filename=umap-0.1.1-py3-none-any.whl size=3542 sha256=2d5d4528f8d1634f1cbb5b21ecd3715afab7809298db1425b623b8996d1045e8
  Stored in directory: /root/.cache/pip/wheels/15/f1/28/53dcf7a309118ed35d810a5f9cb995217800f3f269ab5771cb
Successfully built umap
Installing collected packages: umap
Successfully installed umap-0.1.1


In [5]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import umap
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model



In [6]:
file_path = '/content/AgencyData.xlsx'
df = pd.read_excel(file_path)

df['features'] = df.apply(lambda row: ' '.join([
    str(row['Project Abstract']),
    str(row['Tech Stack']),
    str(row['Total Budget']),
    str(row['Project Duration (days)'])
]), axis=1)

In [7]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
df['features_text'] = df['features'].apply(lambda x: ' '.join(x.split()[:512]))
bert_embeddings = model.encode(df['features_text'].tolist(), show_progress_bar=True)

pca = PCA(n_components=50, random_state=42)
pca_result = pca.fit_transform(bert_embeddings)

input_dim = pca_result.shape[1]
encoding_dim = 10

input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
autoencoder.fit(pca_result, pca_result, epochs=50, batch_size=256, shuffle=True, validation_split=0.2, verbose=0)
encoded_features = encoder.predict(pca_result)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]



In [16]:
umap_params = {
    'n_neighbors': [5, 10, 15, 20, 25, 30],
    'min_dist': [0.1, 0.2, 0.3, 0.5, 0.7, 0.9],
    'n_components': [5, 10, 20],
    'metric': ['euclidean', 'cosine', 'manhattan']
}

best_umap_result = None
best_silhouette = -1
best_params = None

for n_neighbors in umap_params['n_neighbors']:
    for min_dist in umap_params['min_dist']:
        for n_components in umap_params['n_components']:
            for metric in umap_params['metric']:
                umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, metric=metric, random_state=42)
                umap_result = umap_model.fit_transform(encoded_features)
                silhouette_avg = silhouette_score(umap_result, KMeans(n_clusters=10, random_state=42, n_init=10).fit_predict(umap_result))
                if silhouette_avg > best_silhouette:
                    best_silhouette = silhouette_avg
                    best_umap_result = umap_result
                    best_params = (n_neighbors, min_dist, n_components, metric)


umap_result = best_umap_result

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.

In [17]:
def kmeans_grid_search(X):
    param_grid = {
        'n_clusters': [10, 15, 20, 25, 30],
        'init': ['k-means++', 'random'],
        'max_iter': [300, 600, 900],
        'n_init': [10, 20, 30],
        'random_state': [42]
    }
    kmeans = KMeans()
    grid_search = GridSearchCV(estimator=kmeans, param_grid=param_grid, cv=3, n_jobs=-1, scoring=make_scorer(silhouette_scorer))
    grid_search.fit(X)
    return grid_search.best_params_


best_params_umap = kmeans_grid_search(umap_result)

  pid = os.fork()
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


In [18]:
kmeans_best_umap = KMeans(**best_params_umap)
kmeans_best_umap.fit(umap_result)
df['best_umap_cluster'] = kmeans_best_umap.labels_

silhouette_avg_best_umap = silhouette_score(umap_result, kmeans_best_umap.labels_)

In [19]:
def get_similar_projects(project_name, df, umap_result, cluster_column):
    cluster = df[df['Project Name'] == project_name][cluster_column].values[0]
    cluster_projects = df[df[cluster_column] == cluster]
    project_index = cluster_projects[cluster_projects['Project Name'] == project_name].index[0]

    cosine_sim_cluster = cosine_similarity(umap_result[cluster_projects.index], umap_result[cluster_projects.index])
    sim_scores = list(enumerate(cosine_sim_cluster[project_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]
    project_indices = [cluster_projects.index[i[0]] for i in sim_scores]
    return df['Project Name'].iloc[project_indices]


project_name = 'EduCraft: Teacher Mastery Program'
similar_projects_kmeans = get_similar_projects(project_name, df, umap_result, 'best_umap_cluster')


print("Best UMAP parameters:", best_params)
print("Similar projects using K-means with UMAP:")
print(similar_projects_kmeans)
print("Silhouette Score for K-means:", silhouette_avg_best_umap)

Best UMAP parameters: (5, 0.1, 5, 'cosine')
Similar projects using K-means with UMAP:
17     SafeHaven School Security Enhancement Initiative
0           LiteraVate: Literacy Empowerment Initiative
5                 WisdomCraft Lifelong Learning Program
8           EduShift Transformation Catalyst Initiative
19    EduSpark Innovative Teaching and Learning Program
Name: Project Name, dtype: object
Silhouette Score for K-means: 0.57080156
