In [7]:
!pip install PyGithub sentence-transformers scikit-learn tqdm




In [16]:
import os
from getpass import getpass

from github import Github

# Never paste a real token into the notebook: it would be saved with the file
# and leak on share/commit. Read it from the GITHUB_TOKEN environment variable
# and fall back to a hidden interactive prompt when the variable is unset.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") or getpass("GitHub personal access token: ")
if not GITHUB_TOKEN:
    raise ValueError("A GitHub token is required (set GITHUB_TOKEN or enter it at the prompt).")

g = Github(GITHUB_TOKEN)
# With no argument, get_user() refers to the account that owns the token;
# printing its login doubles as a check that authentication worked.
user = g.get_user()
print(f"Authenticated as: {user.login}")



Authenticated as: MuhammadMoiz1


In [9]:
# Materialise the user's repositories once; later cells reuse `user_repos`.
user_repos = list(user.get_repos())
# Distinct non-empty `language` values found across those repositories.
languages = set()

for repo in user_repos:
    try:
        if repo.language:
            languages.add(repo.language)
    except Exception as exc:
        # Best-effort: a repository whose metadata cannot be read is skipped.
        # Catching Exception (not a bare except) keeps the cell interruptible,
        # and the message makes the skip visible instead of silent.
        print(f"Skipping a repository while collecting languages: {exc}")
        continue

print(f"Languages used by the user: {languages}")


Languages used by the user: {'Python', 'C#', 'C', 'HTML', 'JavaScript', 'EJS', 'Jupyter Notebook'}


In [10]:
from tqdm import tqdm

# Tunable knobs for the recommendation pool.
MIN_STARS = 100            # only consider repositories with more stars than this
REPOS_PER_LANGUAGE = 30    # how many search hits to take for each language
MAX_POOL_SIZE = 100        # overall cap on the pool

top_repos = []
seen_repo_ids = set()  # repository ids already in the pool, to avoid duplicates

# NOTE: the loop stops as soon as the pool is full, so languages visited late
# in the (unordered) set may contribute nothing to the pool.
for lang in languages:
    if len(top_repos) >= MAX_POOL_SIZE:
        break
    # Quote the language so multi-word names such as "Jupyter Notebook" stay a
    # single `language:` qualifier; unquoted, the second word would be parsed
    # as a separate free-text search term.
    query = f'language:"{lang}" stars:>{MIN_STARS}'
    results = g.search_repositories(query=query, sort='stars', order='desc')
    for repo in results[:REPOS_PER_LANGUAGE]:
        if repo.id not in seen_repo_ids:
            top_repos.append(repo)
            seen_repo_ids.add(repo.id)
        if len(top_repos) >= MAX_POOL_SIZE:
            break

print(f"Collected {len(top_repos)} repositories for recommendation pool.")


Collected 100 repositories for recommendation pool.


In [11]:
# Parallel lists: readmes[i] is the README text of the repository described by
# repo_meta[i]. Later cells index repo_meta with positions computed from
# readmes, so the two lists must always stay the same length.
readmes = []
repo_meta = []

for repo in tqdm(top_repos):
    try:
        # Gather everything that can fail BEFORE touching either list, so a
        # failure (e.g. in get_topics()) can never leave the lists misaligned.
        content = repo.get_readme().decoded_content.decode()
        meta = {
            "name": repo.full_name,
            "url": repo.html_url,
            "description": repo.description or "",
            "language": repo.language,
            "topics": repo.get_topics()
        }
    except Exception:
        # Best-effort: repositories without a readable README/metadata are
        # skipped. Exception (not a bare except) keeps the cell interruptible.
        continue
    readmes.append(content)
    repo_meta.append(meta)

assert len(readmes) == len(repo_meta), "readmes and repo_meta must stay aligned"
print(f"Readmes fetched: {len(readmes)}")


100%|██████████| 100/100 [01:42<00:00,  1.03s/it]

Readmes fetched: 100





In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Lexical features: TF-IDF over the fetched READMEs, English stop words
# removed, vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(readmes)

# Sentence-transformer embeddings of the same README texts.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings = bert_model.encode(readmes, show_progress_bar=True)

# One row per README: densified TF-IDF columns followed by the embedding columns.
combined_vectors = np.concatenate(
    (tfidf_matrix.toarray(), bert_embeddings),
    axis=1,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [13]:
# How many recommendations to keep.
TOP_K = 10

# Get README content of user's own repos
user_readmes = []
for repo in user_repos:
    try:
        readme = repo.get_readme()
        user_readmes.append(readme.decoded_content.decode())
    except Exception:
        # Best-effort: repositories without a readable README are skipped.
        # Exception (not a bare except) keeps the cell interruptible.
        continue

# Get average user embedding
if not user_readmes:
    raise ValueError("User has no accessible READMEs to base recommendations on.")

# Project the user's READMEs into the same feature space as the pool: the
# already-fitted TF-IDF vocabulary plus the same sentence-transformer model.
user_tfidf = tfidf.transform(user_readmes).toarray()
user_bert = bert_model.encode(user_readmes)

# Average the per-README rows into a single (1, n_features) profile vector.
user_vector = np.mean(np.hstack([user_tfidf, user_bert]), axis=0).reshape(1, -1)

# Cosine similarity
similarities = cosine_similarity(user_vector, combined_vectors)[0]
# argsort is ascending, so reverse it and keep the TOP_K highest-scoring rows.
top_indices = np.argsort(similarities)[::-1][:TOP_K]


In [14]:
# Report the selected repositories, best match first: one three-line entry
# (name/language, URL, description) per repository, followed by a blank line.
print("🔍 Top Recommended Repositories:\n")
for idx in top_indices:
    meta = repo_meta[idx]
    print(
        f"- {meta['name']} ({meta['language']})",
        f"  {meta['url']}",
        f"  📄 {meta['description']}\n",
        sep="\n",
    )


🔍 Top Recommended Repositories:

- josephmisiti/awesome-machine-learning (Python)
  https://github.com/josephmisiti/awesome-machine-learning
  📄 A curated list of awesome Machine Learning frameworks, libraries and software.

- AUTOMATIC1111/stable-diffusion-webui (Python)
  https://github.com/AUTOMATIC1111/stable-diffusion-webui
  📄 Stable Diffusion web UI

- thangchung/awesome-dotnet-core (C#)
  https://github.com/thangchung/awesome-dotnet-core
  📄 :honeybee: A collection of awesome .NET core libraries, tools, frameworks and software

- tensorflow/models (Python)
  https://github.com/tensorflow/models
  📄 Models and examples built with TensorFlow

- hacksider/Deep-Live-Cam (Python)
  https://github.com/hacksider/Deep-Live-Cam
  📄 real time face swap and one-click video deepfake with only a single image

- jasontaylordev/CleanArchitecture (C#)
  https://github.com/jasontaylordev/CleanArchitecture
  📄 Clean Architecture Solution Template for ASP.NET Core

- DevToys-app/DevToys (C#)
  ht