In [None]:

# Cell 1: Install and import dependencies
# !pip install requests pandas python-dotenv

import requests
import pandas as pd
import json
from datetime import datetime
import os
from dotenv import load_dotenv

load_dotenv()

GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
print("Ready to explore GitHub API!")
print(f"Authentication: {'‚úì Enabled' if GITHUB_TOKEN else '‚úó Disabled (60 req/hour limit)'}")

Ready to explore GitHub API!
Authentication: ‚úì Enabled


In [2]:
def check_rate_limit():
    """Check current API rate limit status"""
    url = "https://api.github.com/rate_limit"
    
    headers = {}
    if GITHUB_TOKEN:
        headers['Authorization'] = f'token {GITHUB_TOKEN}'
    
    response = requests.get(url, headers=headers)
    data = response.json()
    
    core = data['resources']['core']
    search = data['resources']['search']
    
    print("=== Rate Limit Status ===")
    print(f"Core API: {core['remaining']}/{core['limit']} remaining")
    print(f"Search API: {search['remaining']}/{search['limit']} remaining")
    print(f"Resets at: {datetime.fromtimestamp(core['reset'])}")
    
    return data

rate_info = check_rate_limit()

=== Rate Limit Status ===
Core API: 4997/5000 remaining
Search API: 30/30 remaining
Resets at: 2025-11-07 21:14:39


In [3]:
# Cell 3: Search for repositories by skill/keyword
def search_repos(query, sort='stars', per_page=10):
    """
    Search GitHub repositories
    
    Args:
        query: Search query (e.g., 'deep learning', 'react', 'python')
        sort: 'stars', 'forks', 'updated', 'help-wanted-issues'
        per_page: Number of results (max 100)
    """
    url = "https://api.github.com/search/repositories"
    
    params = {
        'q': query,
        'sort': sort,
        'order': 'desc',
        'per_page': per_page
    }
    
    headers = {
        'Accept': 'application/vnd.github.v3+json'
    }
    
    if GITHUB_TOKEN:
        headers['Authorization'] = f'token {GITHUB_TOKEN}'
    
    response = requests.get(url, params=params, headers=headers)
    
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        print(response.json())
        return None

# Try searching for "deep learning"
results = search_repos('deep learning', per_page=5)

if results:
    print(f"Total count: {results['total_count']:,}")
    print(f"Returned: {len(results['items'])} repos\n")
    
    for i, repo in enumerate(results['items'], 1):
        print(f"{i}. {repo['full_name']}")
        print(f"   ‚≠ê {repo['stargazers_count']:,} stars | üç¥ {repo['forks_count']:,} forks")
        print(f"   {repo['html_url']}")
        print()

Total count: 359,224
Returned: 5 repos

1. labmlai/annotated_deep_learning_paper_implementations
   ‚≠ê 64,176 stars | üç¥ 6,506 forks
   https://github.com/labmlai/annotated_deep_learning_paper_implementations

2. keras-team/keras
   ‚≠ê 63,537 stars | üç¥ 19,647 forks
   https://github.com/keras-team/keras

3. scutan90/DeepLearning-500-questions
   ‚≠ê 56,786 stars | üç¥ 15,973 forks
   https://github.com/scutan90/DeepLearning-500-questions

4. coqui-ai/TTS
   ‚≠ê 43,346 stars | üç¥ 5,742 forks
   https://github.com/coqui-ai/TTS

5. deepspeedai/DeepSpeed
   ‚≠ê 40,633 stars | üç¥ 4,613 forks
   https://github.com/deepspeedai/DeepSpeed



In [4]:
# Cell 4 (Fixed): Extract useful information into a DataFrame
def repos_to_dataframe(search_results):
    """Convert search results to a clean DataFrame"""
    
    if not search_results or 'items' not in search_results:
        return pd.DataFrame()
    
    items = search_results.get('items', [])
    
    if len(items) == 0:
        return pd.DataFrame()
    
    repos_data = []
    
    for repo in items:
        repos_data.append({
            'full_name': repo['full_name'],
            'owner': repo['owner']['login'],
            'name': repo['name'],
            'url': repo['html_url'],
            'description': repo.get('description', ''),
            'stars': repo['stargazers_count'],
            'forks': repo['forks_count'],
            'watchers': repo['watchers_count'],
            'language': repo.get('language', 'Unknown'),
            'topics': ', '.join(repo.get('topics', [])),
            'created_at': repo['created_at'],
            'updated_at': repo['updated_at'],
            'size': repo['size'],  # KB
            'open_issues': repo['open_issues_count'],
            'license': repo['license']['name'] if repo.get('license') else 'No license',
            'default_branch': repo.get('default_branch', 'main')
        })
    
    df = pd.DataFrame(repos_data)
    
    # Only convert dates if DataFrame is not empty
    if not df.empty:
        # Convert dates
        df['created_at'] = pd.to_datetime(df['created_at'])
        df['updated_at'] = pd.to_datetime(df['updated_at'])
    
    return df

# Create DataFrame
df = repos_to_dataframe(results)
df.head()

Unnamed: 0,full_name,owner,name,url,description,stars,forks,watchers,language,topics,created_at,updated_at,size,open_issues,license,default_branch
0,labmlai/annotated_deep_learning_paper_implemen...,labmlai,annotated_deep_learning_paper_implementations,https://github.com/labmlai/annotated_deep_lear...,üßë‚Äçüè´ 60+ Implementations/tutorials of deep lear...,64176,6506,64176,Python,"attention, deep-learning, deep-learning-tutori...",2020-08-25 02:29:34+00:00,2025-11-07 21:43:21+00:00,156412,27,MIT License,master
1,keras-team/keras,keras-team,keras,https://github.com/keras-team/keras,Deep Learning for humans,63537,19647,63537,Python,"data-science, deep-learning, jax, machine-lear...",2015-03-28 00:35:42+00:00,2025-11-07 22:14:31+00:00,48534,265,Apache License 2.0,master
2,scutan90/DeepLearning-500-questions,scutan90,DeepLearning-500-questions,https://github.com/scutan90/DeepLearning-500-q...,Ê∑±Â∫¶Â≠¶‰π†500ÈóÆÔºå‰ª•ÈóÆÁ≠îÂΩ¢ÂºèÂØπÂ∏∏Áî®ÁöÑÊ¶ÇÁéáÁü•ËØÜ„ÄÅÁ∫øÊÄß‰ª£Êï∞„ÄÅÊú∫Âô®Â≠¶‰π†„ÄÅÊ∑±Â∫¶Â≠¶‰π†„ÄÅËÆ°ÁÆóÊú∫ËßÜËßâÁ≠âÁÉ≠ÁÇπ...,56786,15973,56786,JavaScript,,2018-06-27 06:36:45+00:00,2025-11-07 20:34:10+00:00,207074,120,GNU General Public License v3.0,master
3,coqui-ai/TTS,coqui-ai,TTS,https://github.com/coqui-ai/TTS,üê∏üí¨ - a deep learning toolkit for Text-to-Speec...,43346,5742,43346,Python,"deep-learning, glow-tts, hifigan, melgan, mult...",2020-05-20 15:45:28+00:00,2025-11-08 01:02:17+00:00,170196,11,Mozilla Public License 2.0,dev
4,deepspeedai/DeepSpeed,deepspeedai,DeepSpeed,https://github.com/deepspeedai/DeepSpeed,DeepSpeed is a deep learning optimization libr...,40633,4613,40633,Python,"billion-parameters, compression, data-parallel...",2020-01-23 18:35:18+00:00,2025-11-07 23:25:21+00:00,243501,1238,Apache License 2.0,master


In [5]:
# Cell 5: Explore different search strategies
def explore_skill(skill_name, num_results=20):
    """Deep dive into a specific skill"""
    
    print(f"üîç Exploring: {skill_name}\n")
    
    # Strategy 1: Simple keyword search
    print("Strategy 1: Simple keyword search")
    simple = search_repos(skill_name, per_page=num_results)
    df_simple = repos_to_dataframe(simple)
    
    # Strategy 2: Search in name, description, and topics
    print("Strategy 2: Targeted search (name, description, topics)")
    targeted_query = f'{skill_name} in:name,description,topics'
    targeted = search_repos(targeted_query, per_page=num_results)
    df_targeted = repos_to_dataframe(targeted)
    
    # Strategy 3: Filter by language if applicable
    if skill_name.lower() in ['python', 'javascript', 'java', 'go', 'rust']:
        print(f"Strategy 3: Language-specific search")
        lang_query = f'language:{skill_name}'
        lang_results = search_repos(lang_query, per_page=num_results)
        df_lang = repos_to_dataframe(lang_results)
    else:
        df_lang = pd.DataFrame()
    
    return {
        'simple': df_simple,
        'targeted': df_targeted,
        'language': df_lang
    }

# Test with a skill from your database
skill_results = explore_skill('machine learning', num_results=10)

print("\n=== Simple Search Top 5 ===")
print(skill_results['simple'][['full_name', 'stars', 'language']].head())

print("\n=== Targeted Search Top 5 ===")
print(skill_results['targeted'][['full_name', 'stars', 'topics']].head())

üîç Exploring: machine learning

Strategy 1: Simple keyword search
Strategy 2: Targeted search (name, description, topics)

=== Simple Search Top 5 ===
                               full_name   stars          language
0                  tensorflow/tensorflow  192331               C++
1               huggingface/transformers  152220            Python
2             microsoft/ML-For-Beginners   78722  Jupyter Notebook
3                  fighting41love/funNLP   77058            Python
4  josephmisiti/awesome-machine-learning   70510            Python

=== Targeted Search Top 5 ===
                               full_name   stars  \
0                  tensorflow/tensorflow  192331   
1               huggingface/transformers  152220   
2             microsoft/ML-For-Beginners   78722   
3                  fighting41love/funNLP   77058   
4  josephmisiti/awesome-machine-learning   70510   

                                              topics  
0  deep-learning, deep-neural-networks, distri

In [6]:
# Cell 6: Get detailed information about a specific repository
def get_repo_details(owner, repo_name):
    """Get detailed information about a specific repository"""
    
    url = f"https://api.github.com/repos/{owner}/{repo_name}"
    
    headers = {
        'Accept': 'application/vnd.github.v3+json'
    }
    
    if GITHUB_TOKEN:
        headers['Authorization'] = f'token {GITHUB_TOKEN}'
    
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        return None

# Get details for a specific repo
repo_details = get_repo_details('tensorflow', 'tensorflow')

if repo_details:
    print(f"Repository: {repo_details['full_name']}")
    print(f"Description: {repo_details['description']}")
    print(f"Stars: {repo_details['stargazers_count']:,}")
    print(f"Forks: {repo_details['forks_count']:,}")
    print(f"Language: {repo_details['language']}")
    print(f"Topics: {', '.join(repo_details.get('topics', []))}")
    print(f"Homepage: {repo_details.get('homepage', 'N/A')}")
    print(f"Has Wiki: {repo_details['has_wiki']}")
    print(f"Has Issues: {repo_details['has_issues']}")

Repository: tensorflow/tensorflow
Description: An Open Source Machine Learning Framework for Everyone
Stars: 192,331
Forks: 74,965
Language: C++
Topics: deep-learning, deep-neural-networks, distributed, machine-learning, ml, neural-network, python, tensorflow
Homepage: https://tensorflow.org
Has Wiki: False
Has Issues: True


In [7]:
# Cell 7: Batch search multiple skills
def batch_search_skills(skills, top_n=5):
    """Search for multiple skills and compile results"""
    
    all_results = {}
    
    for skill in skills:
        print(f"Searching for: {skill}...")
        results = search_repos(skill, per_page=top_n)
        df = repos_to_dataframe(results)
        df['search_skill'] = skill  # Track which skill this came from
        all_results[skill] = df
    
    # Combine all results
    combined_df = pd.concat(all_results.values(), ignore_index=True)
    
    return combined_df

# Get skills from your database (simulated here)
sample_skills = [
    'python',
    'javascript', 
    'react',
    'machine learning',
    'docker'
]

batch_results = batch_search_skills(sample_skills, top_n=3)
print(f"\nTotal repos found: {len(batch_results)}")
print("\nSample of results:")
batch_results[['search_skill', 'full_name', 'stars', 'language']].head(10)

Searching for: python...
Searching for: javascript...
Searching for: react...
Searching for: machine learning...
Searching for: docker...

Total repos found: 15

Sample of results:


Unnamed: 0,search_skill,full_name,stars,language
0,python,donnemartin/system-design-primer,325796,Python
1,python,vinta/awesome-python,268235,Python
2,python,practical-tutorials/project-based-learning,249199,
3,javascript,freeCodeCamp/freeCodeCamp,431333,TypeScript
4,javascript,practical-tutorials/project-based-learning,249199,
5,javascript,facebook/react,240389,JavaScript
6,react,freeCodeCamp/freeCodeCamp,431333,TypeScript
7,react,facebook/react,240389,JavaScript
8,react,vercel/next.js,135510,JavaScript
9,machine learning,tensorflow/tensorflow,192331,C++


In [8]:
# Cell 8: Analyze what makes a good learning resource
from datetime import datetime, timezone

def analyze_repos(df):
    """Analyze repository characteristics"""
    
    print("=== Repository Analysis ===\n")
    
    # Language distribution
    print("Top Languages:")
    print(df['language'].value_counts().head())
    print()
    
    # Star distribution
    print("Star Statistics:")
    print(df['stars'].describe())
    print()
    
    # Most common topics
    all_topics = []
    for topics in df['topics'].dropna():
        if topics:
            all_topics.extend(topics.split(', '))
    
    topics_series = pd.Series(all_topics)
    print("Top Topics:")
    print(topics_series.value_counts().head(10))
    print()
    
    # Freshness (recently updated)
    # FIX: Use timezone-aware datetime
    now = datetime.now(timezone.utc)  # ‚Üê Changed this line
    df['days_since_update'] = (now - df['updated_at']).dt.days
    print("Update Recency (days since last update):")
    print(df['days_since_update'].describe())
    
    return df

# Analyze the batch results
analyzed = analyze_repos(batch_results)

=== Repository Analysis ===

Top Languages:
language
Python              4
JavaScript          3
TypeScript          2
C++                 1
Jupyter Notebook    1
Name: count, dtype: int64

Star Statistics:
count        15.000000
mean     222600.866667
std      112706.060694
min       78722.000000
25%      143865.000000
50%      240389.000000
75%      258717.000000
max      431333.000000
Name: stars, dtype: float64

Top Topics:
python              8
javascript          6
react               5
machine-learning    3
programming         3
education           3
nodejs              2
certification       2
curriculum          2
d3                  2
Name: count, dtype: int64

Update Recency (days since last update):
count    15.0
mean      0.0
std       0.0
min       0.0
25%       0.0
50%       0.0
75%       0.0
max       0.0
Name: days_since_update, dtype: float64


In [9]:
# Cell 9: Filter for "good learning resources"
from datetime import datetime, timezone

def filter_quality_repos(df, min_stars=100, max_age_days=365, has_topics=True):
    """
    Filter for high-quality learning resources
    
    Criteria:
    - Minimum star count (popular)
    - Recently updated (maintained)
    - Has topics (well-documented)
    - Has description (clear purpose)
    """
    
    filtered = df.copy()
    
    # Filter by stars
    filtered = filtered[filtered['stars'] >= min_stars]
    
    # Filter by recency
    if 'days_since_update' not in filtered.columns:
        now = datetime.now(timezone.utc)
        filtered['days_since_update'] = (now - filtered['updated_at']).dt.days
    filtered = filtered[filtered['days_since_update'] <= max_age_days]
    
    # Filter by topics
    if has_topics:
        filtered = filtered[filtered['topics'].str.len() > 0]
    
    # Filter by description
    filtered = filtered[filtered['description'].str.len() > 10]
    
    print(f"Original repos: {len(df)}")
    print(f"After quality filter: {len(filtered)}")
    print(f"Filtered out: {len(df) - len(filtered)} repos")
    
    return filtered.sort_values('stars', ascending=False)

quality_repos = filter_quality_repos(batch_results, min_stars=500)
quality_repos[['full_name', 'stars', 'days_since_update', 'topics']].head(10)

Original repos: 15
After quality filter: 15
Filtered out: 0 repos


Unnamed: 0,full_name,stars,days_since_update,topics
3,freeCodeCamp/freeCodeCamp,431333,0,"careers, certification, community, curriculum,..."
6,freeCodeCamp/freeCodeCamp,431333,0,"careers, certification, community, curriculum,..."
0,donnemartin/system-design-primer,325796,0,"design, design-patterns, design-system, develo..."
1,vinta/awesome-python,268235,0,"awesome, collections, python, python-framework..."
2,practical-tutorials/project-based-learning,249199,0,"beginner-project, cpp, golang, javascript, pro..."
4,practical-tutorials/project-based-learning,249199,0,"beginner-project, cpp, golang, javascript, pro..."
5,facebook/react,240389,0,"declarative, frontend, javascript, library, re..."
7,facebook/react,240389,0,"declarative, frontend, javascript, library, re..."
9,tensorflow/tensorflow,192331,0,"deep-learning, deep-neural-networks, distribut..."
12,ohmyzsh/ohmyzsh,182554,0,"cli, cli-app, oh-my-zsh, oh-my-zsh-plugin, oh-..."


In [10]:
# Cell 11: Test the exact query you'll use in production
def production_search_query(skill_name):
    """
    The exact search pattern we'll use in Airflow
    Optimized for finding learning resources
    """
    
    # Build a compound query
    queries_to_try = [
        f'{skill_name} in:name,description,topics',
        f'{skill_name} tutorial',
        f'{skill_name} awesome',  # "Awesome" lists are great resources
    ]
    
    all_repos = []
    
    for query in queries_to_try:
        print(f"Query: {query}")
        results = search_repos(query, per_page=5)
        if results:
            df = repos_to_dataframe(results)
            df['query_used'] = query
            all_repos.append(df)
    
    if all_repos:
        combined = pd.concat(all_repos, ignore_index=True)
        # Remove duplicates (same repo from different queries)
        combined = combined.drop_duplicates(subset=['full_name'])
        return combined
    else:
        return pd.DataFrame()

# Test with a skill from your table
test_skill = "Deep Learning"
production_results = production_search_query(test_skill)

print(f"\nFound {len(production_results)} unique repos for '{test_skill}'")
production_results[['full_name', 'stars', 'query_used']].head(10)

Query: Deep Learning in:name,description,topics
Query: Deep Learning tutorial
Query: Deep Learning awesome

Found 12 unique repos for 'Deep Learning'


Unnamed: 0,full_name,stars,query_used
0,labmlai/annotated_deep_learning_paper_implemen...,64176,"Deep Learning in:name,description,topics"
1,keras-team/keras,63537,"Deep Learning in:name,description,topics"
2,scutan90/DeepLearning-500-questions,56786,"Deep Learning in:name,description,topics"
3,coqui-ai/TTS,43346,"Deep Learning in:name,description,topics"
4,deepspeedai/DeepSpeed,40633,"Deep Learning in:name,description,topics"
6,yunjey/pytorch-tutorial,31898,Deep Learning tutorial
7,ChristosChristofidis/awesome-deep-learning,26490,Deep Learning tutorial
8,ujjwalkarn/Machine-Learning-Tutorials,17117,Deep Learning tutorial
9,Mikoto10032/DeepLearning,16776,Deep Learning tutorial
10,ashishpatel26/500-AI-Machine-learning-Deep-lea...,28612,Deep Learning awesome


In [11]:
# Cell 11 (Fixed): Production-ready search optimized for learning
def production_search_query(skill_name):
    """
    Optimized search for job seekers and learners
    Prioritizes: tutorials, examples, awesome lists, interview prep
    """
    
    # Build queries that find LEARNING resources, not source code
    queries_to_try = [
        # Awesome lists (curated resources)
        f'awesome {skill_name}',
        
        # Tutorial repositories
        f'{skill_name} tutorial',
        f'{skill_name} learn',
        f'{skill_name} course',
        
        # Project examples
        f'{skill_name} examples',
        f'{skill_name} projects',
        
        # Interview preparation
        f'{skill_name} interview',
        f'{skill_name} practice',
        
        # General learning (with filters to avoid source code)
        f'{skill_name} in:name,description,topics NOT language:C++ NOT language:C'
    ]
    
    all_repos = []
    
    for query in queries_to_try:
        print(f"Query: {query}")
        try:
            results = search_repos(query, per_page=5)
            if results and results.get('items'):
                df = repos_to_dataframe(results)
                if not df.empty:  # Only add if we got results
                    df['query_used'] = query
                    df['query_type'] = query.split()[0]  # 'awesome', 'tutorial', etc.
                    all_repos.append(df)
                    print(f"  ‚úì Found {len(df)} repos")
                else:
                    print(f"  ‚úó No results")
            else:
                print(f"  ‚úó No results")
        except Exception as e:
            print(f"  ‚úó Error: {e}")
            continue
    
    if all_repos:
        combined = pd.concat(all_repos, ignore_index=True)
        # Remove duplicates
        combined = combined.drop_duplicates(subset=['full_name'])
        
        # Score repos by learning value
        combined = score_learning_value(combined)
        
        return combined.sort_values('learning_score', ascending=False)
    else:
        print("‚ö† No results found for any query")
        return pd.DataFrame()

def score_learning_value(df):
    """
    Score repositories by how useful they are for learning/job prep
    Higher score = better learning resource
    """
    
    if df.empty:
        return df
    
    df = df.copy()
    df['learning_score'] = 0.0
    
    # Bonus points for educational keywords in name/description
    educational_keywords = [
        'tutorial', 'learn', 'course', 'guide', 'beginner',
        'awesome', 'examples', 'projects', 'interview', 'practice',
        'bootcamp', 'workshop', 'introduction', 'roadmap'
    ]
    
    for keyword in educational_keywords:
        df.loc[df['full_name'].str.lower().str.contains(keyword, na=False), 'learning_score'] += 10
        df.loc[df['description'].str.lower().str.contains(keyword, na=False), 'learning_score'] += 5
    
    # Penalize source code repos
    source_keywords = ['implementation', 'core', 'framework', 'engine', 'library']
    for keyword in source_keywords:
        df.loc[df['full_name'].str.lower().str.contains(keyword, na=False), 'learning_score'] -= 5
    
    # Bonus for stars (popularity = likely quality)
    df['learning_score'] += (df['stars'] / 1000).clip(0, 20)  # Max 20 bonus points
    
    # Calculate days_since_update if not already there
    if 'days_since_update' not in df.columns:
        from datetime import datetime, timezone
        now = datetime.now(timezone.utc)
        df['days_since_update'] = (now - df['updated_at']).dt.days
    
    # Bonus for recent updates (maintained)
    df.loc[df['days_since_update'] < 180, 'learning_score'] += 10
    df.loc[df['days_since_update'] < 90, 'learning_score'] += 5
    
    # Bonus for having good documentation indicators
    df.loc[df['topics'].str.contains('tutorial|education|learning', case=False, na=False), 'learning_score'] += 15
    
    # Penalize if it's a language's core repo (e.g., 'python/cpython')
    for idx, row in df.iterrows():
        parts = row['full_name'].split('/')
        if len(parts) == 2 and parts[0].lower() == parts[1].lower():
            df.loc[idx, 'learning_score'] -= 20
    
    return df

# Test with deep learning
test_skill = "deep learning"
production_results = production_search_query(test_skill)

if not production_results.empty:
    print(f"\nFound {len(production_results)} unique repos for '{test_skill}'")
    print("\n=== Top Learning Resources ===")
    print(production_results[['full_name', 'stars', 'query_type', 'learning_score']].head(15))
else:
    print("\n‚ö† No results to display")

Query: awesome deep learning
  ‚úì Found 5 repos
Query: deep learning tutorial
  ‚úì Found 5 repos
Query: deep learning learn
  ‚úì Found 5 repos
Query: deep learning course
  ‚úì Found 5 repos
Query: deep learning examples
  ‚úì Found 5 repos
Query: deep learning projects
  ‚úì Found 5 repos
Query: deep learning interview
  ‚úì Found 5 repos
Query: deep learning practice
  ‚úì Found 5 repos
Query: deep learning in:name,description,topics NOT language:C++ NOT language:C
  ‚úó No results

Found 34 unique repos for 'deep learning'

=== Top Learning Resources ===
                                            full_name  stars query_type  \
1          ChristosChristofidis/awesome-deep-learning  26490    awesome   
0   ashishpatel26/500-AI-Machine-learning-Deep-lea...  28612    awesome   
12     floodsung/Deep-Learning-Papers-Reading-Roadmap  39359       deep   
3               ujjwalkarn/Machine-Learning-Tutorials  17117    awesome   
2                terryum/awesome-deep-learning-papers  260

In [12]:
import psycopg2

In [13]:
PG_CONFIG = {
    "host": os.getenv("DB_HOST"),
    "port": int(os.getenv("DB_PORT", 5432)),
    "database": os.getenv("DB_NAME"),
    "user": os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
}

In [32]:
conn = psycopg2.connect(**PG_CONFIG)

cursor = conn.cursor()

skills_df = pd.read_sql("SELECT * FROM role_skills_by_title", conn)

print(f"Loaded {len(skills_df)} rows")
print(f"Columns: {skills_df.columns.tolist()}")
print("\nFirst few rows:")
display(skills_df.head())

  skills_df = pd.read_sql("SELECT * FROM role_skills_by_title", conn)


Loaded 4531 rows
Columns: ['title_lc', 'skills_for_role']

First few rows:


Unnamed: 0,title_lc,skills_for_role
0,software engineer - london,"[""agile"", ""aws"", ""clientside development"", ""co..."
1,senior .net software engineer - biotech instru...,"[""aws"", ""biology"", ""c#"", ""chemistry"", ""cloud c..."
2,sr. software engineer (starshield) - top secre...,"[""adaptability"", ""aerospace"", ""algorithm devel..."
3,amd-xilinx csp embedded software engineer (fl/...,"[""arm"", ""bsd sockets"", ""c"", ""c++"", ""embedded s..."
4,senior information assurance data analyst & ne...,"[""8570.01m./dod 8140"", ""amazon cloud services""..."


In [44]:
# Cell: Top 25 with categorization
import json
from collections import Counter

def extract_skills_frequency(df, skills_column='skills_for_role'):
    """
    Extract all skills and count their frequency
    """
    skill_counter = Counter()
    
    for skills_str in df[skills_column]:
        if pd.isna(skills_str):
            continue
        
        try:
            # Parse JSON string to list
            skills_list = json.loads(skills_str)
            
            # Update counter
            skill_counter.update(skills_list)
            
        except json.JSONDecodeError as e:
            print(f"Error parsing: {skills_str[:50]}... - {e}")
            continue
    
    return skill_counter

def categorize_skill(skill_name):
    """
    Categorize a skill as technical, soft, or domain
    """
    skill_lower = skill_name.lower()
    
    # Technical skills indicators
    technical_indicators = [
        'python', 'java', 'sql', 'aws', 'docker', 'kubernetes', 'git',
        'react', 'angular', 'node', 'c++', 'c#', 'linux', 'api', 'cloud',
        'database', 'devops', 'ci/cd', 'terraform', 'jenkins', 'spark',
        'hadoop', 'kafka', 'nosql', 'javascript', 'typescript', 'html',
        'css', 'azure', 'gcp', 'machine learning', 'ml', 'data science',
        'programming', 'software', 'development', 'engineering', 'coding'
    ]
    
    # Soft skills indicators
    soft_indicators = [
        'communication', 'leadership', 'teamwork', 'collaboration',
        'problem solving', 'problemsolving', 'creativity', 'adaptability',
        'mentoring', 'coaching', 'presentation', 'time management',
        'organization', 'analytical', 'detail', 'initiative', 'agile'
    ]
    
    # Check for matches
    if any(indicator in skill_lower for indicator in technical_indicators):
        return 'technical'
    elif any(indicator in skill_lower for indicator in soft_indicators):
        return 'soft'
    else:
        return 'other'

# Extract and categorize
skill_counter = extract_skills_frequency(skills_df)

top_25_with_category = pd.DataFrame([
    {
        'skill_name': skill,
        'frequency': count,
        'category': categorize_skill(skill)
    }
    for skill, count in skill_counter.most_common(100)
])

print(f"Total unique skills: {len(skill_counter)}")
print(f"\n{'='*70}")
print("TOP 25 MOST IN-DEMAND SKILLS (with categories)")
print(f"{'='*70}\n")
print(top_25_with_category.to_string(index=False))

# Summary by category
print(f"\n{'='*40}")
print("Category Breakdown in Top 25:")
print(f"{'='*40}")
print(top_25_with_category['category'].value_counts())



Total unique skills: 54004

TOP 25 MOST IN-DEMAND SKILLS (with categories)

             skill_name  frequency  category
                 python       2153 technical
                    sql       1510 technical
                   java       1267 technical
   software engineering       1160 technical
          data analysis       1041     other
          communication        999      soft
   software development        953 technical
                    c++        915 technical
                    aws        899 technical
       machine learning        830 technical
                  linux        731 technical
                    git        721 technical
       computer science        717     other
             javascript        717 technical
     data visualization        715     other
                  agile        699      soft
        problem solving        697      soft
   communication skills        687      soft
             kubernetes        647 technical
                 docker 

In [46]:
top_25_with_category[top_25_with_category['category'] == 'soft']

Unnamed: 0,skill_name,frequency,category
5,communication,999,soft
15,agile,699,soft
16,problem solving,697,soft
17,communication skills,687,soft
20,teamwork,621,soft
22,collaboration,558,soft
23,problemsolving,551,soft
32,analytical skills,455,soft
43,attention to detail,382,soft
46,leadership,371,soft


In [None]:
YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY')

In [52]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

In [53]:
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

In [54]:
def check_quota_status():
    """
    Test API connection and show quota info
    
    YouTube Data API quota:
    - 10,000 units per day
    - Search costs 100 units
    - Video details cost 1 unit
    - So you can do ~100 searches per day
    """
    try:
        # Simple test query (costs 100 units)
        request = youtube.search().list(
            part='snippet',
            q='communication skills',
            type='video',
            maxResults=1
        )
        response = request.execute()
        
        print("‚úì API connection successful!")
        print("\nQuota Info:")
        print("- Daily quota: 10,000 units")
        print("- Search cost: 100 units")
        print("- Video stats cost: 1 unit")
        print("- Estimated searches available: ~100 per day")
        
        return True
    except HttpError as e:
        print(f"‚úó API Error: {e}")
        return False

check_quota_status()

‚úì API connection successful!

Quota Info:
- Daily quota: 10,000 units
- Search cost: 100 units
- Video stats cost: 1 unit
- Estimated searches available: ~100 per day


True

In [56]:
# Cell 5: Test different search strategies for better results

def explore_search_strategies(skill_name):
    """
    Try different search queries to find best learning content
    """
    
    search_strategies = [
        f'{skill_name} skills tutorial',
        f'how to improve {skill_name}',
        f'{skill_name} training',
        f'{skill_name} for professionals',
        f'effective {skill_name} skills',
    ]
    
    all_videos = []
    
    for query in search_strategies:
        print(f"üîç Searching: {query}")
        
        try:
            search_request = youtube.search().list(
                part='snippet',
                q=query,
                type='video',
                order='relevance',
                maxResults=5,
                videoDuration='medium',
                relevanceLanguage='en'
            )
            
            response = search_request.execute()
            
            if 'items' in response:
                print(f"   ‚úì Found {len(response['items'])} videos")
                
                for item in response['items']:
                    all_videos.append({
                        'video_id': item['id']['videoId'],
                        'title': item['snippet']['title'],
                        'channel': item['snippet']['channelTitle'],
                        'url': f"https://www.youtube.com/watch?v={item['id']['videoId']}",
                        'query_used': query
                    })
            else:
                print(f"   ‚úó No results")
                
        except HttpError as e:
            print(f"   ‚úó Error: {e}")
    
    # Remove duplicates by video_id
    unique_videos = {v['video_id']: v for v in all_videos}.values()
    
    return pd.DataFrame(unique_videos)

# Test different strategies
strategy_results = explore_search_strategies("communication")

print(f"\n{'='*80}")
print(f"Found {len(strategy_results)} unique videos across all strategies")
print(f"{'='*80}\n")
print(strategy_results[['title', 'channel', 'query_used']].to_string(index=False))

üîç Searching: communication skills tutorial
   ‚úì Found 5 videos
üîç Searching: how to improve communication
   ‚úì Found 5 videos
üîç Searching: communication training
   ‚úì Found 5 videos
üîç Searching: communication for professionals
   ‚úì Found 5 videos
üîç Searching: effective communication skills
   ‚úì Found 5 videos

Found 11 unique videos across all strategies

                                                                                             title                    channel                      query_used
                       Give me 8 minutes, and I&#39;ll improve your communication skills by 88%...                Jak Piggott  effective communication skills
                       Listen to this if you want to level up your communication skills in 2025...                 Vinh Giang  effective communication skills
Top 5 Tips to Improve Communication Skills | Soft Skills For Beginners | Soft Skills | Simplilearn                Simplilearn   communication sk