In [None]:
# %pip install pydantic requests feedparser python-dateutil beautifulsoup4
# %pip install fastapi uvicorn
# %pip install slowapi redis[hiredis] slowapi[redis]
%pip install prometheus_client

### Main Function

In [None]:
from sources.rss_feeds import RSSFeeds
from storage import save_to_json

In [None]:
# RSS feeds that are aggregated into a single job list.
rss_urls = [
    "https://weworkremotely.com/categories/remote-programming-jobs.rss",
    "https://remotive.com/remote-jobs/feed",
    "https://remoteok.com/remote-python-jobs.rss"
]

# One RSSFeeds instance is reused for every URL.
source = RSSFeeds()

all_jobs = []
for url in rss_urls:
    jobs = source.fetch(url)
    all_jobs += jobs  # in-place extend with this feed's results

# Persist the combined list, then report how many entries were collected.
save_to_json(all_jobs)
print(f"Saved {len(all_jobs)} jobs")

## Remotive

In [None]:
def extract_technical_section(text: str, max_chars: int = 2000) -> str:
    """Return the slice of ``text`` that starts at a known section heading.

    The headings below are tried in priority order (list order, not order of
    appearance in ``text``). The first heading found as a case-insensitive
    substring marks the start of the returned slice.

    Args:
        text: Plain-text job description.
        max_chars: Maximum number of characters to return, counted from the
            start of the matched heading.

    Returns:
        At most ``max_chars`` characters of ``text`` beginning at the matched
        heading, or ``text`` unchanged when no heading is found.
    """
    # Every entry needs its own trailing comma: adjacent string literals are
    # silently concatenated into a single (never matching) heading otherwise.
    sections = (
        "technical requirements",
        "technical skills",
        "requirements",
        "ideal candidate",
        "nice to have",
        "qualifications",
    )

    # Offsets are located in the lowercased copy and applied to the original.
    lower = text.lower()
    for section in sections:
        start = lower.find(section)
        if start != -1:
            return text[start:start + max_chars]

    return text

In [None]:
import re

# Vocabulary of skill names searched for in job descriptions.
# Each group must end with a trailing comma: adjacent string literals are
# concatenated by Python, which would merge the last entry of one group with
# the first entry of the next (e.g. "flutter" + "react" -> "flutterreact").
# NOTE(review): very short entries such as "c" and "cv" may match unrelated
# text (e.g. "CV" as in résumé) — confirm whether they should stay.
SKILLS_VOCAB = [
    # Programming Languages
    "swift", "python", "java", "kotlin", "objective-c",
    "c", "c++", "javascript", "typescript", "go", "rust",
    "ruby", "php", "sql", "bash", "rails", "dart", "tailwind css", "css",

    # Mobile Platforms
    "ios", "android", "ipados", "macos", "watchos", "tvos",

    # Apple Frameworks
    "swiftui", "uikit", "combine",
    "core data", "core animation", "core graphics",
    "avfoundation", "mapkit", "storekit", "healthkit",
    "icloud", "push notifications",

    # Android Frameworks
    "jetpack compose", "android studio", "gradle",
    "hilt", "dagger", "retrofit", "coroutines", "flutter",

    # Web & Backend Frameworks
    "react", "react native", "vue", "angular",
    "node.js", "express", "next.js",
    "django", "flask", "fastapi", "spring boot",

    # APIs & Protocols
    "rest api", "graphql", "websockets",
    "oauth", "openid connect", "jwt", "restful",

    # Data Engineering
    "apache airflow", "airflow",
    "dbt",
    "apache spark", "spark",
    "apache kafka", "kafka",
    "apache flink",
    "apache beam",
    "hadoop",
    "snowflake", "bigquery", "redshift",
    "databricks",
    "delta lake",
    "parquet", "avro",
    "data warehouse",
    "data lake",

    # Databases & Storage
    "sqlite", "postgresql", "mysql",
    "mongodb", "redis", "firebase",

    # DevOps & Cloud Platforms
    "aws", "gcp", "azure",
    "docker", "kubernetes",
    "terraform", "serverless",

    # CI/CD & Version Control
    "git", "github", "gitlab",
    "ci/cd", "fastlane", "jenkins", "selenium",

    # Development Tools
    "xcode", "visual studio code",
    "intellij idea", "postman",

    # Software Engineering Practices (mainstream)
    "unit testing", "integration testing",
    "dependency injection",
    "multithreading", "concurrency",
    "solid principles",

    # Architecture (mainstream)
    "mvc", "mvvm", "viper",
    "microservices", "monolith",
    "distributed systems",

    # Security
    "https", "tls", "ssl",
    "authentication", "authorization",
    "encryption", "keychain",

    # Monitoring & Analytics Tools
    "sentry", "datadog",
    "firebase analytics",

    # AI / Machine Learning
    "machine learning", "deep learning",
    "computer vision", "natural language processing",
    "nlp", "reinforcement learning", "cv",

    # GenAI & Agentic AI
    "large language models", "llm", "llms",
    "prompt engineering",
    "retrieval augmented generation",
    "rag", "langchain",
    "agentic ai",
    "autonomous agents",
    "tool calling",

    # ML Frameworks & Platforms
    "tensorflow", "pytorch", "keras",
    "scikit-learn",
    "hugging face",
    "onnx", "elasticsearch",

    # AI Infrastructure
    "model serving",
    "vector databases",
    "embedding models",

    # Data Science & Analysis
    "data analysis", "data visualization",
    "pandas", "numpy", "matplotlib", "seaborn",
    "Looker", "tableau", "power bi",
]


In [None]:
def extract_skills(text: str, skills_vocab: list[str]) -> list[str]:
    """Return the vocabulary entries that occur in ``text`` as whole tokens.

    Matching is case-insensitive: both ``text`` and each vocabulary entry are
    lowercased before searching. A match must not be directly preceded or
    followed by a word character, so "java" does not match inside
    "javascript".

    Args:
        text: Text to search.
        skills_vocab: Candidate skill names.

    Returns:
        The matching entries, spelled as they appear in ``skills_vocab``,
        deduplicated and sorted.
    """
    text = text.lower()
    found_skills = set()

    for skill in skills_vocab:
        # Lookarounds are used instead of \b because \b cannot match after a
        # non-word character: r"\bc\+\+\b" never matches "c++ " since there is
        # no word boundary between "+" and a space.
        pattern = r"(?<!\w)" + re.escape(skill.lower()) + r"(?!\w)"
        if re.search(pattern, text):
            found_skills.add(skill)

    return sorted(found_skills)

In [None]:
from bs4 import BeautifulSoup

def skills_finder(text: str):
    """Extract known skills from an HTML job description.

    The markup is parsed with BeautifulSoup, ``script``/``style``/``img``
    tags are removed, and the remaining text is narrowed with
    ``extract_technical_section`` before being matched against
    ``SKILLS_VOCAB`` via ``extract_skills``.

    Returns:
        The result of ``extract_skills`` when it is non-empty; otherwise the
        literal string ``"None"``.
    """
    soup = BeautifulSoup(text, "html.parser")

    # Drop elements that carry no readable description text.
    for unwanted in soup(["script", "style", "img"]):
        unwanted.decompose()

    # One line per text node, with surrounding whitespace stripped.
    plain_text = soup.get_text(separator="\n", strip=True)

    # Fall back to the full text when the section extractor yields nothing.
    candidate = extract_technical_section(plain_text) or plain_text

    matches = extract_skills(candidate, SKILLS_VOCAB)
    if matches:
        return matches
    return "None"

### Remotive API Fetching

In [None]:
import requests
from dateutil import parser

API_URL = "https://remotive.com/api/remote-jobs"

In [None]:
# Remotive categories requested from the API, one request per category.
queries = [
    "Software Development",
    "AI / ML",
    "DevOps / Sysadmin",
    "Data Analysis"
]

# Without a timeout, requests waits indefinitely on a stalled connection,
# which would hang the kernel.
REQUEST_TIMEOUT_SECONDS = 30

data = []

for query in queries:
    response = requests.get(
        API_URL,
        params={"category": query},
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Fail loudly on HTTP errors instead of parsing an error payload.
    response.raise_for_status()
    jobs = response.json().get("jobs", [])
    data.extend(jobs)

# Print a summary rather than dumping every job payload into the cell output.
print(f"Fetched {len(data)} jobs across {len(queries)} categories")

In [None]:
# Print a short report for every job fetched from the API.
for item in data:
    skills = skills_finder(item["description"])

    print("---- Job Posting ----")
    # Plain fields are printed one per line, in this fixed order.
    for field in ("title", "company_name", "url", "candidate_required_location"):
        print(item[field])
    print(parser.parse(item["publication_date"]))
    print("Extracted Skills:", skills)
    # Only the first 200 characters of the raw description are shown.
    print("\nDescription:", item["description"][:200])
    print('\n')

### Remotive RSS Fetching

In [None]:
import feedparser

feed_url = "https://remotive.com/remote-jobs/feed"
feed = feedparser.parse(feed_url)

# .get() avoids an AttributeError when the feed has no title
# (e.g. the download or parse failed).
print(f"Feed title: {feed.feed.get('title', 'N/A')}")
print(f"Number of entries: {len(feed.entries)}")

# Categories that are skipped in the listing below.
excluded_categories = {"Writing", "Sales / Business", "Marketing", "All others", "Education"}

for entry in feed.entries:
    # Indexing the "N/A" fallback string with [0]['term'] raises TypeError,
    # and an empty tag list raises IndexError, so resolve the first tag's
    # term defensively.
    tags = entry.get("tags") or []
    category = tags[0].get("term", "N/A") if tags else "N/A"
    if category in excluded_categories:
        continue

    skills = skills_finder(entry.get("summary", ""))

    print("-" * 40)
    print("Title:", entry.title)
    print("Link:", entry.link)
    print("Company:", entry.get("author", "N/A"))
    print("Location:", entry.get("location", "N/A"))
    print("Tags:", category)
    print("Published:", entry.get("published", "N/A"))
    print("Extracted Skills:", skills)
    print('\n')

## Indeed Alternatives

In [None]:
import feedparser

# Quick look at the RemoteOK Python feed: entry count plus the first five items.
rss_url = "https://remoteok.com/remote-python-jobs.rss"
feed = feedparser.parse(rss_url)

entry_count = len(feed.entries)
print(entry_count)

preview = feed.entries[:5]
for entry in preview:
    print(entry.title, "-", entry.link)

In [None]:
# Quick look at the We Work Remotely programming feed: entry count plus the
# first five items. The URL has no placeholders, so it is a plain string
# rather than an f-string.
rss_url = "https://weworkremotely.com/categories/remote-programming-jobs.rss"
feed = feedparser.parse(rss_url)
print(len(feed.entries))

for entry in feed.entries[:5]:
    print(entry.title, "-", entry.link)

In [None]:
# NOTE(review): `jobs` is reset here but nothing is appended to it in this cell.
jobs = []
for entry in feed.entries:
    skills = skills_finder(entry.get("summary", ""))

    # Indexing the "N/A" fallback string with [0]['term'] raises TypeError,
    # and an empty tag list raises IndexError, so resolve the first tag's
    # term defensively.
    tags = entry.get("tags") or []
    category = tags[0].get("term", "N/A") if tags else "N/A"

    # "N/A" is not a parseable date, so only call the parser when a
    # publication date is actually present.
    published_raw = entry.get("published")
    published = parser.parse(published_raw) if published_raw else "N/A"

    print("-" * 40)
    print("Title:", entry.title)
    print("Link:", entry.link)
    print("Company:", entry.get("company", "N/A"))
    print("Location:", entry.get("location", "N/A"))
    print("Tags:", category)
    print("Published:", published)
    print("Extracted Skills:", skills)
    print("Summary:", entry.get("summary", ""))
    print('\n')