In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Suppress every warning for the rest of the kernel session. Note that this
# also hides deprecation notices raised by the libraries used in later cells.
warnings.filterwarnings("ignore")

In [6]:
import csv
import re
import time

import requests
from bs4 import BeautifulSoup

# Base URL for RemoteOK engineer jobs
base_url = 'https://remoteok.com/remote-engineer-jobs'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}

job_list = []
page = 1  # Start from the first page
max_jobs = 200  # Upper bound on the number of jobs to collect
request_timeout = 30  # Seconds to wait for the server before abandoning a page
fieldnames = ['title', 'company', 'location', 'salary', 'date_posted']

# Timezone abbreviations, matched only when not embedded in a longer word.
# A plain substring test would also fire on place names such as "Budapest"
# or "West Coast" and silently discard them as timezone info.
timezone_hint = re.compile(r'(?<![a-z])(?:est|pst|cst|gmt|utc)(?![a-z])')

# Rows collected so far, keyed by every extracted field, so that a row which
# is served again on a later request is not stored twice.
# NOTE(review): if the server ignores the `page` query parameter, every request
# returns the same rows - confirm against the site. The "no new jobs" guard
# below ends the loop in that case instead of filling the list with repeats.
seen_jobs = set()

# Scrape page by page until max_jobs rows are collected, a page yields nothing
# new, a request fails, or an exception is raised.
while len(job_list) < max_jobs:
    url = f"{base_url}?page={page}" if page > 1 else base_url
    print(f"Scraping page {page}: {url}")

    try:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(url, headers=headers, timeout=request_timeout)
        if response.status_code != 200:
            print(f"Failed to fetch page {page}. Status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        jobs = soup.find_all('tr', class_='job')

        # Stop if no jobs found on the page
        if not jobs:
            print("No more jobs found.")
            break

        new_on_page = 0
        for job in jobs:
            # Stop as soon as the cap is reached
            if len(job_list) >= max_jobs:
                break

            # Extract job data
            title_tag = job.find('h2', {'itemprop': 'title'})
            company_tag = job.find('h3', {'itemprop': 'name'})

            # Initialize metadata with default values
            location = 'N/A'
            salary = 'Not Listed'
            date_posted = 'N/A'

            # Parse location and salary from metadata; when several entries
            # qualify for the same field, the last one wins.
            for meta in job.find_all('div', class_='location'):
                raw_text = meta.text.strip()
                text = raw_text.lower()
                if '$' in text:  # Salary field
                    salary = raw_text
                elif timezone_hint.search(text):  # Timezone hints
                    continue  # Skip timezone info
                else:  # Location (could also be date in some cases)
                    location = raw_text

            # Method 1: Look for time tag
            time_tag = job.find('time')
            if time_tag:
                date_posted = time_tag.text.strip()
            else:
                # Method 2: take the first text node containing "ago".
                # `string=` is the current name of bs4's former `text=` argument.
                # NOTE(review): this is a substring test, so a word such as
                # "Chicago" would also match - confirm this fallback is needed.
                for elem in job.find_all(string=True):
                    if 'ago' in elem.lower():
                        date_posted = elem.strip()
                        break

            job_data = {
                'title': title_tag.text.strip() if title_tag else 'N/A',
                'company': company_tag.text.strip() if company_tag else 'N/A',
                'location': location,
                'salary': salary,
                'date_posted': date_posted
            }

            # Skip rows that were already collected from an earlier request
            job_key = tuple(job_data[name] for name in fieldnames)
            if job_key in seen_jobs:
                continue
            seen_jobs.add(job_key)
            new_on_page += 1

            job_list.append(job_data)
            print(f"Collected job {len(job_list)}: {job_data['title']} | Posted: {job_data['date_posted']}")

        # A page that adds nothing new means further requests would only repeat rows
        if new_on_page == 0:
            print(f"Page {page} contained no new jobs; stopping.")
            break

        page += 1
        time.sleep(2)  # Be polite with the server

    except Exception as e:
        # Best effort: report the problem and keep whatever was collected so far
        print(f"Error occurred: {e}")
        break

# Save the collected jobs to CSV (the loop above never stores more than max_jobs)
with open('remote_jobs.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(job_list)

print(f"\nSuccessfully saved {len(job_list)} jobs to remote_jobs.csv")

Scraping page 1: https://remoteok.com/remote-engineer-jobs
Collected job 1: Software Engineer | Posted: 1d
Collected job 2: Senior Software Engineer KSPM | Posted: 1d
Collected job 3: Software Engineer III | Posted: 1d
Collected job 4: Back End Developer Winna.com | Posted: 9d
Collected job 5: Software Engineer Infrastructure | Posted: 2d
Collected job 6: Software Engineer Data | Posted: 2d
Collected job 7: Software Engineer III Frontend | Posted: 2d
Collected job 8: HQ Senior Frontend Engineer | Posted: 2d
Collected job 9: Software Engineer Microservices | Posted: 2d
Collected job 10: Senior Fullstack Engineer | Posted: 3d
Collected job 11: Senior React Full stack Developer | Posted: 3d
Collected job 12: Technical Product Manager Developer Platform | Posted: 3d
Collected job 13: DevOps Engineer | Posted: 4d
Collected job 14: Principal Software Engineer | Posted: 6d
Collected job 15: Staff Backend Engineer | Posted: 7d
Collected job 16: Software Engineer | Posted: 8d
Collected job 17: 

In [42]:
# Load the scraped job rows written by the scraping cell.
df = pd.read_csv(filepath_or_buffer="remote_jobs.csv")

In [44]:
import re
# Strip any leading run of characters that are not ASCII letters (emoji,
# symbols, digits, spaces) from each location value.
leading_non_letters = r"^[^a-zA-Z]+"
location_raw = df["location"]
df["location"] = location_raw.str.replace(leading_non_letters, "", regex=True)
df

Unnamed: 0,title,company,location,salary,date_posted
0,Software Engineer,Upvest,Probably worldwide,💰 $70k - $120k*,1d
1,Senior Software Engineer KSPM,Cast AI,Probably worldwide,💰 $70k - $120k*,1d
2,Software Engineer III,Jack Henry,Probably worldwide,💰 $55k - $100k*,1d
3,Back End Developer Winna.com,Winna,Worldwide,💰 $40k - $80k,9d
4,Software Engineer Infrastructure,Gauntlet,Worldwide,💰 $60k - $80k*,2d
...,...,...,...,...,...
195,Software Engineer,Serotonin,Probably worldwide,💰 $70k - $120k*,8d
196,Solidity Engineer Smart Contracts Engineer,Sei Labs,Probably worldwide,💰 $55k - $100k*,8d
197,Full Stack Engineer,Stealth,Contractor,💰 $60k - $180k,16d
198,Staff Software Engineer Metrics US,Weights & Biases,United States,💰 $60k - $80k*,10d


In [52]:
def clean_salary(s):
    """Normalise a scraped salary range to the form ``"<low>$-<high>$"``.

    Args:
        s: Raw salary text such as ``"💰 $70k - $120k*"``. Non-string values
            (for example ``None`` or a float NaN) are treated as missing.

    Returns:
        A string like ``"70000$-120000$"`` when exactly two ``<digits>k``
        amounts are found, otherwise ``None``.
    """
    # re.sub raises TypeError on non-strings, so treat them as "no salary"
    if not isinstance(s, str):
        return None
    # Keep only digits, whitespace, k/K, "-" and "$" before extracting amounts
    s = re.sub(r"[^\d\skK\-\$]", "", s)
    matches = re.findall(r"(\d+)[kK]", s)
    if len(matches) == 2:
        low = int(matches[0]) * 1000
        high = int(matches[1]) * 1000
        return f"{low}$-{high}$"
    return None

# Normalise every raw salary string with clean_salary and store the result
# in a new column.
normalised_salary = df["salary"].apply(clean_salary)
df["cleaned_salary"] = normalised_salary
df

Unnamed: 0,title,company,location,salary,date_posted,cleaned_salary
0,Software Engineer,Upvest,Probably worldwide,💰 $70k - $120k*,1d,70000$-120000$
1,Senior Software Engineer KSPM,Cast AI,Probably worldwide,💰 $70k - $120k*,1d,70000$-120000$
2,Software Engineer III,Jack Henry,Probably worldwide,💰 $55k - $100k*,1d,55000$-100000$
3,Back End Developer Winna.com,Winna,Worldwide,💰 $40k - $80k,9d,40000$-80000$
4,Software Engineer Infrastructure,Gauntlet,Worldwide,💰 $60k - $80k*,2d,60000$-80000$
...,...,...,...,...,...,...
195,Software Engineer,Serotonin,Probably worldwide,💰 $70k - $120k*,8d,70000$-120000$
196,Solidity Engineer Smart Contracts Engineer,Sei Labs,Probably worldwide,💰 $55k - $100k*,8d,55000$-100000$
197,Full Stack Engineer,Stealth,Contractor,💰 $60k - $180k,16d,60000$-180000$
198,Staff Software Engineer Metrics US,Weights & Biases,United States,💰 $60k - $80k*,10d,60000$-80000$


In [54]:
# The raw salary text is no longer needed once "cleaned_salary" exists.
df = df.drop(columns=["salary"])

In [56]:
# Show the frame now that the raw "salary" column has been dropped.
df

Unnamed: 0,title,company,location,date_posted,cleaned_salary
0,Software Engineer,Upvest,Probably worldwide,1d,70000$-120000$
1,Senior Software Engineer KSPM,Cast AI,Probably worldwide,1d,70000$-120000$
2,Software Engineer III,Jack Henry,Probably worldwide,1d,55000$-100000$
3,Back End Developer Winna.com,Winna,Worldwide,9d,40000$-80000$
4,Software Engineer Infrastructure,Gauntlet,Worldwide,2d,60000$-80000$
...,...,...,...,...,...
195,Software Engineer,Serotonin,Probably worldwide,8d,70000$-120000$
196,Solidity Engineer Smart Contracts Engineer,Sei Labs,Probably worldwide,8d,55000$-100000$
197,Full Stack Engineer,Stealth,Contractor,16d,60000$-180000$
198,Staff Software Engineer Metrics US,Weights & Biases,United States,10d,60000$-80000$


In [58]:
# Give the normalised salary column its display name.
df = df.rename({"cleaned_salary": "Salary in $"}, axis="columns")

In [60]:
# Summarise column dtypes and non-null counts after the rename.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        200 non-null    object
 1   company      200 non-null    object
 2   location     200 non-null    object
 3   date_posted  200 non-null    object
 4   Salary in $  200 non-null    object
dtypes: object(5)
memory usage: 7.9+ KB


In [62]:
from datetime import datetime, timedelta

# Reference moment used to turn relative ages into absolute dates
today = datetime.today()

def convert_to_date(x):
    """Convert a relative age such as ``"9d"`` into a ``YYYY-MM-DD`` date string.

    Args:
        x: Age in days with an optional ``d`` marker (``"9d"``, ``"9"``).

    Returns:
        The date ``x`` days before ``today`` formatted as ``'%Y-%m-%d'``, or
        ``None`` when ``x`` is not a whole number of days (for example the
        scraper's ``'N/A'`` placeholder, a missing value, or another unit).
    """
    try:
        days = int(str(x).replace("d", ""))
    except ValueError:
        # Not a day count; report it as missing instead of aborting the column
        return None
    return (today - timedelta(days=days)).strftime('%Y-%m-%d')

# Turn each relative age into an absolute date string in a new column
posted_dates = df["date_posted"].apply(convert_to_date)
df["Posted_date"] = posted_dates

df

Unnamed: 0,title,company,location,date_posted,Salary in $,Posted_date
0,Software Engineer,Upvest,Probably worldwide,1d,70000$-120000$,2025-04-24
1,Senior Software Engineer KSPM,Cast AI,Probably worldwide,1d,70000$-120000$,2025-04-24
2,Software Engineer III,Jack Henry,Probably worldwide,1d,55000$-100000$,2025-04-24
3,Back End Developer Winna.com,Winna,Worldwide,9d,40000$-80000$,2025-04-16
4,Software Engineer Infrastructure,Gauntlet,Worldwide,2d,60000$-80000$,2025-04-23
...,...,...,...,...,...,...
195,Software Engineer,Serotonin,Probably worldwide,8d,70000$-120000$,2025-04-17
196,Solidity Engineer Smart Contracts Engineer,Sei Labs,Probably worldwide,8d,55000$-100000$,2025-04-17
197,Full Stack Engineer,Stealth,Contractor,16d,60000$-180000$,2025-04-09
198,Staff Software Engineer Metrics US,Weights & Biases,United States,10d,60000$-80000$,2025-04-15


In [64]:
# The relative age is redundant now that "Posted_date" holds the absolute date.
df = df.drop(columns=["date_posted"])

In [66]:
# Show the frame now that the relative "date_posted" column has been dropped.
df

Unnamed: 0,title,company,location,Salary in $,Posted_date
0,Software Engineer,Upvest,Probably worldwide,70000$-120000$,2025-04-24
1,Senior Software Engineer KSPM,Cast AI,Probably worldwide,70000$-120000$,2025-04-24
2,Software Engineer III,Jack Henry,Probably worldwide,55000$-100000$,2025-04-24
3,Back End Developer Winna.com,Winna,Worldwide,40000$-80000$,2025-04-16
4,Software Engineer Infrastructure,Gauntlet,Worldwide,60000$-80000$,2025-04-23
...,...,...,...,...,...
195,Software Engineer,Serotonin,Probably worldwide,70000$-120000$,2025-04-17
196,Solidity Engineer Smart Contracts Engineer,Sei Labs,Probably worldwide,55000$-100000$,2025-04-17
197,Full Stack Engineer,Stealth,Contractor,60000$-180000$,2025-04-09
198,Staff Software Engineer Metrics US,Weights & Biases,United States,60000$-80000$,2025-04-15


In [68]:
# Re-check dtypes and non-null counts; "Posted_date" still holds the strings
# produced by convert_to_date at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        200 non-null    object
 1   company      200 non-null    object
 2   location     200 non-null    object
 3   Salary in $  200 non-null    object
 4   Posted_date  200 non-null    object
dtypes: object(5)
memory usage: 7.9+ KB


In [70]:
# Parse the date strings into real datetime values.
posted_as_datetime = pd.to_datetime(df["Posted_date"])
df["Posted_date"] = posted_as_datetime

In [72]:
# List the distinct salary-range strings present in the column.
df['Salary in $'].unique()

array(['70000$-120000$', '55000$-100000$', '40000$-80000$',
       '60000$-80000$', '75000$-120000$', '63000$-85000$',
       '68000$-98000$', '60000$-230000$', '63000$-88000$',
       '65000$-85000$', '65000$-120000$', '70000$-90000$',
       '60000$-180000$'], dtype=object)

In [74]:
# Remove the literal "$" signs so each value reads "low-high".
salary_with_symbols = df["Salary in $"]
df["Salary in $"] = salary_with_symbols.str.replace("$", "", regex=False)

In [76]:
# Show the frame after the "$" signs were removed from the salary ranges.
df

Unnamed: 0,title,company,location,Salary in $,Posted_date
0,Software Engineer,Upvest,Probably worldwide,70000-120000,2025-04-24
1,Senior Software Engineer KSPM,Cast AI,Probably worldwide,70000-120000,2025-04-24
2,Software Engineer III,Jack Henry,Probably worldwide,55000-100000,2025-04-24
3,Back End Developer Winna.com,Winna,Worldwide,40000-80000,2025-04-16
4,Software Engineer Infrastructure,Gauntlet,Worldwide,60000-80000,2025-04-23
...,...,...,...,...,...
195,Software Engineer,Serotonin,Probably worldwide,70000-120000,2025-04-17
196,Solidity Engineer Smart Contracts Engineer,Sei Labs,Probably worldwide,55000-100000,2025-04-17
197,Full Stack Engineer,Stealth,Contractor,60000-180000,2025-04-09
198,Staff Software Engineer Metrics US,Weights & Biases,United States,60000-80000,2025-04-15


In [78]:
def get_mean_salary(s):
    """Return the midpoint of a ``"low-high"`` salary range.

    Args:
        s: Range text such as ``"70000-120000"``. Non-string values (``None``
            or NaN, which is what rows without a parsed salary contain) are
            treated as missing.

    Returns:
        ``(low + high) / 2`` as a float, or NaN when ``s`` is missing.

    Raises:
        ValueError: If ``s`` is a string that does not split into exactly two
            integers around ``-``.
    """
    # Rows without a parsed salary hold None/NaN; keep them missing rather
    # than failing on .split
    if not isinstance(s, str):
        return float("nan")
    low, high = map(int, s.split('-'))
    return (low + high) / 2

# Midpoint of each salary range, stored as a new numeric column
salary_midpoints = df["Salary in $"].apply(get_mean_salary)
df["expected_salary"] = salary_midpoints

In [80]:
# Show the frame with the new "expected_salary" column.
df

Unnamed: 0,title,company,location,Salary in $,Posted_date,expected_salary
0,Software Engineer,Upvest,Probably worldwide,70000-120000,2025-04-24,95000.0
1,Senior Software Engineer KSPM,Cast AI,Probably worldwide,70000-120000,2025-04-24,95000.0
2,Software Engineer III,Jack Henry,Probably worldwide,55000-100000,2025-04-24,77500.0
3,Back End Developer Winna.com,Winna,Worldwide,40000-80000,2025-04-16,60000.0
4,Software Engineer Infrastructure,Gauntlet,Worldwide,60000-80000,2025-04-23,70000.0
...,...,...,...,...,...,...
195,Software Engineer,Serotonin,Probably worldwide,70000-120000,2025-04-17,95000.0
196,Solidity Engineer Smart Contracts Engineer,Sei Labs,Probably worldwide,55000-100000,2025-04-17,77500.0
197,Full Stack Engineer,Stealth,Contractor,60000-180000,2025-04-09,120000.0
198,Staff Software Engineer Metrics US,Weights & Biases,United States,60000-80000,2025-04-15,70000.0


In [82]:
# Split each "low-high" range into two integer columns
salary_bounds = df["Salary in $"].str.split('-', expand=True)
df[["min_salary", "max_salary"]] = salary_bounds.astype(int)
df

Unnamed: 0,title,company,location,Salary in $,Posted_date,expected_salary,min_salary,max_salary
0,Software Engineer,Upvest,Probably worldwide,70000-120000,2025-04-24,95000.0,70000,120000
1,Senior Software Engineer KSPM,Cast AI,Probably worldwide,70000-120000,2025-04-24,95000.0,70000,120000
2,Software Engineer III,Jack Henry,Probably worldwide,55000-100000,2025-04-24,77500.0,55000,100000
3,Back End Developer Winna.com,Winna,Worldwide,40000-80000,2025-04-16,60000.0,40000,80000
4,Software Engineer Infrastructure,Gauntlet,Worldwide,60000-80000,2025-04-23,70000.0,60000,80000
...,...,...,...,...,...,...,...,...
195,Software Engineer,Serotonin,Probably worldwide,70000-120000,2025-04-17,95000.0,70000,120000
196,Solidity Engineer Smart Contracts Engineer,Sei Labs,Probably worldwide,55000-100000,2025-04-17,77500.0,55000,100000
197,Full Stack Engineer,Stealth,Contractor,60000-180000,2025-04-09,120000.0,60000,180000
198,Staff Software Engineer Metrics US,Weights & Biases,United States,60000-80000,2025-04-15,70000.0,60000,80000


In [88]:
# Final check of dtypes and non-null counts, including the new salary columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   title            200 non-null    object        
 1   company          200 non-null    object        
 2   location         200 non-null    object        
 3   Salary in $      200 non-null    object        
 4   Posted_date      200 non-null    datetime64[ns]
 5   expected_salary  200 non-null    float64       
 6   min_salary       200 non-null    int32         
 7   max_salary       200 non-null    int32         
dtypes: datetime64[ns](1), float64(1), int32(2), object(4)
memory usage: 11.1+ KB


In [90]:
# Persist the cleaned frame, leaving the row index out of the file.
df.to_csv(
    "remote_jobs_clean.csv",
    encoding='utf-8',
    index=False,
)