In [1]:
import pandas as pd
import json

# Function to load a JSON file
def load_json_file(file_path):
    """Read and parse the UTF-8 encoded JSON document at ``file_path``.

    Args:
        file_path: Path of the JSON file to open.

    Returns:
        The decoded JSON content, or ``None`` when opening, reading or
        parsing raises any ``Exception``. In that case the error is
        printed instead of being re-raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.loads(handle.read())
    except Exception as err:
        # Best-effort load: report the problem and let the caller branch on None.
        print(f"Error loading {file_path}: {err}")
        return None
    return parsed

# Helper function to extract skill names from a list of dictionaries
def extract_skills(skills_list):
    """Collect the ``'name'`` value of every dict entry in ``skills_list``.

    Args:
        skills_list: Expected to be a list of dicts. Any value that is
            not a ``list`` produces an empty result.

    Returns:
        A list holding ``entry['name']`` for each entry, in order.
        Entries that are not dicts or lack a ``'name'`` key are skipped.
    """
    if not isinstance(skills_list, list):
        return []

    names = []
    for entry in skills_list:
        if isinstance(entry, dict) and 'name' in entry:
            names.append(entry['name'])
    return names

# Helper function to extract skills from a list of strings
def extract_skills_from_list(skills_list):
    """Return the items of ``skills_list`` converted to strings.

    Args:
        skills_list: Expected to be a list. Any value that is not a
            ``list`` produces an empty result.

    Returns:
        A new list with ``str(item)`` for each item, in order, or ``[]``
        when ``skills_list`` is not a list.
    """
    if not isinstance(skills_list, list):
        return []
    return list(map(str, skills_list))


In [2]:
# --- Process jobs.json ---
# load_json_file returns None on error; that (or an empty payload) falls back
# to an empty frame.
jobs_data = load_json_file('jobs.json')
if not jobs_data:
    df_jobs_cleaned = pd.DataFrame()
else:
    df_jobs = pd.DataFrame(jobs_data)
    # Collapse each posting's skill entries into one comma-separated string.
    df_jobs['skills'] = df_jobs['skills'].apply(
        lambda entries: ', '.join(extract_skills(entries)))
    # Keep the three shared columns under their display names and tag the origin.
    df_jobs_cleaned = (
        df_jobs[['title', 'description_text', 'skills']]
        .rename(columns={
            'title': 'Job Title',
            'description_text': 'Job Description',
            'skills': 'Skills',
        })
        .assign(Source='jobs.json')
    )

# --- Process 2025-08-20 7P.M..json ---
# NOTE(review): same steps as the jobs.json block; only the file name differs.
# Consider a shared helper.
upwork_jobs_data = load_json_file('2025-08-20 7P.M..json')
if not upwork_jobs_data:
    # load_json_file returns None on error; an empty payload lands here too.
    df_upwork_cleaned = pd.DataFrame()
else:
    df_upwork = pd.DataFrame(upwork_jobs_data)
    # Collapse each posting's skill entries into one comma-separated string.
    df_upwork['skills'] = df_upwork['skills'].apply(
        lambda entries: ', '.join(extract_skills(entries)))
    # Keep the three shared columns under their display names and tag the origin.
    df_upwork_cleaned = (
        df_upwork[['title', 'description_text', 'skills']]
        .rename(columns={
            'title': 'Job Title',
            'description_text': 'Job Description',
            'skills': 'Skills',
        })
        .assign(Source='2025-08-20 7P.M..json')
    )

# --- Process linkedin_jobs.json ---
# NOTE(review): same overall shape as the two blocks above, with different
# source columns and skill parser. Consider a shared helper.
linkedin_jobs_data = load_json_file('linkedin_jobs.json')
if not linkedin_jobs_data:
    # load_json_file returns None on error; an empty payload lands here too.
    df_linkedin_cleaned = pd.DataFrame()
else:
    df_linkedin = pd.DataFrame(linkedin_jobs_data)
    # Organisation-level fields are used as proxies here:
    #   'linkedin_org_description' -> 'Job Description'
    #   'linkedin_org_specialties' -> 'Skills'
    df_linkedin_cleaned = (
        df_linkedin[['title', 'linkedin_org_description', 'linkedin_org_specialties']]
        .rename(columns={
            'title': 'Job Title',
            'linkedin_org_description': 'Job Description',
            'linkedin_org_specialties': 'Skills',
        })
        .assign(
            # Missing values become '' first; extract_skills_from_list turns any
            # non-list into [], so those rows end up with an empty Skills string.
            Skills=lambda frame: frame['Skills'].fillna('').apply(
                lambda items: ', '.join(extract_skills_from_list(items))),
            Source='linkedin_jobs.json',
        )
    )



In [3]:
# Stack the three per-source frames into one table with a fresh 0..n-1 index
cleaned_frames = (df_jobs_cleaned, df_upwork_cleaned, df_linkedin_cleaned)
final_df = pd.concat(cleaned_frames, ignore_index=True)

final_df.head()

Unnamed: 0,Job Title,Job Description,Skills,Source
0,Need expert Virtual Assistant for account help...,We are looking for a highly experienced and de...,"eBay, Amazon",jobs.json
1,Deutsche Darsteller:innen für Business-Testimo...,Beschreibung:\nWir sind ein deutsches Start-up...,"Female, Male, Senior Adult, German, English, M...",jobs.json
2,Senior Dev - NFT Marketplace,We’re looking for a talented full stack blockc...,"Blockchain Architecture, JavaScript",jobs.json
3,Prospect List/Sourcing Specialist for High-Vol...,We are looking for an experienced Prospect Lis...,"Prospect List, Lead Generation, Market Researc...",jobs.json
4,Long-Term Web Designer (WordPress) with Strong...,We’re looking to build a long-term relationshi...,"Website Redesign, Custom Web Design, Theme Cus...",jobs.json


In [5]:
# Count the missing values in each column of the combined frame
final_df.isna().sum()

Job Title          0
Job Description    8
Skills             0
Source             0
dtype: int64

In [6]:
# Print a structural summary of the combined frame (index range, per-column
# non-null counts, dtypes, memory usage)
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Job Title        300 non-null    object
 1   Job Description  292 non-null    object
 2   Skills           300 non-null    object
 3   Source           300 non-null    object
dtypes: object(4)
memory usage: 9.5+ KB


In [7]:
# Write the combined frame to CSV; index=False keeps the row index out of the file
output_file = 'extracted_job_data.csv'
final_df.to_csv(path_or_buf=output_file, index=False)
print(f"\nFinal DataFrame saved to {output_file}")


Final DataFrame saved to extracted_job_data.csv
