In [9]:
import pandas as pd

# Fields carried forward from the merged postings file. Selecting with this
# list raises KeyError if the CSV lacks any of them, and the order given here
# becomes the column order of `df`.
columns_to_keep = [
    "company_name",
    "title",
    "description",
    "max_salary",
    "pay_period",
    "location",
    "med_salary",
    "min_salary",
    "formatted_work_type",
    "remote_allowed",
    "application_type",
    "expiry",
    "formatted_experience_level",
    "skills_desc",
    "posting_domain",
    "sponsored",
    "work_type",
    "currency",
    "compensation_type",
    "normalized_salary",
    "zip_code",
    "industry_name",
    "skill_name",
    "type",
    "industry",
    "speciality",
]

# Read the CSV (path is relative to the working directory) and immediately
# narrow it to the required columns.
df = pd.read_csv("merged_postings.csv")[columns_to_keep]

In [10]:
def _salary_token(value, prefix):
    """Return "<prefix>_<int(value)>", or "" when the value is missing.

    int() truncates any fractional part of the salary.
    """
    return "" if pd.isna(value) else f"{prefix}_{int(value)}"


def _flag_token(value, yes, no):
    """Return `yes` when `value` is present and truthy, otherwise `no`.

    Missing values (NaN / None / pd.NA) count as "not set". NaN is truthy in
    Python, so a bare `if value` would label missing flags as set; missingness
    is therefore checked before truthiness.
    """
    return yes if pd.notna(value) and bool(value) else no


# NOTE: this cell overwrites the columns in place. Re-running it without first
# reloading `df` fails, because int() cannot parse the already-converted
# strings (e.g. "min_100" or "").

# Convert salary fields to prefixed tokens such as "min_50000"
for prefix in ("min", "max", "med"):
    column = f"{prefix}_salary"
    df[column] = df[column].apply(_salary_token, args=(prefix,))

# Convert boolean fields to text labels (missing -> "remote_no" / "")
df["remote_allowed"] = df["remote_allowed"].apply(_flag_token, args=("remote_yes", "remote_no"))
df["sponsored"] = df["sponsored"].apply(_flag_token, args=("sponsored_yes", ""))

In [11]:
import re

def format_row(row):
    """Render one posting as lower-cased, cleaned "column value" lines.

    Walks `columns_to_keep` in order, skipping fields that are missing or the
    empty string, and emits one "col: value" line per remaining field. The
    joined text is lower-cased and every character other than ASCII letters,
    digits, underscore, hyphen, space and newline is removed (so the ":"
    separator itself is stripped as well).
    """
    lines = []
    for col in columns_to_keep:
        value = row[col]
        # Skip absent fields so they do not produce empty "col:" lines
        if pd.isna(value) or value == "":
            continue
        lines.append(f"{col}: {value}")

    lowered = "\n".join(lines).lower()
    # Whitelist filter: keep letters, numbers, _, -, spaces and newlines only
    return re.sub(r"[^a-zA-Z0-9_\- \n]", "", lowered)

# Render every posting to its cleaned text form, one row at a time
df["formatted_text"] = df.apply(format_row, axis="columns")

# Single-column frame containing only the cleaned postings, renamed for export
formatted_df = df[["formatted_text"]].rename(columns={"formatted_text": "job_posting"})

In [None]:
# Persist the cleaned postings (index column omitted from the file)
formatted_df.to_csv("formatted_postings.csv", index=False)

# Preview: the first 5 formatted postings, separated by a blank line
preview = formatted_df["job_posting"].head().tolist()
print(*preview, sep="\n\n")

Make the data tech-specific