In [None]:
import ast
import random
import re
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud

In [None]:
def _parse_skills(raw):
    """Convert one `cleaned_skills` CSV cell back into a Python list.

    The column is parsed with `ast.literal_eval`, which only evaluates Python
    literals. Cells that are not strings (pandas reads empty cells as NaN)
    become an empty list instead of raising "malformed node or string", so
    later joins and flattening over the column keep working.
    """
    return ast.literal_eval(raw) if isinstance(raw, str) else []


# Load the cleaned job sample and restore `cleaned_skills` from its string form.
df = pd.read_csv('data/processed/cleaned_jobs_sample.csv')
df['cleaned_skills'] = df['cleaned_skills'].apply(_parse_skills)

In [None]:
# Keywords that mark a posting as ML/AI related.
tech_keywords = ['machine learning', 'ml', 'deep learning', 'dl', 'data scientist', 'nlp', 'pytorch', 'bert', 'hugging face', 'transformers', 'scikit-learn', 'faiss']

# Match keywords as whole words (with an optional plural "s"). A plain
# substring search lets short keywords fire inside unrelated words:
# "ml" in "html"/"xml", "dl" in "handle", "bert" in "Robert".
# Keywords are regex-escaped so they are always treated literally, and the
# group is non-capturing so pandas does not warn about match groups.
tech_pattern = r'\b(?:' + '|'.join(re.escape(kw) for kw in tech_keywords) + r')s?\b'


def _mentions_tech(text_col):
    """Boolean mask: which entries of a text Series mention a tech keyword.

    Matching is case-insensitive. Missing values count as "no match"
    (`na=False`), so the mask never contains NaN, which would make boolean
    indexing fail.
    """
    return text_col.str.contains(tech_pattern, case=False, na=False, regex=True)


# Each row's skill list is joined once into a single searchable string.
skills_text = df['cleaned_skills'].apply(' '.join)

tech_mask = (
    _mentions_tech(df['job_title'])
    | _mentions_tech(df['cleaned_summary'])
    | _mentions_tech(skills_text)
)

# .copy() makes df_tech an independent frame rather than a slice of df.
df_tech = df[tech_mask].copy()
print(f"Filtered to {len(df_tech)} tech jobs")
df_tech.to_csv('data/processed/cleaned_jobs_tech.csv', index=False)

In [None]:
# Structural overview. DataFrame.info() prints its report itself and returns
# None, so wrapping it in print() would add a stray "None" line.
df_tech.info()

# Distribution of job levels.
fig, ax = plt.subplots()
sns.countplot(y='job_level', data=df_tech, ax=ax)
ax.set_title('Job Levels')
plt.show()

# Ten most frequent skills across all tech postings.
all_skills = [skill for sublist in df_tech['cleaned_skills'] for skill in sublist]
top_skills = Counter(all_skills).most_common(10)
skills_df = pd.DataFrame(top_skills, columns=['Skill', 'Count'])
fig, ax = plt.subplots()
sns.barplot(x='Count', y='Skill', data=skills_df, ax=ax)
ax.set_title('Top Skills')
plt.show()

# Location cloud sized by how many postings share each location.
# Frequencies are passed directly so multi-word locations stay intact;
# WordCloud.generate() on space-joined text would re-tokenise them into
# single words. value_counts() drops missing locations by default.
location_counts = df_tech['job_location'].value_counts().to_dict()
wordcloud = WordCloud().generate_from_frequencies(location_counts)
fig, ax = plt.subplots()
ax.imshow(wordcloud)
ax.axis('off')  # pixel coordinates carry no meaning for a word cloud
ax.set_title('Locations')
plt.show()

In [None]:
# Synthetic augmentation: sample posting attributes, turn them into prompts,
# generate a summary for each prompt, and append the result to df_tech.
N_SYNTHETIC = 5000  # number of synthetic postings to create
SYNTH_SEED = 42     # fixed seed so the sampled attributes are identical on re-run

roles = ['Machine Learning Engineer', 'Data Scientist', 'AI Researcher']
levels = ['Associate', 'Mid senior']
skills = ['pytorch nlp bert embeddings', 'hugging face transformers scikit-learn faiss', 'ethical ai bias mitigation fairlearn']
locs = ['San Francisco CA', 'New York NY', 'Remote']

# A dedicated generator keeps the sampling reproducible without touching the
# global `random` state.
# NOTE(review): generate_synthetic_summary is defined outside this cell; if it
# is itself stochastic, the summaries may still differ between runs.
rng = random.Random(SYNTH_SEED)

# Keep the sampled attributes instead of parsing them back out of the prompt.
# The previous string-splitting gave titles like "Data Scientist job",
# truncated "Mid senior" to "senior", and assigned skills[0] to every row
# regardless of the skill set named in its prompt.
samples = [
    (rng.choice(roles), rng.choice(levels), rng.choice(skills), rng.choice(locs))
    for _ in range(N_SYNTHETIC)
]
prompts = [
    f"{role} job: {level} level requiring {skill_text} for AI projects in {loc}."
    for role, level, skill_text, loc in samples
]
synthetic_summaries = [generate_synthetic_summary(p) for p in prompts]

df_aug = pd.DataFrame({
    'job_link': [f'synthetic_{i}' for i in range(N_SYNTHETIC)],
    'job_title': [role for role, _, _, _ in samples],
    'company': ['Synthetic Tech Co'] * N_SYNTHETIC,
    'job_location': [loc for _, _, _, loc in samples],
    'job_level': [level for _, level, _, _ in samples],
    'job_type': ['Full-time'] * N_SYNTHETIC,
    'cleaned_summary': synthetic_summaries,
    # List format, built from the skill set actually used in the row's prompt.
    # NOTE(review): whitespace splitting turns multi-word skills such as
    # "hugging face" into separate tokens; confirm this matches how the real
    # cleaned_skills lists are tokenised.
    'cleaned_skills': [skill_text.split() for _, _, skill_text, _ in samples],
})
df_tech_aug = pd.concat([df_tech, df_aug], ignore_index=True)
df_tech_aug.to_csv('data/processed/cleaned_jobs_tech_aug.csv', index=False)

In [None]:
# EDA on df_tech_aug (real tech postings plus the synthetic ones)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line.
df_tech_aug.info()

# Distribution of job levels after augmentation.
fig, ax = plt.subplots()
sns.countplot(y='job_level', data=df_tech_aug, ax=ax)
ax.set_title('Job Levels (Aug)')
plt.show()

# Ten most frequent skills across the augmented postings.
all_skills = [skill for sublist in df_tech_aug['cleaned_skills'] for skill in sublist]
top_skills = Counter(all_skills).most_common(10)
skills_df = pd.DataFrame(top_skills, columns=['Skill', 'Count'])
fig, ax = plt.subplots()
sns.barplot(x='Count', y='Skill', data=skills_df, ax=ax)
ax.set_title('Top Skills (Aug)')
plt.show()

# Location cloud sized by how many postings share each location.
# Frequencies are passed directly so multi-word locations stay intact;
# WordCloud.generate() on space-joined text would re-tokenise them into
# single words. value_counts() drops missing locations by default.
location_counts = df_tech_aug['job_location'].value_counts().to_dict()
wordcloud = WordCloud().generate_from_frequencies(location_counts)
fig, ax = plt.subplots()
ax.imshow(wordcloud)
ax.axis('off')  # pixel coordinates carry no meaning for a word cloud
ax.set_title('Locations (Aug)')
plt.show()