# In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from collections import Counter
import re

# Set Seaborn theme: white-grid styling for the figures drawn below
sns.set(style="whitegrid")

# Load dataset from the Kaggle input directory
df = pd.read_csv("/kaggle/input/1000-ml-jobs-us/1000_ml_jobs_us.csv")

# --- Basic Data Overview ---
# Print the frame's dimensions, per-column null counts and dtypes, each
# preceded by its heading, then render the first rows.
for heading, summary in (
    ("Shape of data:", df.shape),
    ("Missing values:\n", df.isnull().sum()),
    ("\nData Types:\n", df.dtypes),
):
    print(heading, summary)
# NOTE(review): `display` is not imported in this file — presumably the
# IPython/notebook builtin; confirm before running as a plain script.
display(df.head())

# --- Convert posting date to datetime ---
# errors='coerce' turns values that cannot be parsed into NaT instead of raising
posted = pd.to_datetime(df["job_posted_date"], errors='coerce')
df["job_posted_date"] = posted

# --- Feature Engineering: Extract Year, Month, Day, Weekday ---
# All four columns derive from the parsed timestamp; month and weekday are
# stored as names (matching month_order / weekday_order used for plotting),
# year and day as numeric calendar components.
df["year"] = posted.dt.year
df["month"] = posted.dt.month_name()
df["day"] = posted.dt.day
df["weekday"] = posted.dt.day_name()

# --- Job Postings per Month ---
# Calendar ordering for the month categories (also reused by the heatmap)
month_order = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

# One bar per month name, drawn on an explicitly created Axes
fig_month, ax_month = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="month", order=month_order, palette="viridis", ax=ax_month)
# Tilt the month names so neighbouring labels do not collide
plt.setp(ax_month.get_xticklabels(), rotation=45)
ax_month.set_title("Job Postings per Month")
ax_month.set_xlabel("Month")
ax_month.set_ylabel("Number of Jobs")
fig_month.tight_layout()
plt.show()

# --- Job Postings per Weekday ---
# Monday-first ordering for the weekday categories (also reused by the heatmap)
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]

# One bar per weekday name, drawn on an explicitly created Axes
fig_weekday, ax_weekday = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x="weekday", order=weekday_order, palette="coolwarm", ax=ax_weekday)
ax_weekday.set_title("Job Postings by Day of the Week")
ax_weekday.set_xlabel("Weekday")
ax_weekday.set_ylabel("Number of Jobs")
fig_weekday.tight_layout()
plt.show()

# --- Job Postings per Day of Month ---
# Day-of-month is an integer in 1..31, so draw one unit-wide bar centred on
# each day (discrete=True) across the full calendar range (binrange).
# Splitting the observed min..max span into 31 equal bins, as bins=31 did,
# gives fractional-width bars that drift off the integer ticks and misalign
# further whenever the data does not cover exactly days 1 through 31.
plt.figure(figsize=(12, 6))
sns.histplot(data=df, x="day", discrete=True, binrange=(1, 31), color="steelblue")
plt.title("Job Postings per Day of Month")
plt.xlabel("Day of Month")
plt.ylabel("Number of Jobs")
plt.tight_layout()
plt.show()

# --- Heatmap: Jobs by Month and Weekday ---
# Count postings for every (month, weekday) pair present in the data.
heatmap_data = df.groupby(["month", "weekday"]).size().reset_index(name='count')
# Reshape to weekdays-as-rows / months-as-columns. pivot() must be called with
# keyword arguments: the positional form raises TypeError on pandas >= 2.0.
# reindex then puts rows and columns in calendar order; combinations that
# never occur stay NaN.
pivot_table = (
    heatmap_data
    .pivot(index="weekday", columns="month", values="count")
    .reindex(index=weekday_order, columns=month_order)
)

plt.figure(figsize=(12, 6))
# fmt='g' prints the counts without trailing decimals
sns.heatmap(pivot_table, annot=True, fmt='g', cmap="YlGnBu")
plt.title("Job Postings by Weekday and Month")
plt.tight_layout()
plt.show()

# --- Top Locations, Companies, Titles ---
# value_counts() orders by descending frequency; keep the first ten of each.
top_locations = df["company_address_locality"].value_counts().iloc[:10]
top_companies = df["company_name"].value_counts().iloc[:10]
top_titles = df["job_title"].value_counts().iloc[:10]

# Print each ranking under its heading
for heading, ranking in (
    ("Top Job Locations:\n", top_locations),
    ("\nTop Hiring Companies:\n", top_companies),
    ("\nTop Job Titles:\n", top_titles),
):
    print(heading, ranking)

# --- Top Words in Job Titles ---
# Concatenate every non-null title into one string, lower-case it, and split
# it into word tokens before counting.
titles = df['job_title'].dropna().astype(str)
text = ' '.join(titles)
words = re.findall(r'\b\w+\b', text.lower())
word_freq = Counter(words)
# Twenty most common tokens as a two-column frame for plotting
top_words = pd.DataFrame(word_freq.most_common(20), columns=['Word', 'Frequency'])

# Bar Plot: Word Frequency (horizontal bars, one per word)
fig_words, ax_words = plt.subplots(figsize=(12, 6))
sns.barplot(x="Frequency", y="Word", data=top_words, palette="mako", ax=ax_words)
ax_words.set_title("Top 20 Frequent Words in Job Titles")
fig_words.tight_layout()
plt.show()

# --- Seniority Level Distribution ---
# Drawn only when the dataset has a seniority_level column.
if 'seniority_level' in df.columns:
    # Bars ordered by descending frequency of each level
    level_order = df["seniority_level"].value_counts().index
    fig_seniority, ax_seniority = plt.subplots(figsize=(8, 4))
    sns.countplot(
        data=df,
        y="seniority_level",
        order=level_order,
        palette="Set2",
        ax=ax_seniority,
    )
    ax_seniority.set_title("Seniority Level Distribution")
    ax_seniority.set_xlabel("Count")
    ax_seniority.set_ylabel("Seniority Level")
    fig_seniority.tight_layout()
    plt.show()

# --- Trend Over Time ---
# Bucket each posting into its calendar month, count postings per bucket and
# order the buckets chronologically.
posting_months = df['job_posted_date'].dt.to_period('M')
monthly_trend = posting_months.value_counts().sort_index()
# Period labels become plain strings so they can be used as axis categories
monthly_trend.index = monthly_trend.index.astype(str)

fig_trend, ax_trend = plt.subplots(figsize=(14, 6))
sns.lineplot(
    x=monthly_trend.index,
    y=monthly_trend.values,
    marker='o',
    color='teal',
    ax=ax_trend,
)
# Tilt the month labels so neighbouring ones do not collide
plt.setp(ax_trend.get_xticklabels(), rotation=45)
ax_trend.set_title("Job Posting Trend Over Time")
ax_trend.set_xlabel("Month")
ax_trend.set_ylabel("Number of Jobs")
fig_trend.tight_layout()
plt.show()

# --- Optional: Interactive Plot with Plotly ---
# Same monthly counts as the static trend chart, rendered as an interactive
# line; the 'x' / 'y' keys rename the default axis labels.
trend_months = monthly_trend.index
trend_counts = monthly_trend.values
fig = px.line(
    x=trend_months,
    y=trend_counts,
    labels={'x': 'Month', 'y': 'Jobs'},
    title='Interactive Monthly Job Trend',
)
# Draw a marker on every data point in addition to the connecting line
fig.update_traces(mode="lines+markers")
fig.show()
