# Job Posting Analysis: Detecting Fraudulent Listings through POS Tagging

## Introduction
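This notebook analyzes job postings to detect potentially fraudulent listings by comparing Part-of-Speech (POS) tag distributions across three sets of job descriptions: real postings, fake postings, and AI-generated fake postings.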

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import spacy
from collections import Counter
import matplotlib.pyplot as plt
from google.colab import files

## Data Loading
Upload and load the three CSV datasets containing job postings: `real_jobs.csv`, `fake_jobs.csv`, and `llm_refined_fake_posts2.csv`.

In [None]:
# Call google.colab's files.upload() and keep its return value in `uploaded`.
# The loading cell that follows reads real_jobs.csv, fake_jobs.csv and
# llm_refined_fake_posts2.csv by relative path, so those are the files to
# provide here.
uploaded = files.upload()

In [None]:
# Read the three CSV files into DataFrames. A failure here must stop the run:
# if the error were only printed, the DataFrames would stay undefined and the
# analysis cell would fail later with a misleading NameError.
try:
    real_jobs_df = pd.read_csv("real_jobs.csv")
    fake_jobs_df = pd.read_csv("fake_jobs.csv")
    llm_refined_df = pd.read_csv("llm_refined_fake_posts2.csv")
except Exception as e:
    print(f"Error loading data: {e}")
    raise

# The analysis reads the "description" column of every dataset, so verify it
# exists now and report all offending files at once.
_missing_description = [
    name
    for name, frame in {
        "real_jobs.csv": real_jobs_df,
        "fake_jobs.csv": fake_jobs_df,
        "llm_refined_fake_posts2.csv": llm_refined_df,
    }.items()
    if "description" not in frame.columns
]
if _missing_description:
    raise KeyError(
        f"Missing 'description' column in: {', '.join(_missing_description)}"
    )

print("Data loaded successfully:")
print(f"- Real jobs: {len(real_jobs_df)} records")
print(f"- Fake jobs: {len(fake_jobs_df)} records")
print(f"- AI-generated fake jobs: {len(llm_refined_df)} records")

print("\nPreview of real jobs data:")
print(real_jobs_df.head())

## POS Tagging Analysis

### POS Tag Counting Function
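The helpers below do two things: `get_pos_counts` tallies POS tags across a collection of texts using spaCy (punctuation and whitespace tokens are excluded), and `normalize` converts those counts into relative frequencies.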

In [None]:
def _default_nlp():
    """Return the notebook-level ``nlp`` pipeline, loading it on first use."""
    global nlp
    try:
        return nlp
    except NameError:
        # `nlp` may not have been created by an earlier cell; load the model
        # installed at the top of the notebook once and cache it globally so
        # repeated calls do not reload it.
        nlp = spacy.load("en_core_web_sm")
        return nlp


def get_pos_counts(texts, nlp_pipeline=None):
    """Count POS tags in a list of texts using spaCy.

    Args:
        texts: Iterable of raw texts. Entries for which ``pd.notna`` is false
            (None, NaN) are skipped; every remaining entry is converted with
            ``str()`` before tagging.
        nlp_pipeline: Optional object exposing a spaCy-style
            ``pipe(texts, disable=..., batch_size=...)`` method. When omitted,
            the notebook-level ``nlp`` is used, and ``en_core_web_sm`` is
            loaded on demand if ``nlp`` does not exist yet.

    Returns:
        collections.Counter mapping each coarse POS label (``token.pos_``) to
        the number of tokens carrying it. Punctuation and whitespace tokens
        are not counted.
    """
    if nlp_pipeline is None:
        nlp_pipeline = _default_nlp()

    pos_counter = Counter()
    valid_texts = [str(text) for text in texts if pd.notna(text)]

    # Only POS tags are needed, so NER and the dependency parser are disabled.
    for doc in nlp_pipeline.pipe(valid_texts, disable=["ner", "parser"], batch_size=50):
        pos_counter.update(
            token.pos_ for token in doc if not token.is_punct and not token.is_space
        )
    return pos_counter


def normalize(counter):
    """Turn raw counts into relative frequencies.

    Args:
        counter: Mapping of label -> count (e.g. a ``collections.Counter``).

    Returns:
        A plain dict mapping each label to ``count / sum(all counts)``.
        An empty mapping yields an empty dict.
    """
    grand_total = sum(counter.values())
    frequencies = {}
    for label, occurrences in counter.items():
        frequencies[label] = occurrences / grand_total
    return frequencies

### Extract and Analyze Descriptions
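Extract the job descriptions from each dataset, count and normalize their POS tags, and collect the resulting frequencies into a single comparison table.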

In [None]:
# Pull the non-null description texts out of each dataset (real, fake, AI-fake)
real_texts, fake_texts, ai_texts = (
    frame["description"].dropna().tolist()
    for frame in (real_jobs_df, fake_jobs_df, llm_refined_df)
)

# Raw POS tag tallies per dataset
real_pos_counts, fake_pos_counts, ai_pos_counts = map(
    get_pos_counts, (real_texts, fake_texts, ai_texts)
)

# Relative frequencies, so datasets of different sizes are comparable
real_pos_norm, fake_pos_norm, ai_pos_norm = map(
    normalize, (real_pos_counts, fake_pos_counts, ai_pos_counts)
)

# One row per job type, one column per POS tag; tags absent from a dataset become 0
pos_data = dict(zip(("Real", "Fake", "AI-Fake"), (real_pos_norm, fake_pos_norm, ai_pos_norm)))
df_pos = pd.DataFrame(pos_data).fillna(0).T

print("\nPOS Tag Frequencies:")
print(df_pos)

## Visualization
Create a visual comparison of the POS tag distributions: a grouped bar chart with one group per job type and one bar per POS tag.

In [None]:
def plot_pos_comparison(pos_df, output_path="pos_tag_comparison.png"):
    """Plot normalized POS tag distributions for comparison.

    Draws a grouped (non-stacked) bar chart with one group per row of
    ``pos_df`` and one bar per column, saves it to ``output_path`` and shows it.

    Args:
        pos_df: DataFrame whose rows are plotted along the x-axis (labelled
            "Job Type") and whose columns become the legend entries
            (titled "POS Tags").
        output_path: File the figure is written to. Defaults to the original
            hard-coded name, ``"pos_tag_comparison.png"``.

    Returns:
        None.
    """
    # Create the axes explicitly and hand them to pandas. Without `ax=`,
    # DataFrame.plot opens its own figure, which leaves a blank 12x6 figure
    # behind and draws the chart at the default size.
    fig, ax = plt.subplots(figsize=(12, 6))
    pos_df.plot(kind="bar", stacked=False, ax=ax, rot=0)
    ax.set_title("POS Tag Distribution Comparison Across Job Types")
    ax.set_xlabel("Job Type")
    ax.set_ylabel("Normalized Frequency")
    # Anchor the legend outside the axes; bbox_inches="tight" below keeps it
    # inside the saved image.
    ax.legend(title="POS Tags", bbox_to_anchor=(1.05, 1))
    fig.tight_layout()
    fig.savefig(output_path, bbox_inches="tight")
    plt.show()


# Generate visualization: plot the comparison table built earlier
# (df_pos: one row per job type, one column per POS tag).
plot_pos_comparison(df_pos)

In [None]:
# Save final results: write the normalized POS-frequency table (df_pos) to a
# CSV file via a relative path, then print a confirmation message.
df_pos.to_csv("pos_tag_results.csv")
print("Analysis complete. Results saved to pos_tag_results.csv")