In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import joblib

# 1. Load dataset
df = pd.read_csv("fake_job_postings.csv")

# 2. Add LinkedIn-style synthetic features
df['followers'] = np.where(df['fraudulent']==1,
                           np.random.randint(10, 500, len(df)),
                           np.random.randint(1000, 1000000, len(df)))
df['employees'] = np.where(df['fraudulent']==1,
                           np.random.randint(1, 20, len(df)),
                           np.random.randint(50, 10000, len(df)))
df['engagement'] = np.where(df['fraudulent']==1,
                            np.random.randint(0, 10, len(df)),
                            np.random.randint(20, 500, len(df)))

# 3. Prepare text column
df['text'] = df[['title','description','requirements','company_profile']].fillna('').agg(' '.join, axis=1)

# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['fraudulent'], test_size=0.2, random_state=42)

# 5. Build model pipeline
model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(max_iter=1000))
])

# 6. Train
model.fit(X_train, y_train)

# 7. Evaluate
print("✅ Accuracy:", model.score(X_test, y_test))

# 8. Save model (optional)
joblib.dump(model, "scam_detector.pkl")

# 9. Example prediction
sample = ["Earn $500/day working from home. No experience required. Contact us on Gmail."]
print("Prediction:", model.predict(sample)[0])  # 1=scam, 0=legit
print("Probabilities:", model.predict_proba(sample))


✅ Accuracy: 0.9706375838926175
Prediction: 1
Probabilities: [[0.29596364 0.70403636]]


In [2]:
# ===============================
# 1. Imports
# ===============================
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import joblib

# ===============================
# 2. Load dataset
# ===============================
df = pd.read_csv("fake_job_postings.csv")

# ===============================
# 3. Add LinkedIn-style synthetic features
# ===============================
df['followers'] = np.where(df['fraudulent']==1,
                           np.random.randint(10, 500, len(df)),
                           np.random.randint(1000, 1000000, len(df)))
df['employees'] = np.where(df['fraudulent']==1,
                           np.random.randint(1, 20, len(df)),
                           np.random.randint(50, 10000, len(df)))
df['engagement'] = np.where(df['fraudulent']==1,
                            np.random.randint(0, 10, len(df)),
                            np.random.randint(20, 500, len(df)))

# ===============================
# 4. Prepare text column
# ===============================
df['text'] = df[['title','description','requirements','company_profile']].fillna('').agg(' '.join, axis=1)

# ===============================
# 5. Train/test split
# ===============================
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['fraudulent'], test_size=0.2, random_state=42)

# ===============================
# 6. Build model pipeline
# ===============================
model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(max_iter=1000))
])

# ===============================
# 7. Train model
# ===============================
model.fit(X_train, y_train)

# ===============================
# 8. Evaluate
# ===============================
print("✅ Accuracy:", model.score(X_test, y_test))

# ===============================
# 9. Save model
# ===============================
joblib.dump(model, "scam_detector.pkl")
print("📦 Model saved as scam_detector.pkl")

# ===============================
# 10. Explainability Layer
# ===============================
SCAM_KEYWORDS = ["work from home", "no experience", "quick money", 
                 "earn $", "gmail.com", "telegram", "whatsapp", 
                 "fast cash", "limited slots", "easy income"]

def predict_job(title, description, requirements, company_profile, followers, employees, engagement):
    """
    Predicts whether a job posting is scam or legit.
    
    Inputs:
        title, description, requirements, company_profile : str
        followers, employees, engagement : int
    
    Returns:
        dict with prediction, confidence, scam keywords triggered, weak company flag
    """
    # Combine text fields
    text = " ".join([str(title), str(description), str(requirements), str(company_profile)])
    
    # Model prediction
    pred = model.predict([text])[0]
    prob = model.predict_proba([text])[0][pred]
    
    # Keyword explainability
    found_keywords = [kw for kw in SCAM_KEYWORDS if re.search(kw, text.lower())]
    
    # Company credibility check
    weak_company = (followers < 500) or (employees < 10) or (engagement < 5)
    
    explanation = {
        "prediction": "scam" if pred == 1 else "legit",
        "confidence": round(float(prob), 2),
        "keywords_triggered": found_keywords,
        "weak_company": weak_company
    }
    
    return explanation

# ===============================
# 11. Test with Scam Example
# ===============================
sample1 = predict_job(
    title="Data Entry Clerk",
    description="Earn $500/day working from home. No experience required. Apply via Gmail.",
    requirements="Basic typing skills",
    company_profile="Small startup",
    followers=120,
    employees=3,
    engagement=1
)
print("\n🚨 Scam Example Test:")
print(sample1)

# ===============================
# 12. Test with Legit Example
# ===============================
sample2 = predict_job(
    title="Software Engineer",
    description="We are seeking an experienced software engineer to join our team and work on scalable backend systems.",
    requirements="3+ years of experience, knowledge of Python/Java",
    company_profile="Reputed multinational",
    followers=450000,
    employees=1200,
    engagement=200
)
print("\n✅ Legit Example Test:")
print(sample2)


✅ Accuracy: 0.9706375838926175
📦 Model saved as scam_detector.pkl

🚨 Scam Example Test:
{'prediction': 'scam', 'confidence': 0.91, 'keywords_triggered': ['no experience'], 'weak_company': True}

✅ Legit Example Test:
{'prediction': 'legit', 'confidence': 0.97, 'keywords_triggered': [], 'weak_company': False}
