In [None]:
# Colab-only step: bring the dataset file into the runtime so that later cells
# can read 'fake_job_postings.csv' from the working directory.
from google.colab import files
# The return value is kept in `uploaded`; the cells shown here do not use it.
uploaded = files.upload()


Saving fake_job_postings.csv to fake_job_postings.csv


In [None]:
# Day 5: Logistic Regression Model for Fake Job Detection

import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the raw postings. Rows without a description are dropped because the
# text vectorizer below needs a string for every row. The raw 'description'
# column is what gets modelled in this cell.
df = pd.read_csv('fake_job_postings.csv')
df = df.dropna(subset=['description'])

X_text = df['description']
y = df['fraudulent']

# 1. Split the raw text BEFORE extracting features.
#    Fitting TF-IDF on the full corpus would let the vocabulary and IDF
#    weights see the test documents (data leakage), so the held-out score
#    would no longer reflect performance on genuinely unseen postings.
#    `stratify=y` keeps the fraud ratio the same in both partitions.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

# 2. TF-IDF features: learn the vocabulary/IDF from the training text only,
#    then apply that fitted transform to the test text.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# 3. Train the logistic regression model.
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# 4. Predict on the held-out split.
y_pred = model.predict(X_test)

# 5. Evaluate performance.
#    NOTE(review): accuracy alone can look good when one class dominates;
#    read the per-class recall in the report before drawing conclusions.
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 6. Sanity-check the fitted pipeline on two hand-written postings.
#    New text must go through the SAME fitted vectorizer (transform, not fit).
test_samples = [
    "Work from home! Limited vacancies. Apply now.",
    "We are hiring a data scientist for our Bangalore office."
]
sample_features = vectorizer.transform(test_samples)
print("\nSample Predictions:", model.predict(sample_features))



Accuracy: 0.9658836689038032

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      3403
           1       0.98      0.30      0.46       173

    accuracy                           0.97      3576
   macro avg       0.97      0.65      0.72      3576
weighted avg       0.97      0.97      0.96      3576


Confusion Matrix:
 [[3402    1]
 [ 121   52]]

Sample Predictions: [0 0]


  Task 1: Baseline Model Evaluation

In [None]:
# --- Task 1: Baseline Model Evaluation ---

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def fit_and_report(train_matrix, test_matrix, train_labels, test_labels, title):
    """Train a logistic regression on vectorized text and report test metrics.

    Parameters
    ----------
    train_matrix, test_matrix
        Feature matrices produced by an already-fitted text vectorizer.
    train_labels, test_labels
        Target labels aligned with the rows of the two matrices.
    title : str
        Heading printed above the classification report.

    Returns
    -------
    tuple
        ``(fitted_model, test_predictions, metrics)`` where ``metrics`` is a
        dict with the keys "Accuracy", "Precision", "Recall" and "F1".
    """
    classifier = LogisticRegression(max_iter=200)
    classifier.fit(train_matrix, train_labels)
    predictions = classifier.predict(test_matrix)

    print(title)
    print(classification_report(test_labels, predictions, digits=3))

    metrics = {
        "Accuracy": accuracy_score(test_labels, predictions),
        "Precision": precision_score(test_labels, predictions),
        "Recall": recall_score(test_labels, predictions),
        "F1": f1_score(test_labels, predictions)
    }
    return classifier, predictions, metrics


# Load dataset; rows without a description cannot be vectorized, so drop them.
df = pd.read_csv('fake_job_postings.csv')
df = df.dropna(subset=['description'])

# Define features (raw text) and labels.
X_text = df['description']
y = df['fraudulent']

# Split the raw text so each vectorizer below is fitted on training rows only.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

# --- 1. Bag of Words model (CountVectorizer) ---
bow_vectorizer = CountVectorizer(max_features=5000)
X_train_bow = bow_vectorizer.fit_transform(X_train_text)
X_test_bow = bow_vectorizer.transform(X_test_text)

bow_model, y_pred_bow, bow_results = fit_and_report(
    X_train_bow, X_test_bow, y_train, y_test,
    "üîπ Logistic Regression (Bag of Words)"
)

# --- 2. TF-IDF model ---
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

tfidf_model, y_pred_tfidf, tfidf_results = fit_and_report(
    X_train_tfidf, X_test_tfidf, y_train, y_test,
    "\nüîπ Logistic Regression (TF-IDF)"
)

# Compare results side by side: one row per feature representation.
results_df = pd.DataFrame([bow_results, tfidf_results], index=["BoW", "TF-IDF"])
print("\nüìä Comparison Results:\n")
print(results_df)


üîπ Logistic Regression (Bag of Words)
              precision    recall  f1-score   support

           0      0.987     0.994     0.990      3403
           1      0.852     0.734     0.789       173

    accuracy                          0.981      3576
   macro avg      0.919     0.864     0.889      3576
weighted avg      0.980     0.981     0.980      3576


üîπ Logistic Regression (TF-IDF)
              precision    recall  f1-score   support

           0      0.966     1.000     0.982      3403
           1      0.981     0.301     0.460       173

    accuracy                          0.966      3576
   macro avg      0.973     0.650     0.721      3576
weighted avg      0.966     0.966     0.957      3576


üìä Comparison Results:

        Accuracy  Precision    Recall        F1
BoW     0.980984   0.852349  0.734104  0.788820
TF-IDF  0.965884   0.981132  0.300578  0.460177


In [None]:
# --- Task 2: Model Analysis ---

# Score every posting with the TF-IDF model; column 1 of predict_proba is
# taken as P(y=1), i.e. the probability of the "fake" class.
all_tfidf = tfidf_vectorizer.transform(df['description'])
df['predicted_proba'] = tfidf_model.predict_proba(all_tfidf)[:, 1]

# Rank postings from most to least suspicious and keep the five highest.
ranked = df.sort_values('predicted_proba', ascending=False)
top_suspicious = ranked[['description', 'predicted_proba']].head(5)
print("\nüö® Top 5 Most Suspicious Job Posts:\n")
print(top_suspicious)

# Optional: show the first 400 characters of each flagged description.
for proba, text in zip(top_suspicious['predicted_proba'], top_suspicious['description']):
    print(f"\n‚ö†Ô∏è Fake Probability: {proba:.3f}")
    print(f"Description: {text[:400]}...")



üö® Top 5 Most Suspicious Job Posts:

                                             description  predicted_proba
17563  URGENT Job Full Time &amp; Part Time, Cash Pay...         0.862271
8393   Weekly CASH Jobs, Hiring Part Time Workers.You...         0.861783
1857   Corporate overviewAker Solutions is a global p...         0.855028
17677  Part Time &amp; Full Time Jobs, Cash In Hands....         0.854804
17584  CASH Pay Jobs (Part Time Staff Wanted)You can ...         0.852621

‚ö†Ô∏è Fake Probability: 0.862
Description: URGENT Job Full Time &amp; Part Time, Cash Pay.You can do it all from home, in your free time, at your own place.Spend 30 minutes or 1 hours a day &amp; Get biggest cash.You can work in the morning, afternoon, or at night.Perfect for everyone then start immediately.Can earn $400 to $450 extra per day.No any experience required.Zero start-up fee, Visit here:-#URL_7ebe37f71633be1b80547d6f213cb0075a6...

‚ö†Ô∏è Fake Probability: 0.862
Description: Weekly CASH Jobs, Hiri

In [None]:
# --- Task 3: Effect of max_features in TF-IDF ---

from sklearn.metrics import accuracy_score

# Vocabulary caps to compare; each one gets its own vectorizer and model.
feature_sizes = [1000, 5000, 10000]
results = []

for n_features in feature_sizes:
    # Fit the vocabulary on the training text only, then reuse it for the test text.
    vectorizer = TfidfVectorizer(max_features=n_features)
    train_matrix = vectorizer.fit_transform(X_train_text)
    test_matrix = vectorizer.transform(X_test_text)

    # Same classifier settings for every cap so only the feature count varies.
    model = LogisticRegression(max_iter=200).fit(train_matrix, y_train)
    y_pred = model.predict(test_matrix)

    results.append(
        {"max_features": n_features, "Accuracy": accuracy_score(y_test, y_pred)}
    )

# One row per cap, in the order the caps were tried.
acc_df = pd.DataFrame(results)
print("\nüìà TF-IDF Feature Size vs Accuracy:\n")
print(acc_df)



üìà TF-IDF Feature Size vs Accuracy:

   max_features  Accuracy
0          1000  0.963926
1          5000  0.965884
2         10000  0.966163
