In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import BertModel, BertTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler
import time
import joblib  # Import joblib for saving models
from google.colab import files

# Function to process data in batches
def extract_features_in_batches(descriptions, tokenizer, model, device, batch_size=32, max_len=128):
    """Embed ``descriptions`` one slice at a time and gather the results.

    Args:
        descriptions: Sliceable, sized collection of texts; each slice is
            handed unchanged to ``extract_bert_features``.
        tokenizer: Tokenizer forwarded to ``extract_bert_features``.
        model: Model forwarded to ``extract_bert_features``.
        device: Device forwarded to ``extract_bert_features``.
        batch_size: Number of texts per slice.
        max_len: Maximum token length forwarded to ``extract_bert_features``.

    Returns:
        A list holding every item yielded by the per-slice results, in
        input order. Empty input gives an empty list.
    """
    whole_batches, leftover = divmod(len(descriptions), batch_size)
    # A trailing partial slice still needs its own pass.
    batch_count = whole_batches + 1 if leftover > 0 else whole_batches

    collected = []
    for start in (index * batch_size for index in range(batch_count)):
        chunk = descriptions[start: start + batch_size]
        collected.extend(extract_bert_features(chunk, tokenizer, model, device, max_len))

    return collected

def extract_bert_features(texts, tokenizer, model, device, max_len=128):
    """Return the first-token ([CLS]) hidden state of each text.

    Args:
        texts: The texts to embed. May be a NumPy array or pandas Series
            (anything exposing ``tolist()``), a list/tuple of strings, or a
            single string (treated as a one-element batch).
        tokenizer: Callable tokenizer; invoked with the list of texts plus
            ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors='pt'``. Its result must support ``.to(device)``
            and ``**`` unpacking.
        model: Callable model whose output has a ``last_hidden_state``
            attribute indexed as ``[:, 0, :]``.
        device: Device the tokenized batch is moved to before the forward pass.
        max_len: Maximum token length; longer inputs are truncated.

    Returns:
        A 2-D NumPy array with one row per input text.
    """
    # The tokenizer is given a plain list. Arrays/Series convert through
    # tolist(); built-in sequences do not have that method, and a bare str
    # must be wrapped rather than split into characters.
    if isinstance(texts, str):
        text_list = [texts]
    elif hasattr(texts, "tolist"):
        text_list = texts.tolist()
    else:
        text_list = list(texts)

    tokens = tokenizer(
        text_list,
        max_length=max_len,
        padding=True,
        truncation=True,
        return_tensors='pt'
    ).to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**tokens)

    # Use [CLS] token hidden state as the sentence embedding; move it back
    # to host memory before converting to NumPy.
    cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    return cls_embeddings

# Load your dataset from runtime
print("Loading dataset...")
start_time = time.time()
df = pd.read_csv('/content/job_train.csv')  # Ensure the file is in the Colab runtime
print(f"Dataset loaded in {time.time() - start_time:.2f} seconds")

# Check for missing values in 'description' column
print("Checking for missing values...")
print(df['description'].isnull().sum())

# Handle missing values in 'description' column by filling NaN with 'missing'.
# Assign the result back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas 2.x warns about and
# which does not update the DataFrame under Copy-on-Write.
df['description'] = df['description'].fillna('missing')

# Split dataset into training and test sets (5% for test set)
# NOTE(review): the split is not stratified on 'fraudulent'; consider
# stratify= if the class balance of the small test set matters.
print("Splitting dataset...")
start_time = time.time()
train_df, test_df = train_test_split(df, test_size=0.05, random_state=42)
print(f"Dataset split completed in {time.time() - start_time:.2f} seconds")

# Initialize the BertTokenizer and BERT model
print("Initializing BERT tokenizer and model...")
start_time = time.time()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.eval()  # Set BERT to evaluation mode
# Use a GPU when the runtime provides one; otherwise fall back to the CPU.
# The forward pass over every description dominates the run time.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model = bert_model.to(device)
print(f"Tokenizer and model initialization completed in {time.time() - start_time:.2f} seconds")

# Extract BERT features in batches (train and test are embedded separately;
# the reported time covers both passes)
print("Extracting BERT features...")
start_time = time.time()
train_bert_features = extract_features_in_batches(train_df['description'].to_numpy(), tokenizer, bert_model, device)
test_bert_features = extract_features_in_batches(test_df['description'].to_numpy(), tokenizer, bert_model, device)
print(f"BERT feature extraction completed in {time.time() - start_time:.2f} seconds")

# Extract additional binary features
print("Extracting additional features...")
start_time = time.time()
train_additional_features = train_df[['has_company_logo', 'has_questions', 'telecommuting']].to_numpy()
test_additional_features = test_df[['has_company_logo', 'has_questions', 'telecommuting']].to_numpy()
print(f"Additional feature extraction completed in {time.time() - start_time:.2f} seconds")

# Concatenate BERT embeddings with additional features
print("Concatenating BERT embeddings with additional features...")
start_time = time.time()
# One column-wise stack per split instead of a Python-level concatenate per
# row; this also fails loudly if the row counts ever disagree.
train_features = np.hstack([np.asarray(train_bert_features), train_additional_features])
test_features = np.hstack([np.asarray(test_bert_features), test_additional_features])
print(f"Feature concatenation completed in {time.time() - start_time:.2f} seconds")

# Scale the features using MinMaxScaler to ensure non-negative values
print("Scaling features with MinMaxScaler...")
start_time = time.time()
# clip=True keeps transformed data inside [0, 1] even when a test value lies
# outside the range seen during fit; without it, test features could go
# negative, which is what this scaling step is meant to prevent.
scaler = MinMaxScaler(clip=True)
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)
print(f"Feature scaling completed in {time.time() - start_time:.2f} seconds")

# Fit a multinomial Naive Bayes model on the scaled training features
print("Training Naive Bayes classifier...")
start_time = time.time()
train_labels = train_df['fraudulent']
nb_model = MultinomialNB()
nb_model.fit(train_features, train_labels)
elapsed = time.time() - start_time
print(f"Training completed in {elapsed:.2f} seconds")

# Predict labels for the held-out rows
print("Making predictions on the test set...")
start_time = time.time()
test_predictions = nb_model.predict(test_features)
elapsed = time.time() - start_time
print(f"Prediction completed in {elapsed:.2f} seconds")

# Score the predictions against the true test labels
print("Evaluating the model...")
start_time = time.time()
test_labels = test_df['fraudulent']
accuracy = accuracy_score(test_labels, test_predictions)
report = classification_report(test_labels, test_predictions)
elapsed = time.time() - start_time
print(f"Evaluation completed in {elapsed:.2f} seconds")

print(f'Test Accuracy: {accuracy}')
print(f'Test Report:\n{report}')

# Persist the fitted classifier and scaler, then offer both for download.
# NOTE: the scaler file is named 'standard_scaler.pkl' although the object
# stored in it is the MinMaxScaler fitted earlier in this script.
model_path = '/content/naive_bayes_model.pkl'
scaler_path = '/content/standard_scaler.pkl'

print("Saving model and scaler...")
start_time = time.time()
joblib.dump(nb_model, model_path)
joblib.dump(scaler, scaler_path)
elapsed = time.time() - start_time
print(f"Model and scaler saved in {elapsed:.2f} seconds")

print("Download the files:")
for artifact_path in (model_path, scaler_path):
    files.download(artifact_path)


Loading dataset...
Dataset loaded in 0.36 seconds
Checking for missing values...
1
Splitting dataset...
Dataset split completed in 0.00 seconds
Initializing BERT tokenizer and model...
Tokenizer and model initialization completed in 1.57 seconds
Extracting BERT features...
BERT feature extraction completed in 3478.97 seconds
Extracting additional features...
Additional feature extraction completed in 0.01 seconds
Concatenating BERT embeddings with additional features...
Feature concatenation completed in 0.10 seconds
Scaling features with MinMaxScaler...
Feature scaling completed in 0.09 seconds
Training Naive Bayes classifier...
Training completed in 0.06 seconds
Making predictions on the test set...
Prediction completed in 0.00 seconds
Evaluating the model...
Evaluation completed in 0.02 seconds
Test Accuracy: 0.9105145413870246
Test Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       418
           1       0.00      0.00  

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>