In [4]:
import pandas as pd
import numpy as np
import torch
from transformers import BertModel, BertTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler
import time
import joblib  # Import joblib for saving models
from google.colab import files

# Function to process data in batches
def extract_features_in_batches(descriptions, tokenizer, model, device, batch_size=32, max_len=128):
    """Embed ``descriptions`` chunk by chunk and collect one feature row per text.

    The input is sliced into consecutive chunks of at most ``batch_size`` items.
    Each chunk is handed to ``extract_bert_features`` and the rows of the array
    it returns are appended, in order, to a single flat list.

    Args:
        descriptions: Sized, sliceable collection of texts. Each slice is passed
            unchanged to ``extract_bert_features``.
        tokenizer: Forwarded unchanged to ``extract_bert_features``.
        model: Forwarded unchanged to ``extract_bert_features``.
        device: Forwarded unchanged to ``extract_bert_features``.
        batch_size: Maximum number of texts per chunk; must be positive.
        max_len: Forwarded to ``extract_bert_features`` as its ``max_len``.

    Returns:
        list: One entry per input text, in input order (empty for empty input).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (0) or a
    # silently empty result (negative values).
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    features = []
    # Stepping the range by batch_size covers the trailing partial chunk
    # without a separate ceil-division computation.
    for start in range(0, len(descriptions), batch_size):
        batch_descs = descriptions[start:start + batch_size]
        # extend() iterates over the first axis, so each row becomes one entry.
        features.extend(extract_bert_features(batch_descs, tokenizer, model, device, max_len))

    return features

def extract_bert_features(texts, tokenizer, model, device, max_len=128):
    """Return the first-token ([CLS]) hidden state of each text as a NumPy array.

    Args:
        texts: Batch of strings. Objects exposing ``tolist()`` (e.g. NumPy
            arrays, pandas Series) are converted with it; any other iterable
            (list, tuple, ...) is copied into a list. A single ``str`` is
            treated as a batch of one.
        tokenizer: Callable invoked with the list of texts plus ``max_length``,
            ``padding=True``, ``truncation=True`` and ``return_tensors='pt'``.
            Its result must support ``.to(device)`` and ``**`` unpacking.
        model: Callable invoked as ``model(**tokens)``; its output must expose
            ``last_hidden_state``.
        device: Device the tokenized batch is moved to before the forward pass.
        max_len: Passed to the tokenizer as ``max_length``.

    Returns:
        numpy.ndarray: ``last_hidden_state[:, 0, :]`` copied to CPU, i.e. one
        row per input text.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]
    # Accept plain Python sequences as well as array-likes with .tolist().
    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)

    tokens = tokenizer(
        text_list,
        max_length=max_len,
        padding=True,
        truncation=True,
        return_tensors='pt'
    ).to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**tokens)

    # Use [CLS] token hidden state as the sentence embedding
    cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    return cls_embeddings


In [5]:
# Load your dataset from runtime
print("Loading dataset...")
start_time = time.time()
df = pd.read_csv('/content/job_train.csv')  # Ensure the file is in the Colab runtime
print(f"Dataset loaded in {time.time() - start_time:.2f} seconds")

# Check for missing values in 'description' column
print("Checking for missing values...")
print(df['description'].isnull().sum())

# Handle missing values in 'description' column by filling NaN with 'missing'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['description']: that form mutates an
# intermediate object, emits a FutureWarning on recent pandas versions and
# leaves df unchanged once Copy-on-Write is enabled.
df['description'] = df['description'].fillna('missing')


Loading dataset...
Dataset loaded in 0.20 seconds
Checking for missing values...
1


In [6]:
# Split dataset into training and test sets (10% for test set, per test_size=0.1).
# random_state is fixed so the same rows land in each split on every run.
print("Splitting dataset...")
start_time = time.time()
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
print(f"Dataset split completed in {time.time() - start_time:.2f} seconds")


Splitting dataset...
Dataset split completed in 0.00 seconds


In [7]:
# Initialize the BertTokenizer and BERT model
print("Initializing BERT tokenizer and model...")
start_time = time.time()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.eval()  # Set BERT to evaluation mode
# Use the GPU when the runtime provides one and fall back to CPU otherwise.
# The feature-extraction helpers move each tokenized batch to this device and
# copy the embeddings back to CPU, so no other cell needs to change.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model = bert_model.to(device)
print(f"Tokenizer and model initialization completed in {time.time() - start_time:.2f} seconds")


Initializing BERT tokenizer and model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Tokenizer and model initialization completed in 7.93 seconds


In [8]:
# Extract BERT features in batches
# Each split's 'description' column is converted to a NumPy array and embedded
# with the default batch_size / max_len of extract_features_in_batches.
# The two calls are kept as separate statements so that the train features
# stay bound even if the test call fails.
# NOTE: the recorded run below took roughly 2075 s on CPU for both splits.
print("Extracting BERT features...")
start_time = time.time()
train_bert_features = extract_features_in_batches(train_df['description'].to_numpy(), tokenizer, bert_model, device)
test_bert_features = extract_features_in_batches(test_df['description'].to_numpy(), tokenizer, bert_model, device)
print(f"BERT feature extraction completed in {time.time() - start_time:.2f} seconds")


Extracting BERT features...
BERT feature extraction completed in 2074.82 seconds


In [9]:
# Pull the three flag columns out of each split as plain NumPy arrays.
print("Extracting additional features...")
start_time = time.time()
# Single source of truth for the column selection, so both splits always
# use the same columns in the same order.
binary_feature_columns = ['has_company_logo', 'has_questions', 'telecommuting']
train_additional_features = train_df[binary_feature_columns].to_numpy()
test_additional_features = test_df[binary_feature_columns].to_numpy()
elapsed = time.time() - start_time
print(f"Additional feature extraction completed in {elapsed:.2f} seconds")


Extracting additional features...
Additional feature extraction completed in 0.00 seconds


In [10]:
# Concatenate BERT embeddings with additional features
print("Concatenating BERT embeddings with additional features...")
start_time = time.time()
# Stack the per-row embeddings into a 2-D array and append the extra columns
# in one vectorized call. Unlike a zip() over rows, np.hstack raises if the
# two inputs have different row counts instead of silently truncating.
train_features = np.hstack([np.asarray(train_bert_features), train_additional_features])
test_features = np.hstack([np.asarray(test_bert_features), test_additional_features])
print(f"Feature concatenation completed in {time.time() - start_time:.2f} seconds")

# Scale the features using MinMaxScaler to ensure non-negative values
print("Scaling features with MinMaxScaler...")
start_time = time.time()
scaler = MinMaxScaler()
# Fit on the training split only; the test split reuses the training min/max
# so no information from the test rows leaks into the scaling.
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)
print(f"Feature scaling completed in {time.time() - start_time:.2f} seconds")


Concatenating BERT embeddings with additional features...
Feature concatenation completed in 0.05 seconds
Scaling features with MinMaxScaler...
Feature scaling completed in 0.05 seconds


In [11]:
# Fit a 3-nearest-neighbours classifier on the scaled training features,
# using the 'fraudulent' column of the training split as the target.
print("Training KNN classifier with 3 neighbors...")
start_time = time.time()
# fit() returns the estimator itself, so construction and fitting can be chained.
knn_model = KNeighborsClassifier(n_neighbors=3).fit(
    train_features,
    train_df['fraudulent'],
)
elapsed = time.time() - start_time
print(f"Training completed in {elapsed:.2f} seconds")


Training KNN classifier with 3 neighbors...
Training completed in 0.01 seconds


In [13]:
# Make predictions on the test set
# test_features must be the scaled matrix produced by scaler.transform, i.e. the
# same feature layout the classifier was fitted on.
print("Making predictions on the test set...")
start_time = time.time()
test_predictions = knn_model.predict(test_features)
print(f"Prediction completed in {time.time() - start_time:.2f} seconds")

# Evaluate the model
# Both metrics compare the 'fraudulent' column of the test split with the predictions.
# NOTE: in the recorded run the test split has 839 rows of class 0 and 55 of
# class 1, so overall accuracy is dominated by class 0; read the per-class
# precision/recall in the report (class 1 recall was 0.55) alongside it.
print("Evaluating the model...")
start_time = time.time()
accuracy = accuracy_score(test_df['fraudulent'], test_predictions)
report = classification_report(test_df['fraudulent'], test_predictions)
print(f"Evaluation completed in {time.time() - start_time:.2f} seconds")

print(f'Test Accuracy: {accuracy}')
print(f'Test Report:\n{report}')


Making predictions on the test set...
Prediction completed in 0.32 seconds
Evaluating the model...
Evaluation completed in 0.01 seconds
Test Accuracy: 0.9653243847874721
Test Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       839
           1       0.83      0.55      0.66        55

    accuracy                           0.97       894
   macro avg       0.90      0.77      0.82       894
weighted avg       0.96      0.97      0.96       894



In [14]:
# Save the trained model and scaler
# Both objects are serialized with joblib so they can be reloaded together;
# new inputs must be passed through the saved scaler before calling the model.
# NOTE(review): the scaler written here is the MinMaxScaler fitted earlier in
# this notebook, although the file is named 'standard_scaler.pkl'. The name is
# left as-is because consumers of the artifact may depend on it — confirm
# before renaming.
print("Saving model and scaler...")
start_time = time.time()
joblib.dump(knn_model, '/content/knn_model.pkl')
joblib.dump(scaler, '/content/standard_scaler.pkl')
print(f"Model and scaler saved in {time.time() - start_time:.2f} seconds")

# Trigger browser downloads of the two artifacts (google.colab only).
print("Download the files:")
files.download('/content/knn_model.pkl')
files.download('/content/standard_scaler.pkl')


Saving model and scaler...
Model and scaler saved in 0.05 seconds
Download the files:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>