In [1]:
import pandas as pd

In [6]:
nos_ssa = pd.read_csv("../docs/nos-ofqual/NOS_Data_w_SSA_Industry.csv")
nos_ssa.columns

Index(['nos_id', 'NOS_Industry', 'title', 'text', 'OFQUAL_SSA',
       'Additional OFQUAL_SSA', 'Additional_OFQUAL_SSA.2',
       'Additional_OFQUAL_SSA.3', 'Additional_OFQUAL_SSA.4',
       'Additional_OFQUAL_SSA.5', 'Additional_OFQUAL_SSA.6'],
      dtype='object')

In [7]:
SSA_unique_nos = nos_ssa['OFQUAL_SSA'].unique()
print(SSA_unique_nos)
print(len(SSA_unique_nos))

['14.1 Foundations for learning and life'
 '10.2 Archaeology and archaeological sciences'
 '7.1 Retailing and wholesaling' '1.4 Public services' nan
 '5.2 Building and construction' '15.3 Business management'
 '11.2 Sociology and social policy' '15.1 Accounting and finance'
 '4.1 Engineering' '3.1 Environmental conservation'
 '4.2 Manufacturing technologies' '9.2 Crafts, creative arts and design'
 '9.1 Performing arts' '3.4 Environmental conservation'
 '7.2 Warehousing and distribution' '7.4 Hospitality and catering'
 '3.3 Animal care and veterinary science' '3.2 Horticulture and forestry'
 '12.2 Other languages literature and culture' '15.4 Marketing and Sales'
 '1.3 Health and social care' '9.3 Media and communication'
 '13.1 Teaching and Lecturing' '2.1 Science'
 '4.3 Transportation operations and maintenance'
 '5.3 Urban, rural and regional planning'
 '6.1 Digital technology (practitioners)'
 '18.1 Publishing and information services'
 '8.1 Sport leisure and recreation' '3.1 Agricu

In [8]:
len_nos = len(nos_ssa)
ssa_nos_count = len(nos_ssa['OFQUAL_SSA'].dropna())
print(ssa_nos_count)
print(len_nos)


9205
16052


In [9]:
percentage_nos = ((ssa_nos_count/len_nos) * 100) 
print(percentage_nos,"% of NOS have SSA")

57.34487914278594 % of NOS have SSA


In [10]:
nos_ssa_list = nos_ssa[['nos_id', 'NOS_Industry', 'title', 'text', 'OFQUAL_SSA']].dropna()
print(nos_ssa_list)
nos_ssa_list.to_csv('nos_ssa_list_57%.csv', index=False)

         nos_id                     NOS_Industry  \
0        CFAUE3         Understanding Enterprise   
1      CCSAPAA2          Archaeological Practice   
2        CFAUE6         Understanding Enterprise   
3       IMIHR15                   Vehicle Rental   
4        CFAUE7         Understanding Enterprise   
...         ...                              ...   
16044     SFHM3  Breast Screening and Assessment   
16045     SFHM9  Breast Screening and Assessment   
16048      REC2                      Recruitment   
16049    CFAUE5         Understanding Enterprise   
16050      REC1                      Recruitment   

                                                   title  \
0            Knowing the market and satisfying customers   
1                                    Commission Research   
2                                   Planning for Success   
3      Receive goods, equipment or vehicles for hire ...   
4                                 Managing Money Matters   
...            

In [11]:
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import numpy as np
import torch

# 1. Data Preparation
def prepare_text_features(nos_data):
    # Fill NaN values with empty strings
    title = nos_data['title'].fillna('')
    text = nos_data['text'].fillna('')
    industry = nos_data['NOS_Industry'].fillna('')
    
    # Combine relevant text fields
    return title + ' ' + text + ' ' + industry

# 2. Create the classification pipeline
def create_classifier():
    return Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=10000,
            ngram_range=(1, 2),
            stop_words='english'
        )),
        ('clf', OneVsRestClassifier(LogisticRegression(
            max_iter=1000,
            class_weight='balanced'
        )))
    ])

# 3. Training and Evaluation
def train_and_evaluate(X, y):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Train the model
    classifier = create_classifier()
    classifier.fit(X_train, y_train)
    
    # Evaluate
    train_score = classifier.score(X_train, y_train)
    test_score = classifier.score(X_test, y_test)
    
    print(f"Training accuracy: {train_score:.3f}")
    print(f"Testing accuracy: {test_score:.3f}")
    
    return classifier

# 4. Prediction with confidence scores
def predict_with_confidence(classifier, text):
    # Handle NaN values in prediction data
    if isinstance(text, pd.Series):
        text = text.fillna('')
    
    # Get probability scores
    proba = classifier.predict_proba(text)
    predictions = classifier.predict(text)
    
    # Get confidence scores for each prediction
    confidence_scores = np.max(proba, axis=1)
    
    return predictions, confidence_scores

In [17]:
# Load and prepare training data
import pandas as pd

# Load training data
nos_ssa_list

# Prepare features and labels
X = prepare_text_features(nos_ssa_list)
y = nos_ssa_list['OFQUAL_SSA']

# Train the model
classifier = train_and_evaluate(X, y)

# Load prediction data
pred_data = pd.read_csv('../docs/nos-ofqual/NOS_Data_w_SSA_Industry.csv')

# Prepare prediction features
X_pred = prepare_text_features(pred_data)

# Make predictions with confidence scores
predictions, confidence_scores = predict_with_confidence(classifier, X_pred)

# Add predictions and confidence scores to prediction dataframe
pred_data['predicted_SSA'] = predictions
pred_data['confidence_score'] = confidence_scores

# Display sample of predictions
print("\nSample predictions:")
print(pred_data[['nos_id', 'NOS_Industry', 'OFQUAL_SSA', 'predicted_SSA', 'confidence_score']].head())

# Save predictions
pred_data.to_csv('nos_ssa_predictions.csv', index=False)
print("\nPredictions saved to nos_ssa_predictions.csv")


Training accuracy: 0.850
Testing accuracy: 0.785

Sample predictions:
     nos_id              NOS_Industry  \
0    CFAUE3  Understanding Enterprise   
1  CCSAPAA2   Archaeological Practice   
2    CFAUE6  Understanding Enterprise   
3   IMIHR15            Vehicle Rental   
4    CFAUE7  Understanding Enterprise   

                                     OFQUAL_SSA  \
0        14.1 Foundations for learning and life   
1  10.2 Archaeology and archaeological sciences   
2        14.1 Foundations for learning and life   
3                 7.1 Retailing and wholesaling   
4        14.1 Foundations for learning and life   

                                  predicted_SSA  confidence_score  
0        14.1 Foundations for learning and life          0.225326  
1  10.2 Archaeology and archaeological sciences          0.364586  
2        14.1 Foundations for learning and life          0.356810  
3                 7.1 Retailing and wholesaling          0.301787  
4        14.1 Foundations for learni

In [36]:
from sklearn.metrics import f1_score

# Create classifier with SGD
# Modified feature engineering
classifier = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=150000,  # 3x increase from 50k
        ngram_range=(1, 4),   # Carefully expanded
        stop_words='english',
        sublinear_tf=True,
        min_df=2,            # Slightly more permissive
        max_df=0.6,          # Slightly more restrictive
        analyzer='word',     # New - explicit analyzer
        norm='l2'            # New - better normalization
    )),
    ('clf', OneVsRestClassifier(SGDClassifier(
        loss='log_loss',
        penalty='elasticnet',
        alpha=1e-5,          # Reduced regularization
        learning_rate='adaptive',
        eta0=0.2,           # Higher initial rate
        power_t=0.25,
        early_stopping=True,
        validation_fraction=0.25,  # More validation data
        n_iter_no_change=7, # More patience
        random_state=42,
        class_weight='balanced'  # New - handle class imbalance
    )))
])


def train_and_evaluate_multiple_epochs(classifier, X, y, n_epochs=5, n_splits=5):
    # Initialize KFold cross-validator
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Track metrics across epochs
    epoch_metrics = []
    best_accuracy = 0
    
    # Split into train/test once to have consistent test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Initial fit to transform the data
    classifier.fit(X_train, y_train)
    
    for epoch in range(n_epochs):
        # Adjust learning rate dynamically
        new_eta = 0.1 / (1 + 0.2 * epoch)  # Decaying learning rate
        for estimator in classifier.named_steps['clf'].estimators_:
            estimator.eta0 = new_eta
            
        # Partial fit for incremental learning
        classifier.named_steps['clf'].partial_fit(
            classifier.named_steps['tfidf'].transform(X_train),
            y_train,
            classes=np.unique(y)
        )
        
        # Evaluate
        train_score = classifier.score(X_train, y_train)
        test_score = classifier.score(X_test, y_test)
        
        # Get predictions for F1 score calculation
        test_predictions = classifier.predict(X_test)
        
        # Cross validation score
        cv_scores = cross_val_score(classifier, X_train, y_train, cv=kf)
        
        print(f"\nEpoch {epoch + 1}:")
        print(f"Training accuracy: {train_score:.3f}")
        print(f"Testing accuracy: {test_score:.3f}")
        print(f"Cross-validation mean: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
        print(f"F-1 Score: {f1_score(y_test, test_predictions, average='weighted'):.3f}")  # Modified this line
        
        # Track metrics
        epoch_metrics.append({
            'epoch': epoch + 1,
            'train_score': train_score,
            'test_score': test_score,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std()
        })
        
        # Update best state if improved
        if test_score > best_accuracy:
            best_accuracy = test_score
            best_state = {
                'tfidf': classifier.named_steps['tfidf'],
                'clf': classifier.named_steps['clf']
            }
    
    # Print summary
    print("\nTraining Summary:")
    for metric in epoch_metrics:
        print(f"Epoch {metric['epoch']}: "
              f"Train={metric['train_score']:.3f}, "
              f"Test={metric['test_score']:.3f}, "
              f"CV={metric['cv_mean']:.3f}±{metric['cv_std']*2:.3f}")
    
    # Restore best state
    classifier.named_steps['tfidf'] = best_state['tfidf']
    classifier.named_steps['clf'] = best_state['clf']
    
    return classifier

# Train the classifier
classifier = train_and_evaluate_multiple_epochs(classifier, X, y, n_epochs=20)

# Make predictions
X_pred = prepare_text_features(pred_data)
predictions, confidence_scores = predict_with_confidence(classifier, X_pred)

# Add predictions and confidence scores
pred_data['predicted_SSA'] = predictions
pred_data['confidence_score'] = confidence_scores

# Display sample predictions
print("\nSample predictions with best model:")
sample_df = pred_data[['nos_id', 'NOS_Industry', 'OFQUAL_SSA', 'predicted_SSA', 'confidence_score']].head()
print(sample_df)

# Save predictions
pred_data.to_csv('nos_ssa_predictions.csv', index=False)
print("\nPredictions saved to nos_ssa_predictions.csv")


Epoch 1:
Training accuracy: 0.863
Testing accuracy: 0.772
Cross-validation mean: 0.777 (+/- 0.018)
F-1 Score: 0.770

Epoch 2:
Training accuracy: 0.863
Testing accuracy: 0.772
Cross-validation mean: 0.777 (+/- 0.018)
F-1 Score: 0.770

Epoch 3:
Training accuracy: 0.863
Testing accuracy: 0.772
Cross-validation mean: 0.777 (+/- 0.018)
F-1 Score: 0.771

Epoch 4:
Training accuracy: 0.863
Testing accuracy: 0.772
Cross-validation mean: 0.777 (+/- 0.018)
F-1 Score: 0.771

Epoch 5:
Training accuracy: 0.863
Testing accuracy: 0.771
Cross-validation mean: 0.777 (+/- 0.018)
F-1 Score: 0.770

Epoch 6:
Training accuracy: 0.864
Testing accuracy: 0.771
Cross-validation mean: 0.777 (+/- 0.018)
F-1 Score: 0.769

Epoch 7:
Training accuracy: 0.864
Testing accuracy: 0.771
Cross-validation mean: 0.777 (+/- 0.018)
F-1 Score: 0.769

Epoch 8:
Training accuracy: 0.864
Testing accuracy: 0.770
Cross-validation mean: 0.777 (+/- 0.018)
F-1 Score: 0.768

Epoch 9:
Training accuracy: 0.864
Testing accuracy: 0.769
Cross

In [29]:
# Check MPS memory usage
print(f"MPS memory allocated: {torch.mps.current_allocated_memory() / 1024**3:.2f} GB")
print(f"MPS max memory allocated: {torch.mps.recommended_max_memory() / 1024**3:.2f} GB")

# Clear GPU memory
import torch
torch.mps.empty_cache()  # Clear MPS (Apple GPU) memory

# Also clear CUDA memory if you were using it
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Force Python garbage collection
import gc
gc.collect()

# Check MPS memory usage
print(f"MPS memory allocated: {torch.mps.current_allocated_memory() / 1024**3:.2f} GB")
print(f"MPS max memory allocated: {torch.mps.recommended_max_memory() / 1024**3:.2f} GB")

MPS memory allocated: 5.22 GB
MPS max memory allocated: 10.67 GB
MPS memory allocated: 0.04 GB
MPS max memory allocated: 10.67 GB


In [31]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder

# Check for MPS availability
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Create PyTorch model
class SSAClassifier(torch.nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.fc = torch.nn.Linear(input_size, num_classes)
        torch.nn.init.xavier_uniform_(self.fc.weight)
        
    def forward(self, x):
        return torch.sigmoid(self.fc(x))

# 3. Modified training function
def train_pytorch_model(classifier, X, y, n_epochs=50):
    # Convert labels
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    y_tensor = torch.tensor(y_encoded, dtype=torch.long).to(device)
    
    # Get tensor features
    X_tensor = prepare_tensor_features(X)
    
    # Create model
    input_size = X_tensor.shape[1]
    num_classes = len(le.classes_)
    model = SSAClassifier(input_size, num_classes).to(device)
    
    # Training setup
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
    
    # Create DataLoader
    dataset = TensorDataset(X_tensor, y_tensor)
    loader = DataLoader(dataset, batch_size=64, shuffle=True,)
    
    best_acc = 0
    for epoch in range(n_epochs):
        model.train()
        total_loss = 0
        
        for batch_x, batch_y in loader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            
        # Validation
        model.eval()
        with torch.no_grad():
            outputs = model(X_tensor)
            _, predicted = torch.max(outputs, 1)
            acc = (predicted == y_tensor).float().mean()
            
        print(f"Epoch {epoch+1}: Loss={total_loss/len(loader):.4f}, Acc={acc:.4f}")
        
        # Save best model
        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), "best_model.pth")
    
    # Load best model
    model.load_state_dict(torch.load("best_model.pth"))
    return model, le


# Modified data preparation functions
def prepare_text_features(nos_data):
    """Handle both DataFrames and preprocessed Series"""
    if isinstance(nos_data, pd.DataFrame):
        # For raw DataFrames
        return nos_data['title'].fillna('') + ' ' + \
            nos_data['text'].fillna('') + ' ' + \
            nos_data['NOS_Industry'].fillna('')
    # For already processed text
    return nos_data.fillna('')

def prepare_tensor_features(text_data):
    """Handle preprocessed text directly"""
    # Vectorize using existing TF-IDF
    X_tfidf = classifier.named_steps['tfidf'].transform(text_data)
    
    # Convert to PyTorch tensors
    return torch.tensor(X_tfidf.toarray(), dtype=torch.float32).to(device)

# Update the PyTorch training call
model, label_encoder = train_pytorch_model(classifier, X, y, n_epochs=22)


# Usage:
# 1. First train the TF-IDF vectorizer
classifier.fit(X, y)

# 2. Train PyTorch model
model, label_encoder = train_pytorch_model(classifier, X, y, n_epochs=22)

# 3. Make predictions
def pytorch_predict(model, le, text_data):
    with torch.no_grad():
        X_tensor = prepare_tensor_features(text_data)
        outputs = model(X_tensor)
        _, predictions = torch.max(outputs, 1)
    return le.inverse_transform(predictions.cpu().numpy())

pred_data['predicted_SSA'] = pytorch_predict(model, label_encoder, X_pred)

Using device: mps
Epoch 1: Loss=3.5291, Acc=0.7218
Epoch 2: Loss=3.3499, Acc=0.0291
Epoch 3: Loss=3.2126, Acc=0.7715
Epoch 4: Loss=3.1122, Acc=0.7897
Epoch 5: Loss=3.0377, Acc=0.8035
Epoch 6: Loss=2.9807, Acc=0.0285
Epoch 7: Loss=2.9360, Acc=0.0282
Epoch 8: Loss=2.9003, Acc=0.8276
Epoch 9: Loss=2.8712, Acc=0.8334
Epoch 10: Loss=2.8471, Acc=0.0274
Epoch 11: Loss=2.8271, Acc=0.0275
Epoch 12: Loss=2.8102, Acc=0.0281
Epoch 13: Loss=2.7957, Acc=0.0279
Epoch 14: Loss=2.7834, Acc=0.0275
Epoch 15: Loss=2.7726, Acc=0.0268
Epoch 16: Loss=2.7633, Acc=0.8516
Epoch 17: Loss=2.7551, Acc=0.8518
Epoch 18: Loss=2.7479, Acc=0.8527
Epoch 19: Loss=2.7416, Acc=0.8530
Epoch 20: Loss=2.7358, Acc=0.8532
Epoch 21: Loss=2.7308, Acc=0.8535
Epoch 22: Loss=2.7263, Acc=0.8537
Epoch 1: Loss=3.5294, Acc=0.7216
Epoch 2: Loss=3.3508, Acc=0.7511
Epoch 3: Loss=3.2135, Acc=0.7736
Epoch 4: Loss=3.1129, Acc=0.7909
Epoch 5: Loss=3.0382, Acc=0.8038
Epoch 6: Loss=2.9812, Acc=0.8122
Epoch 7: Loss=2.9365, Acc=0.8206
Epoch 8: Los

RuntimeError: Invalid buffer size: 8.97 GB

In [None]:
# Load and prepare the data for prediction
def predict_ssa(text_input, model, classifier, label_encoder):
    # Prepare the text input
    if isinstance(text_input, str):
        text_input = pd.read_csv('../docs/nos-ofqual/NOS_Data_w_SSA_Industry.csv')
    
    # Preprocess the text using the same pipeline steps
    processed_text = prepare_text_features(text_input)
    
    # Convert to tensor features
    X_tensor = prepare_tensor_features(processed_text)
    
    # Get model predictions
    model.eval()
    with torch.no_grad():
        outputs = model(X_tensor)
        _, predicted = torch.max(outputs, 1)
        
    # Convert predictions back to labels
    predicted_labels = label_encoder.inverse_transform(predicted.cpu().numpy())
    
    return predicted_labels

# Example usage:
# sample_text = "Sample NOS text to classify"
# predictions = predict_ssa(sample_text, model, classifier, label_encoder)
# print(f"Predicted SSA: {predictions[0]}")
