In [26]:
import torch
import pickle
import pandas as pd

from utils.utils import create_data_pipeline
from models.mohamed_ashraf.bilstm3 import BiLSTM

In [27]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [28]:
with open('utils/diacritic2id.pickle', 'rb') as f:
    diacritic2idx = pickle.load(f)

with open('utils/letter2idx.pickle', 'rb') as f:
    letter2idx = pickle.load(f)

In [29]:
vocab_size = len(letter2idx)
num_classes = len(diacritic2idx)

print(f"Vocab size: {vocab_size}, Number of classes: {num_classes}")

Vocab size: 38, Number of classes: 16


In [30]:
def pad_collate_fn(batch):
    x_batch, y_batch, mask_batch = zip(*batch)
    lengths_x = [len(x) for x in x_batch]
    x_padded = torch.nn.utils.rnn.pad_sequence(x_batch, batch_first=True, padding_value=letter2idx['<PAD>'])
    y_padded = torch.nn.utils.rnn.pad_sequence(y_batch, batch_first=True, padding_value=diacritic2idx['<PAD>'])
    mask_spadded = torch.nn.utils.rnn.pad_sequence(mask_batch, batch_first=True, padding_value=0)
    return x_padded, y_padded, mask_spadded, torch.tensor(lengths_x, dtype=torch.long)

In [31]:
test_dataset,  test_loader= create_data_pipeline(
    corpus_path='dataset_no_diacritics.txt', 
    letter2idx=letter2idx, 
    diacritic2idx=diacritic2idx, 
    collate_fn=pad_collate_fn,
    train=False, 
    batch_size=32
)

In [32]:
model = BiLSTM(vocab_size=vocab_size, num_classes=num_classes)

In [33]:
checkpoint = torch.load("./models/mohamed_ashraf/bilstm3.pth", map_location=device)

model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)

# print(f"Loaded checkpoint from epoch {checkpoint['epoch']}")
# print(f"Validation Loss: {checkpoint['val_loss']:.4f}")
# print(f"Validation Accuracy: {checkpoint['val_accuracy']:.4f}")

  checkpoint = torch.load("./models/mohamed_ashraf/bilstm3.pth", map_location=device)


BiLSTM(
  (embedding): Embedding(38, 256, padding_idx=13)
  (bilstm): LSTM(256, 256, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (emb_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (lstm_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (fc1_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (fc2): Linear(in_features=256, out_features=16, bias=True)
)

In [34]:
def create_predictions_csv(model, test_loader, letter2idx, output_file='predictions.csv'):
    model.eval()
    space_idx = letter2idx.get(' ', None)
    pad_idx = letter2idx.get('<PAD>', None)
    
    idx2letter = {v: k for k, v in letter2idx.items()}
    
    results = []
    row_id = 0
    line_number = 0
    
    with torch.no_grad():
        for batch_X, _, batch_mask, lengths in test_loader:
            batch_X = batch_X.to(device)
            
            outputs = model(batch_X, lengths)
            preds = outputs.argmax(dim=-1)
            
            batch_X_cpu = batch_X.cpu().numpy()
            preds_cpu = preds.cpu().numpy()
            batch_mask_cpu = batch_mask.cpu().numpy()
            
            for i in range(batch_X.size(0)):
                seq_len = lengths[i].item()
                
                for j in range(seq_len):
                    letter_idx = int(batch_X_cpu[i, j])
                    diacritic_idx = int(preds_cpu[i, j])
                    mask_value = int(batch_mask_cpu[i, j])
                    
                    if letter_idx == pad_idx:
                        continue
                    
                    if letter_idx == space_idx:
                        line_number += 1
                        continue
                    
                    letter = idx2letter.get(letter_idx, '<UNK>')
                    
                    case_ending = True if mask_value == 1 else False
                    
                    results.append({
                        'ID': row_id,
                        'line_number': line_number,
                        'letter': letter,
                        'case_ending': case_ending,
                        'label': diacritic_idx
                    })
                    row_id += 1
    
    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False)
    
    return df

In [35]:
predictions_df = create_predictions_csv(
    model=model,
    test_loader=test_loader,
    letter2idx=letter2idx,
    output_file='predictions.csv'
)

In [36]:
print("Current predictions.csv columns:")
print(predictions_df.columns.tolist())
print(f"\nShape: {predictions_df.shape}")
print(f"\nFirst few rows:")
print(predictions_df.head())

def create_kaggle_submission(predictions_df, output_file='submission.csv'):
    submission_df = predictions_df[['ID', 'label']].copy()
    
    submission_df.to_csv(output_file, index=False)
    
    return submission_df

submission_df = create_kaggle_submission(predictions_df, 'submission.csv')

Current predictions.csv columns:
['ID', 'line_number', 'letter', 'case_ending', 'label']

Shape: (237240, 5)

First few rows:
   ID  line_number letter  case_ending  label
0   0            0      ŸÅ        False      4
1   1            0      Ÿä         True     14
2   2            1      ÿß        False     14
3   3            1      ŸÑ        False      6
4   4            1      ŸÖ        False      0


In [37]:
def create_case_ending_submission(predictions_df, output_file='submission_ce.csv'):
    """
    Create a submission file with only rows where case_ending is True.
    
    Args:
        predictions_df: DataFrame with columns ['ID', 'line_number', 'letter', 'case_ending', 'label']
        output_file: Output CSV filename (default: 'submission_ce.csv')
    
    Returns:
        DataFrame with only case_ending=True rows, containing ['ID', 'label'] columns
    """
    # Filter rows where case_ending is True
    ce_df = predictions_df[predictions_df['case_ending'] == True].copy()
    
    # Select only ID and label columns for submission
    submission_ce_df = ce_df[['ID', 'label']].copy()
    
    # Save to CSV
    submission_ce_df.to_csv(output_file, index=False)
    
    print(f"‚úÖ Created {output_file}")
    print(f"   Total rows with case_ending=True: {len(submission_ce_df):,}")
    print(f"   Columns: {submission_ce_df.columns.tolist()}")
    print(f"\nüìä First few rows:")
    print(submission_ce_df.head())
    
    return submission_ce_df

# Create the case ending submission file
submission_ce_df = create_case_ending_submission(predictions_df, 'submission_ce.csv')

‚úÖ Created submission_ce.csv
   Total rows with case_ending=True: 56,736
   Columns: ['ID', 'label']

üìä First few rows:
    ID  label
1    1     14
7    7      4
12  12     12
18  18      4
20  20      6
