In [None]:
# ========== CELL 0: Intuition ==========

'''
graph TD
  A[Original Text] --> B(AMR Parsing)
  B --> C{AMR Graph}
  C --> D[Graph Linearization]
  D --> E[Graph Tokenization]
  E --> F[BERT Embedding]
  F --> G[Model Input]
  '''

In [None]:
# ========== CELL 1: AMR Parsing & Preprocessing ==========
import re

import amrlib
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# Load AMR parser
stog = amrlib.load_stog_model(device='cpu')

# Load CSV data
# Rows with a missing text or summary are dropped up front: NaN cells come
# back from pandas as floats, which the parser cannot tokenize.
df = (
    pd.read_csv('/data/path.csv')  # Replace with your data path
    .dropna(subset=['text', 'summary'])
    .reset_index(drop=True)
)
original_texts = df['text'].astype(str).tolist()
summaries = df['summary'].astype(str).tolist()

# Matches a `:wiki "Some_Page"` or `:wiki -` attribute inside a PENMAN string.
_WIKI_ATTR = re.compile(r'\s*:wiki\s+(?:"[^"]*"|-)')


def linearize_amr(graph, remove_wiki=True):
    """Flatten one PENMAN-serialized AMR graph into a single-line string.

    Args:
        graph: PENMAN text for one graph (may include `# ::key value`
            metadata lines), or None / '' when no graph is available.
        remove_wiki: when True, strip `:wiki` attributes from the graph.

    Returns:
        The graph on one line with metadata lines removed and runs of
        whitespace collapsed to single spaces; '' when `graph` is empty.
    """
    if not graph:
        return ''
    body_lines = [
        line for line in graph.splitlines()
        if not line.lstrip().startswith('#')  # metadata, not part of the graph
    ]
    flat = ' '.join(body_lines)
    if remove_wiki:
        flat = _WIKI_ATTR.sub('', flat)
    return ' '.join(flat.split())


# Convert text to AMR graphs
# parse_sents yields one PENMAN string per input sentence (None when a parse
# fails), so the results are plain strings rather than graph objects.
print("Parsing AMR graphs...")
amr_graphs = stog.parse_sents(original_texts)

# Parse graphs to linearized format
# Failed parses become '' so graph_strings stays index-aligned with summaries.
print("Linearizing graphs...")
graph_strings = [linearize_amr(g, remove_wiki=True) for g in amr_graphs]

In [None]:
# ========== CELL 2: AS2SP Model (Exact Paper Specs) ==========
class AS2SP(nn.Module):
    """Attentive sequence-to-sequence model with coverage and a pointer-generator gate.

    A 2-layer bidirectional LSTM encodes the tokenized source graph; a
    single-layer LSTM decoder consumes one target token per call. Each call
    performs ONE decoding step and returns the vocabulary scores, the
    attention distribution over source positions, and the generation
    probability, leaving the final pointer/generator mixing to the caller.

    All tensors are batch-first: (batch, seq_len, ...).
    """

    def __init__(self, vocab_size=None, emb_dim=768, enc_hidden=256,
                 dec_hidden=512, dropout=0.3):
        """Build the encoder, decoder, attention and pointer-generator layers.

        Args:
            vocab_size: number of token ids shared by encoder and decoder.
                When None, the module-level ``VOCAB_SIZE`` constant is used
                (the behaviour of the original zero-argument constructor).
            emb_dim: embedding width for both embedding tables.
            enc_hidden: total encoder output width (both directions combined);
                must be even because it is split across the two directions.
            dec_hidden: decoder hidden-state width.
            dropout: dropout probability for the embeddings and between the
                two encoder layers.

        Raises:
            ValueError: if ``enc_hidden`` is odd.
        """
        super().__init__()
        if vocab_size is None:
            vocab_size = VOCAB_SIZE  # notebook-level constant, differs based on data
        if enc_hidden % 2 != 0:
            raise ValueError("enc_hidden must be even (split across two directions)")

        # Encoder (BiLSTM)
        self.enc_embed = nn.Embedding(vocab_size, emb_dim)
        self.encoder = nn.LSTM(emb_dim, enc_hidden // 2,
                               num_layers=2,
                               bidirectional=True,
                               dropout=dropout,
                               batch_first=True)

        # Decoder (LSTM)
        # No `dropout=` here: nn.LSTM only applies it between stacked layers,
        # so on a single layer it has no effect and merely emits a warning.
        self.dec_embed = nn.Embedding(vocab_size, emb_dim)
        self.decoder = nn.LSTM(emb_dim, dec_hidden, batch_first=True)

        # Bridge: the merged encoder state is enc_hidden wide but the decoder
        # needs dec_hidden-wide initial states, so project one onto the other.
        self.init_h = nn.Linear(enc_hidden, dec_hidden)
        self.init_c = nn.Linear(enc_hidden, dec_hidden)

        # Attention (with coverage)
        self.W_h = nn.Linear(enc_hidden, dec_hidden)
        self.W_s = nn.Linear(dec_hidden, dec_hidden)
        self.W_cov = nn.Linear(1, dec_hidden)
        self.v = nn.Linear(dec_hidden, 1)

        # Pointer-Generator
        self.p_gen = nn.Linear(enc_hidden + dec_hidden + emb_dim, 1)
        self.fc = nn.Linear(dec_hidden, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src_graph, trg_text, cov_vec=None):
        """Run the encoder and a single decoder step.

        Args:
            src_graph: LongTensor (batch, src_len) of source token ids.
            trg_text: LongTensor (batch, 1) holding the current decoder
                input token for each sequence.
            cov_vec: optional FloatTensor (batch, src_len) with the coverage
                (accumulated attention) so far; omitted from the attention
                energy when None.

        Returns:
            Tuple ``(output, attn_weights, p_gen)``:
              * output: (batch, vocab_size) unnormalized vocabulary scores.
              * attn_weights: (batch, src_len) attention distribution
                (each row sums to 1).
              * p_gen: (batch, 1) generation probability in (0, 1).

        Raises:
            ValueError: if ``trg_text`` is not shaped (batch, 1).
        """
        if trg_text.dim() != 2 or trg_text.size(1) != 1:
            raise ValueError(
                "trg_text must have shape (batch, 1); got "
                f"{tuple(trg_text.shape)}"
            )

        # Encoder processing AMR graph
        enc_embedded = self.dropout(self.enc_embed(src_graph))
        enc_out, (h_n, c_n) = self.encoder(enc_embedded)

        # Merge the top layer's forward ([-2]) and backward ([-1]) final
        # states, then project them to the decoder's hidden width.
        h_cat = torch.cat((h_n[-2], h_n[-1]), dim=1)
        c_cat = torch.cat((c_n[-2], c_n[-1]), dim=1)
        h_0 = torch.tanh(self.init_h(h_cat)).unsqueeze(0)
        c_0 = self.init_c(c_cat).unsqueeze(0)

        # Decoder processing target text
        dec_embedded = self.dropout(self.dec_embed(trg_text))
        dec_out, (h_d, _) = self.decoder(dec_embedded, (h_0, c_0))
        dec_state = h_d[-1]  # (batch, dec_hidden)

        # Attention with coverage: the decoder state is broadcast over every
        # source position before scoring.
        energy = self.W_h(enc_out) + self.W_s(dec_state).unsqueeze(1)
        if cov_vec is not None:
            energy = energy + self.W_cov(cov_vec.unsqueeze(-1))
        scores = self.v(torch.tanh(energy)).squeeze(2)
        attn_weights = F.softmax(scores, dim=1)

        # Context vector: attention-weighted sum of encoder outputs
        context = torch.bmm(attn_weights.unsqueeze(1), enc_out).squeeze(1)

        # Pointer-Generator
        p_gen = torch.sigmoid(self.p_gen(
            torch.cat([context, dec_state, dec_embedded.squeeze(1)], dim=1)
        ))

        # Final output
        output = self.fc(dec_out.squeeze(1))
        return output, attn_weights, p_gen

In [None]:
# ========== CELL 3: Verification Checklist ==========

'''
# AMR Processing:

Text → AMR graphs using amrlib

Graph linearization to penman format

Graph tokenization as model input

# Model Architecture:

2-layer bidirectional LSTM encoder

1-layer LSTM decoder

Attention with coverage mechanism

Pointer-generator network

Dropout (0.3) and gradient clipping (2.0)

# Training Parameters:

Batch size 64/32

Learning rate 0.001

Adam optimizer

15 epochs
'''