In [17]:
!pip install streamlit ngrok nltk torch

Collecting streamlit
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting ngrok
  Downloading ngrok-1.4.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manyli

In [37]:
%%writefile app.py
import streamlit as st
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import nltk
from nltk.tree import Tree
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# Download the Treebank corpus
nltk.download("treebank")

# Load the dataset properly
treebank = list(nltk.corpus.treebank.parsed_sents())  # Convert to list

def tree_to_tuples(tree):
    tuples = []
    if isinstance(tree, Tree):
        for subtree in tree:
            tuples.append((tree.label(), subtree.label() if isinstance(subtree, Tree) else subtree))
            tuples.extend(tree_to_tuples(subtree))
    return tuples
class TreeDataset(Dataset):
    def __init__(self, trees):
        self.data = []
        for tree in trees:
            self.data.extend(tree_to_tuples(tree))
        self.labels = list(set([x[0] for x in self.data] + [x[1] for x in self.data if isinstance(x[1], str)]))
        self.label2idx = {label: idx for idx, label in enumerate(self.labels)}
        self.idx2label = {idx: label for label, idx in self.label2idx.items()}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        parent, child = self.data[idx]
        parent_idx = self.label2idx[parent]
        child_idx = self.label2idx.get(child, len(self.label2idx) - 1)  # Ensure index is valid
        return torch.tensor(parent_idx), torch.tensor(child_idx)
class RNNParser(nn.Module):
    def __init__(self, num_labels, embedding_dim=64, hidden_dim=128):
        super(RNNParser, self).__init__()
        self.embedding = nn.Embedding(num_labels + 1, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_labels + 1) # Add +1 here to match the embedding layer output

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)
        output, _ = self.rnn(x)
        output = self.fc(output.squeeze(1))
        return output

# Assuming 'dataset' is your TreeDataset instance
model = RNNParser(num_labels=len(dataset.labels))

# Assuming you have a DataLoader named 'dataloader' for your dataset
dataloader = DataLoader(dataset, batch_size=16, shuffle=True) # Example batch size

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ... rest of your training and evaluation code ...

# Train model
epochs = 5
for epoch in range(epochs):
    total_loss = 0
    for parent, child in dataloader:
        optimizer.zero_grad()
        output = model(parent)
        loss = criterion(output, child)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

torch.save(model.state_dict(), "parser_model.pth")
# Evaluate model
def evaluate(model, dataset):
    y_true, y_pred = [], []
    for parent, child in dataset:
        with torch.no_grad():
            output = model(parent.unsqueeze(0))
            predicted = torch.argmax(output, dim=1).item()
        y_true.append(child.item())
        y_pred.append(predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

evaluate(model, dataset)

model = RNNParser(num_labels=len(dataset.labels))
model.load_state_dict(torch.load("parser_model.pth", map_location=torch.device("cpu"), weights_only=True))
model.eval()

# Streamlit UI
st.title("Constituency Parser with RNN")
sentence = st.text_input("Enter a sentence:")
if st.button("Parse"):
    if sentence:
        with torch.no_grad():
            input_tensor = dataset.parse_sentence(sentence)
            output = model(input_tensor.unsqueeze(0))
            predicted_idx = torch.argmax(output, dim=1).tolist()
            predicted_labels = [dataset.idx2label[idx] for idx in predicted_idx]
        st.write("Predicted Parse Tree:", predicted_labels)
    else:
        st.warning("Please enter a valid sentence.")



Overwriting app.py


In [38]:
torch.save(model.state_dict(), "parser_model.pth")

In [44]:
!pip install pyngrok
!streamlit run app.py --server.headless true & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.143.141.115:8501[0m
[0m
your url is: https://stupid-ideas-yawn.loca.lt
[34m  Stopping...[0m
^C


In [27]:


# Download the Treebank corpus
nltk.download("treebank")

# Load the dataset properly
treebank = list(nltk.corpus.treebank.parsed_sents())  # Convert to list



[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [28]:
def tree_to_tuples(tree):
    tuples = []
    if isinstance(tree, Tree):
        for subtree in tree:
            tuples.append((tree.label(), subtree.label() if isinstance(subtree, Tree) else subtree))
            tuples.extend(tree_to_tuples(subtree))
    return tuples

In [29]:
class TreeDataset(Dataset):
    def __init__(self, trees):
        self.data = []
        for tree in trees:
            self.data.extend(tree_to_tuples(tree))
        self.labels = list(set([x[0] for x in self.data] + [x[1] for x in self.data if isinstance(x[1], str)]))
        self.label2idx = {label: idx for idx, label in enumerate(self.labels)}
        self.idx2label = {idx: label for label, idx in self.label2idx.items()}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        parent, child = self.data[idx]
        parent_idx = self.label2idx[parent]
        child_idx = self.label2idx.get(child, len(self.label2idx) - 1)  # Ensure index is valid
        return torch.tensor(parent_idx), torch.tensor(child_idx)


In [30]:
class RNNParser(nn.Module):
    def __init__(self, num_labels, embedding_dim=64, hidden_dim=128):
        super(RNNParser, self).__init__()
        self.embedding = nn.Embedding(num_labels + 1, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_labels + 1) # Add +1 here to match the embedding layer output

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)
        output, _ = self.rnn(x)
        output = self.fc(output.squeeze(1))
        return output

# Assuming 'dataset' is your TreeDataset instance
model = RNNParser(num_labels=len(dataset.labels))

# Assuming you have a DataLoader named 'dataloader' for your dataset
dataloader = DataLoader(dataset, batch_size=16, shuffle=True) # Example batch size

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ... rest of your training and evaluation code ...

# Train model
epochs = 5
for epoch in range(epochs):
    total_loss = 0
    for parent, child in dataloader:
        optimizer.zero_grad()
        output = model(parent)
        loss = criterion(output, child)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

torch.save(model.state_dict(), "parser_model.pth")
# Evaluate model
def evaluate(model, dataset):
    y_true, y_pred = [], []
    for parent, child in dataset:
        with torch.no_grad():
            output = model(parent.unsqueeze(0))
            predicted = torch.argmax(output, dim=1).item()
        y_true.append(child.item())
        y_pred.append(predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

evaluate(model, dataset)

Epoch 1, Loss: 28726.9507
Epoch 2, Loss: 27795.2133
Epoch 3, Loss: 27637.3043
Epoch 4, Loss: 27548.0148
Epoch 5, Loss: 27505.8876
Precision: 0.0301, Recall: 0.0216, F1-score: 0.0212


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
model = RNNParser(num_labels=len(dataset.labels))
model.load_state_dict(torch.load("parser_model.pth", map_location=torch.device("cpu")))
model.eval()

# Streamlit UI
st.title("Constituency Parser with RNN")
sentence = st.text_input("Enter a sentence:")
if st.button("Parse"):
    if sentence:
        with torch.no_grad():
            input_tensor = dataset.parse_sentence(sentence)
            output = model(input_tensor.unsqueeze(0))
            predicted_idx = torch.argmax(output, dim=1).tolist()
            predicted_labels = [dataset.idx2label[idx] for idx in predicted_idx]
        st.write("Predicted Parse Tree:", predicted_labels)
    else:
        st.warning("Please enter a valid sentence.")

  model.load_state_dict(torch.load("parser_model.pth", map_location=torch.device("cpu")))
