<a href="https://colab.research.google.com/github/MuhammadShavaiz/AI_learning/blob/main/Sentence_Boundary_Detection_revision_needed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [51]:
import numpy as np
import nltk
from nltk.corpus import inaugural
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

# Download the NLTK 'inaugural' corpus (presidential inaugural addresses) so the
# `inaugural` corpus reader imported above has data to read. The call's return
# value is the last expression in the cell and is therefore shown as its output.
nltk.download('inaugural')


[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!


True

In [52]:
# Read the raw text of the 2009 inaugural address from the NLTK corpus
text_data = inaugural.raw('2009-Obama.txt')

# Build the character vocabulary: each distinct character, taken in sorted
# order, is assigned its position as an integer id
chars = sorted(set(text_data))
char_to_int = dict(zip(chars, range(len(chars))))
num_classes = len(char_to_int)

# Create sequences and labels
def create_sequences(text, seq_length=100, boundary_chars='.!?;'):
    """Slice ``text`` into overlapping windows labelled by the character that follows.

    Window ``i`` is ``text[i:i + seq_length]``. Its label is 1 when the
    character immediately after the window, ``text[i + seq_length]``, is one
    of ``boundary_chars`` and 0 otherwise.

    Args:
        text: The source string.
        seq_length: Number of characters in each window (must not be negative).
        boundary_chars: Characters treated as sentence-ending punctuation.
            Defaults to ``'.!?;'``.

    Returns:
        A ``(sequences, labels)`` tuple of two equal-length lists: the window
        strings and their 0/1 labels. Both lists are empty when ``text`` has
        ``seq_length`` characters or fewer.

    Raises:
        ValueError: If ``seq_length`` is negative.
    """
    if seq_length < 0:
        # A negative length would silently produce misaligned windows/labels.
        raise ValueError(f"seq_length must be non-negative, got {seq_length}")

    boundaries = set(boundary_chars)
    # One window per position that still has a following character to label.
    n_windows = max(len(text) - seq_length, 0)

    sequences = [text[i:i + seq_length] for i in range(n_windows)]
    labels = [1 if text[i + seq_length] in boundaries else 0 for i in range(n_windows)]
    return sequences, labels

# Build the windows and their 0/1 labels from the full speech text, using the
# function's default window length (seq_length=100).
sequences, labels = create_sequences(text_data)

# One-hot encode the sequences
def one_hot_encode(sequences, char_to_int, num_classes):
    """Turn each character sequence into a one-hot matrix and stack the results.

    Args:
        sequences: Iterable of character sequences (e.g. strings).
        char_to_int: Mapping from each character to its column index.
        num_classes: Width of every one-hot row.

    Returns:
        ``np.array`` built from one ``(len(seq), num_classes)`` float matrix
        per sequence; row ``j`` of a matrix has a 1 in the column given by
        ``char_to_int`` for the ``j``-th character and 0 elsewhere.

    Raises:
        KeyError: If a character is missing from ``char_to_int``.
    """
    matrices = []
    for text in sequences:
        # Look up every column index first, then set all the ones in one
        # indexed assignment instead of a per-character loop.
        columns = np.array([char_to_int[ch] for ch in text], dtype=np.intp)
        rows = np.arange(len(text))
        matrix = np.zeros((len(text), num_classes))
        matrix[rows, columns] = 1
        matrices.append(matrix)
    return np.array(matrices)

# Encode every window as a one-hot matrix and collect the labels in an array
X = one_hot_encode(sequences, char_to_int, num_classes)
y = np.asarray(labels)

# Spell out the (samples, window length, num_classes) layout fed to the CNN
X = X.reshape(X.shape[0], X.shape[1], num_classes)

# Hold out 20% of the windows for testing; the fixed random_state keeps the
# split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [53]:
# Copy the NumPy splits into float32 tensors: the model consumes float inputs
# and nn.BCELoss expects float targets
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)


In [62]:
class SentenceBoundaryCNN(nn.Module):
    """1-D CNN that maps a one-hot character window to a single probability.

    Architecture: two ``Conv1d`` layers (kernel 3, padding 1), each followed by
    ReLU and ``MaxPool1d(2)``, then two fully connected layers and a sigmoid.

    Args:
        num_classes: Size of the one-hot vector per character (input channels).
        seq_length: Number of characters per input window. Defaults to 100,
            which gives the flattened size ``64 * 25``.

    Raises:
        ValueError: If ``seq_length`` is smaller than 4, because two rounds of
            pooling would leave nothing to feed the linear layers.
    """

    def __init__(self, num_classes, seq_length=100):
        super().__init__()
        if seq_length < 4:
            raise ValueError(f"seq_length must be at least 4, got {seq_length}")

        self.conv1 = nn.Conv1d(in_channels=num_classes, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2)

        # The padded kernel-3 convolutions keep the length unchanged; each
        # pooling step halves it (rounding down). Deriving the flattened size
        # from seq_length replaces the previous hard-coded 64 * 25.
        pooled_length = (seq_length // 2) // 2
        self.fc1_input_size = 64 * pooled_length
        self.fc1 = nn.Linear(self.fc1_input_size, 128)
        self.fc2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-sample probabilities of shape ``(batch_size, 1)``.

        Args:
            x: Tensor laid out as ``(batch_size, seq_length, num_classes)``.
        """
        # Conv1d wants channels first: (batch_size, num_classes, seq_length)
        x = x.permute(0, 2, 1)
        # torch.relu is used because the notebook never imports
        # torch.nn.functional as F, so F.relu raised NameError on a fresh kernel.
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)  # Flatten to (batch_size, fc1_input_size)
        x = torch.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x

In [63]:
# Seed PyTorch's RNG before the model is built so the weight initialisation,
# and with it the loss curve below, is repeatable under Restart & Run All
torch.manual_seed(42)

# Initialize model, criterion, and optimizer
model = SentenceBoundaryCNN(num_classes=num_classes)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss on the sigmoid outputs
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: each epoch is a single full-batch gradient step over the
# whole training set
num_epochs = 100
model.train()  # Nothing inside the loop leaves training mode, so set it once
for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Forward pass; flatten (N, 1) outputs to (N,) to match the targets
    outputs = model(X_train_tensor).view(-1)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()  # Backward pass
    optimizer.step()  # Optimize

    # Report the loss every 10th epoch
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [10/100], Loss: 0.0656
Epoch [20/100], Loss: 0.0924
Epoch [30/100], Loss: 0.0739
Epoch [40/100], Loss: 0.0582
Epoch [50/100], Loss: 0.0603
Epoch [60/100], Loss: 0.0580
Epoch [70/100], Loss: 0.0583
Epoch [80/100], Loss: 0.0579
Epoch [90/100], Loss: 0.0579
Epoch [100/100], Loss: 0.0578


In [64]:
# Score the held-out windows without tracking gradients and threshold the
# probabilities at 0.5 to get hard 0/1 predictions
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor).view(-1)
    y_pred_labels = (y_pred > 0.5).float()

# Calculate accuracy
accuracy = (y_pred_labels == y_test_tensor).float().mean()
print(f'Test Accuracy: {accuracy.item():.4f}')

# Accuracy alone can look high when one label dominates, so also report the
# accuracy of always predicting the most common label, plus precision and
# recall for the boundary (label 1) class.
positive_rate = y_test_tensor.mean().item()
baseline_accuracy = max(positive_rate, 1 - positive_rate)

predicted_positive = y_pred_labels == 1
actual_positive = y_test_tensor == 1
true_positives = (predicted_positive & actual_positive).sum().item()
n_predicted_positive = predicted_positive.sum().item()
n_actual_positive = actual_positive.sum().item()

# Guard the ratios: either count can be zero (e.g. no window predicted as 1)
precision = true_positives / n_predicted_positive if n_predicted_positive else 0.0
recall = true_positives / n_actual_positive if n_actual_positive else 0.0

print(f'Majority-class baseline accuracy: {baseline_accuracy:.4f}')
print(f'Boundary precision: {precision:.4f}, recall: {recall:.4f}')

Test Accuracy: 0.9891
