In [1]:
!git clone https://github.com/djwackey/chinese-hiphop-lyrics.git
import os,json

allhiphop = os.walk('./chinese-hiphop-lyrics/')
songs = []
for root_dir,_,hiphops in allhiphop:
    for hiphop in hiphops:
       if hiphop.endswith(".json"):
        song_path = f'{root_dir}/{hiphop}'
        with open(song_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                for song in data['songs']:
                    songs.append(song)

fatal: destination path 'chinese-hiphop-lyrics' already exists and is not an empty directory.


In [2]:

file_path = 'word.txt'
for song in songs:
    # print(song)
    text = song['lyrics']
    lines = 'EOS'.join(text)
    with open(file_path, 'a', encoding='utf-8') as file:
        file.write(lines)


## Markov Chain Model

In [3]:
import random
import jieba  # Used for Chinese word segmentation

def read_file(file_path):
    # Read and return text from the file
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

def preprocess_text(text):
    # Segment the text into words using jieba and return the list of words
    words = list(jieba.cut(text))
    return words

def build_markov_chain(words):
    # Build a Markov chain dictionary where each word points to a list of words that come after it
    markov_chain = {}
    for i in range(len(words) - 1):
        word = words[i]
        next_word = words[i + 1]
        if word not in markov_chain:
            markov_chain[word] = []
        markov_chain[word].append(next_word)
    return markov_chain

def generate_text(chain, length=20):
    # Generate a text of specified length using the Markov chain
    word1 = random.choice(list(chain.keys()))
    result = [word1]
    for _ in range(length - 1):
        word2 = random.choice(chain[word1])
        result.append(word2)
        word1 = word2
    return ''.join(result)

file_path = './word.txt'  # Replace with your file path
text = read_file(file_path)  # Read the text from the file
words = preprocess_text(text)  # Preprocess the text for Markov chain
chain = build_markov_chain(words)  # Build the Markov chain
generated_text = generate_text(chain, 50)  # Generate text of length 50
print(generated_text)  # Print the generated text

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 0.805 seconds.
DEBUG:jieba:Loading model cost 0.805 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


westside
超脱后悔过过招像日本出品嘿 世世你的路EOS在梦EOS加入我们都为她吃
以前我制造 导步EOS人生被雪覆盖整片海域因为我的路EOS谁会跌倒的 can 


In [4]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Read text data
file_path = 'word.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Tokenize using jieba
words = list(jieba.cut(text))

# Build vocabulary
vocab = set(words)
word_to_index = {word: idx for idx, word in enumerate(vocab)}
index_to_word = {idx: word for idx, word in enumerate(vocab)}

# Convert text to index sequences
sequences = [word_to_index[word] for word in words]

# Create sequences and labels
sequence_length = 10  # Set the length of each sequence
sequences_input = []
labels = []

for i in range(len(sequences) - sequence_length):
    seq = sequences[i:i + sequence_length]
    label = sequences[i + sequence_length]
    sequences_input.append(seq)
    labels.append(label)

# Convert to PyTorch tensors
sequences_input = torch.tensor(sequences_input, dtype=torch.long)
labels = torch.tensor(labels, dtype=torch.long)

# Split into training and testing sets
split_ratio = 0.8  # 80% for training, 20% for testing
split_index = int(len(sequences_input) * split_ratio)

train_sequences = sequences_input[:split_index]
train_labels = labels[:split_index]

test_sequences = sequences_input[split_index:]
test_labels = labels[split_index:]

# Create data loaders
batch_size = 2048
train_dataset = TensorDataset(train_sequences, train_labels)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(test_sequences, test_labels)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [5]:
label

2616

In [6]:
# 查看训练数据和测试数据的一个批次
for batch_inputs, batch_labels in train_data_loader:
    # print("Train Batch Inputs:", batch_inputs)
    print("Train Batch Labels:", batch_labels)
    break

for batch_inputs, batch_labels in test_data_loader:
    # print("Test Batch Inputs:", batch_inputs)
    print("Test Batch Labels:", batch_labels)
    break

Train Batch Labels: tensor([ 9867, 12631, 11812, 35445, 40129, 37345, 20891, 36478, 11976, 37050,
        15001, 72604, 37882, 11976, 37882, 65587, 11976, 76178, 32887, 58246,
        18688, 79590, 67757, 11976, 47600, 57095, 72697, 61719, 70323, 36247,
        43138, 42512, 54293,  4955, 35472, 37882, 72281, 32172, 45145, 23669,
        34088, 23066, 64239, 11976, 42299, 12631, 11976, 40428, 26270, 11976,
        11976, 62139, 39322, 67032,  3584, 42668, 42967, 21727, 21951, 47010,
        38576, 32019,  5731, 23890])
Test Batch Labels: tensor([32956, 11976, 64798, 11976, 59542, 11976, 32956, 11976, 31082, 11976,
        38242, 11976, 32956, 11976, 49725, 45333, 17764, 22665, 11976, 24118,
        11976, 20865, 11976, 43846, 17764, 12631, 22463, 72697,  3319, 71515,
        17764, 12631, 46343, 72697,  3319, 71515, 17764, 13011, 23433, 43696,
        72697,  3319, 71515, 17764, 68958, 67032,  2326, 15045, 11976, 67963,
        28434, 35703, 17764, 74316, 70005, 54693, 65863, 11976,  6

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

# Assume you already have processed data: 'train_sequences', 'train_labels', 'test_sequences', 'test_labels'
# Representing input sequences and labels for the training and testing sets

# LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        output, (hidden, cell) = self.lstm(x)
        output = self.fc(output[:, -1, :])
        return output

# Hyperparameters
vocab_size = len(index_to_word)  # Assuming vocabulary size is 10000
embedding_dim = 128  # Embedding dimension
hidden_dim = 256  # LSTM hidden layer dimension
output_dim = vocab_size  # Output dimension same as vocabulary size
sequence_length = 10  # Set the length of each sequence

# Instantiate the model
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)

# Define loss function and optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert to PyTorch tensors
train_sequences = torch.tensor(train_sequences, dtype=torch.long)
train_labels = torch.tensor(train_labels, dtype=torch.long)

test_sequences = torch.tensor(test_sequences, dtype=torch.long)
test_labels = torch.tensor(test_labels, dtype=torch.long)

# Create data loaders
batch_size = 2048
train_dataset = TensorDataset(train_sequences, train_labels)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(test_sequences, test_labels)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training and validation
num_epochs = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

total_iterations = 0  # Iteration counter
for epoch in range(num_epochs):
    # Training phase
    model.train()
    total_loss = 0.0
    for batch_inputs, batch_labels in tqdm(train_data_loader, desc=f'Epoch {epoch + 1}/{num_epochs} - Training', position=0):
        total_iterations += 1
        batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = loss_function(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Output training loss and perplexity every 100 iterations
        if total_iterations % 100 == 0:
            avg_loss = total_loss / 100
            perplexity = torch.exp(torch.tensor(avg_loss))
            tqdm.write(f'Iteration {total_iterations}, Training Loss: {avg_loss:.4f}, Perplexity: {perplexity:.4f}')
            total_loss = 0.0

    # Validation phase
    model.eval()
    with torch.no_grad():
        total_correct = 0
        total_samples = 0
        for batch_inputs, batch_labels in tqdm(test_data_loader, desc=f'Epoch {epoch + 1}/{num_epochs} - Validation'):
            batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
            outputs = model(batch_inputs)
            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == batch_labels).sum().item()
            total_samples += batch_labels.size(0)

        accuracy = total_correct / total_samples
        tqdm.write(f'Validation Accuracy: {accuracy * 100:.2f}%')

print('Training complete.')


  train_sequences = torch.tensor(train_sequences, dtype=torch.long)
  train_labels = torch.tensor(train_labels, dtype=torch.long)
  test_sequences = torch.tensor(test_sequences, dtype=torch.long)
  test_labels = torch.tensor(test_labels, dtype=torch.long)
Epoch 1/3 - Training:   3%|▎         | 102/3625 [00:06<03:04, 19.09it/s]

Iteration 100, Training Loss: 7.5969, Perplexity: 1991.9426


Epoch 1/3 - Training:   6%|▌         | 202/3625 [00:12<02:59, 19.12it/s]

Iteration 200, Training Loss: 6.6152, Perplexity: 746.3846


Epoch 1/3 - Training:   8%|▊         | 303/3625 [00:18<05:09, 10.74it/s]

Iteration 300, Training Loss: 6.4147, Perplexity: 610.7785


Epoch 1/3 - Training:  11%|█         | 403/3625 [00:26<03:00, 17.89it/s]

Iteration 400, Training Loss: 6.2830, Perplexity: 535.3988


Epoch 1/3 - Training:  14%|█▍        | 502/3625 [00:34<07:04,  7.36it/s]

Iteration 500, Training Loss: 6.1567, Perplexity: 471.8893


Epoch 1/3 - Training:  17%|█▋        | 602/3625 [00:42<02:54, 17.36it/s]

Iteration 600, Training Loss: 6.0824, Perplexity: 438.0866


Epoch 1/3 - Training:  19%|█▉        | 703/3625 [00:51<08:04,  6.03it/s]

Iteration 700, Training Loss: 6.0357, Perplexity: 418.0971


Epoch 1/3 - Training:  22%|██▏       | 801/3625 [01:00<04:16, 11.03it/s]

Iteration 800, Training Loss: 5.9399, Perplexity: 379.8969


Epoch 1/3 - Training:  25%|██▍       | 903/3625 [01:09<02:26, 18.54it/s]

Iteration 900, Training Loss: 5.8952, Perplexity: 363.3045


Epoch 1/3 - Training:  28%|██▊       | 1002/3625 [01:17<03:41, 11.82it/s]

Iteration 1000, Training Loss: 5.8417, Perplexity: 344.3690


Epoch 1/3 - Training:  30%|███       | 1103/3625 [01:25<02:14, 18.71it/s]

Iteration 1100, Training Loss: 5.7882, Perplexity: 326.4131


Epoch 1/3 - Training:  33%|███▎      | 1202/3625 [01:34<03:36, 11.17it/s]

Iteration 1200, Training Loss: 5.7523, Perplexity: 314.9085


Epoch 1/3 - Training:  36%|███▌      | 1302/3625 [01:41<02:09, 17.90it/s]

Iteration 1300, Training Loss: 5.6863, Perplexity: 294.8065


Epoch 1/3 - Training:  39%|███▊      | 1402/3625 [01:50<03:24, 10.87it/s]

Iteration 1400, Training Loss: 5.6355, Perplexity: 280.2106


Epoch 1/3 - Training:  41%|████▏     | 1502/3625 [01:58<02:41, 13.14it/s]

Iteration 1500, Training Loss: 5.5641, Perplexity: 260.8835


Epoch 1/3 - Training:  44%|████▍     | 1600/3625 [02:06<01:50, 18.34it/s]

Iteration 1600, Training Loss: 5.5339, Perplexity: 253.1357


Epoch 1/3 - Training:  47%|████▋     | 1702/3625 [02:15<01:53, 16.91it/s]

Iteration 1700, Training Loss: 5.4713, Perplexity: 237.7599


Epoch 1/3 - Training:  50%|████▉     | 1801/3625 [02:22<01:40, 18.15it/s]

Iteration 1800, Training Loss: 5.4179, Perplexity: 225.3998


Epoch 1/3 - Training:  52%|█████▏    | 1903/3625 [02:31<01:52, 15.28it/s]

Iteration 1900, Training Loss: 5.3633, Perplexity: 213.4359


Epoch 1/3 - Training:  55%|█████▌    | 2003/3625 [02:39<01:49, 14.81it/s]

Iteration 2000, Training Loss: 5.3220, Perplexity: 204.7865


Epoch 1/3 - Training:  58%|█████▊    | 2103/3625 [02:48<01:48, 13.98it/s]

Iteration 2100, Training Loss: 5.2738, Perplexity: 195.1519


Epoch 1/3 - Training:  61%|██████    | 2202/3625 [02:56<01:20, 17.69it/s]

Iteration 2200, Training Loss: 5.2252, Perplexity: 185.8950


Epoch 1/3 - Training:  64%|██████▎   | 2303/3625 [03:04<02:00, 10.93it/s]

Iteration 2300, Training Loss: 5.1595, Perplexity: 174.0737


Epoch 1/3 - Training:  66%|██████▋   | 2403/3625 [03:12<01:07, 18.03it/s]

Iteration 2400, Training Loss: 5.1105, Perplexity: 165.7594


Epoch 1/3 - Training:  69%|██████▉   | 2501/3625 [03:21<03:11,  5.87it/s]

Iteration 2500, Training Loss: 5.0671, Perplexity: 158.7203


Epoch 1/3 - Training:  72%|███████▏  | 2603/3625 [03:28<01:00, 16.81it/s]

Iteration 2600, Training Loss: 5.0159, Perplexity: 150.7880


Epoch 1/3 - Training:  75%|███████▍  | 2702/3625 [03:36<00:49, 18.54it/s]

Iteration 2700, Training Loss: 4.9687, Perplexity: 143.8364


Epoch 1/3 - Training:  77%|███████▋  | 2803/3625 [03:45<00:52, 15.67it/s]

Iteration 2800, Training Loss: 4.9266, Perplexity: 137.9134


Epoch 1/3 - Training:  80%|████████  | 2903/3625 [03:53<00:39, 18.48it/s]

Iteration 2900, Training Loss: 4.8686, Perplexity: 130.1435


Epoch 1/3 - Training:  83%|████████▎ | 3002/3625 [04:02<01:04,  9.63it/s]

Iteration 3000, Training Loss: 4.8245, Perplexity: 124.5181


Epoch 1/3 - Training:  86%|████████▌ | 3103/3625 [04:09<00:28, 18.46it/s]

Iteration 3100, Training Loss: 4.7850, Perplexity: 119.7050


Epoch 1/3 - Training:  88%|████████▊ | 3203/3625 [04:18<00:34, 12.25it/s]

Iteration 3200, Training Loss: 4.7538, Perplexity: 116.0269


Epoch 1/3 - Training:  91%|█████████ | 3303/3625 [04:26<00:17, 17.89it/s]

Iteration 3300, Training Loss: 4.6955, Perplexity: 109.4490


Epoch 1/3 - Training:  94%|█████████▍| 3403/3625 [04:35<00:23,  9.41it/s]

Iteration 3400, Training Loss: 4.6663, Perplexity: 106.3082


Epoch 1/3 - Training:  97%|█████████▋| 3502/3625 [04:42<00:08, 14.06it/s]

Iteration 3500, Training Loss: 4.6127, Perplexity: 100.7510


Epoch 1/3 - Training:  99%|█████████▉| 3603/3625 [04:51<00:02,  7.88it/s]

Iteration 3600, Training Loss: 4.5558, Perplexity: 95.1810


Epoch 1/3 - Training: 100%|██████████| 3625/3625 [04:52<00:00, 12.37it/s]
Epoch 1/3 - Validation: 100%|██████████| 907/907 [00:49<00:00, 18.41it/s]


Validation Accuracy: 29.30%


Epoch 2/3 - Training:   2%|▏         | 77/3625 [00:06<04:21, 13.59it/s]

Iteration 3700, Training Loss: 3.3021, Perplexity: 27.1709


Epoch 2/3 - Training:   5%|▍         | 177/3625 [00:15<03:50, 14.95it/s]

Iteration 3800, Training Loss: 4.3666, Perplexity: 78.7785


Epoch 2/3 - Training:   8%|▊         | 277/3625 [00:24<03:46, 14.78it/s]

Iteration 3900, Training Loss: 4.3447, Perplexity: 77.0708


Epoch 2/3 - Training:  10%|█         | 378/3625 [00:32<04:16, 12.64it/s]

Iteration 4000, Training Loss: 4.2992, Perplexity: 73.6429


Epoch 2/3 - Training:  13%|█▎        | 478/3625 [00:40<02:56, 17.83it/s]

Iteration 4100, Training Loss: 4.2712, Perplexity: 71.6072


Epoch 2/3 - Training:  16%|█▌        | 578/3625 [00:49<05:20,  9.51it/s]

Iteration 4200, Training Loss: 4.2431, Perplexity: 69.6264


Epoch 2/3 - Training:  19%|█▊        | 678/3625 [00:57<02:50, 17.24it/s]

Iteration 4300, Training Loss: 4.2170, Perplexity: 67.8290


Epoch 2/3 - Training:  21%|██▏       | 774/3625 [01:07<04:01, 11.82it/s]

Iteration 4400, Training Loss: 4.1758, Perplexity: 65.0932


Epoch 2/3 - Training:  24%|██▍       | 877/3625 [01:18<03:24, 13.42it/s]

Iteration 4500, Training Loss: 4.1481, Perplexity: 63.3133


Epoch 2/3 - Training:  27%|██▋       | 975/3625 [01:26<02:26, 18.09it/s]

Iteration 4600, Training Loss: 4.1069, Perplexity: 60.7573


Epoch 2/3 - Training:  30%|██▉       | 1077/3625 [01:35<03:04, 13.82it/s]

Iteration 4700, Training Loss: 4.0882, Perplexity: 59.6339


Epoch 2/3 - Training:  32%|███▏      | 1178/3625 [01:42<02:11, 18.56it/s]

Iteration 4800, Training Loss: 4.0625, Perplexity: 58.1221


Epoch 2/3 - Training:  35%|███▌      | 1277/3625 [01:51<02:49, 13.84it/s]

Iteration 4900, Training Loss: 4.0203, Perplexity: 55.7200


Epoch 2/3 - Training:  38%|███▊      | 1377/3625 [01:59<02:37, 14.28it/s]

Iteration 5000, Training Loss: 4.0053, Perplexity: 54.8904


Epoch 2/3 - Training:  41%|████      | 1477/3625 [02:08<03:52,  9.23it/s]

Iteration 5100, Training Loss: 3.9773, Perplexity: 53.3701


Epoch 2/3 - Training:  44%|████▎     | 1577/3625 [02:16<01:54, 17.86it/s]

Iteration 5200, Training Loss: 3.9356, Perplexity: 51.1919


Epoch 2/3 - Training:  46%|████▋     | 1677/3625 [02:25<03:59,  8.13it/s]

Iteration 5300, Training Loss: 3.9192, Perplexity: 50.3624


Epoch 2/3 - Training:  49%|████▉     | 1778/3625 [02:33<01:50, 16.77it/s]

Iteration 5400, Training Loss: 3.8968, Perplexity: 49.2435


Epoch 2/3 - Training:  52%|█████▏    | 1876/3625 [02:41<01:57, 14.93it/s]

Iteration 5500, Training Loss: 3.8764, Perplexity: 48.2519


Epoch 2/3 - Training:  55%|█████▍    | 1977/3625 [02:50<01:44, 15.76it/s]

Iteration 5600, Training Loss: 3.8398, Perplexity: 46.5182


Epoch 2/3 - Training:  57%|█████▋    | 2077/3625 [02:58<01:24, 18.26it/s]

Iteration 5700, Training Loss: 3.8269, Perplexity: 45.9208


Epoch 2/3 - Training:  60%|██████    | 2177/3625 [03:06<01:57, 12.37it/s]

Iteration 5800, Training Loss: 3.7819, Perplexity: 43.9007


Epoch 2/3 - Training:  63%|██████▎   | 2277/3625 [03:14<01:13, 18.22it/s]

Iteration 5900, Training Loss: 3.7580, Perplexity: 42.8634


Epoch 2/3 - Training:  66%|██████▌   | 2378/3625 [03:23<02:06,  9.82it/s]

Iteration 6000, Training Loss: 3.7326, Perplexity: 41.7866


Epoch 2/3 - Training:  68%|██████▊   | 2478/3625 [03:30<01:04, 17.79it/s]

Iteration 6100, Training Loss: 3.7157, Perplexity: 41.0881


Epoch 2/3 - Training:  71%|███████   | 2578/3625 [03:40<01:52,  9.30it/s]

Iteration 6200, Training Loss: 3.6917, Perplexity: 40.1150


Epoch 2/3 - Training:  74%|███████▍  | 2678/3625 [03:47<00:54, 17.43it/s]

Iteration 6300, Training Loss: 3.6721, Perplexity: 39.3346


Epoch 2/3 - Training:  77%|███████▋  | 2778/3625 [03:56<01:50,  7.68it/s]

Iteration 6400, Training Loss: 3.6490, Perplexity: 38.4359


Epoch 2/3 - Training:  79%|███████▉  | 2878/3625 [04:04<00:59, 12.64it/s]

Iteration 6500, Training Loss: 3.6207, Perplexity: 37.3635


Epoch 2/3 - Training:  82%|████████▏ | 2978/3625 [04:11<00:34, 18.57it/s]

Iteration 6600, Training Loss: 3.6038, Perplexity: 36.7383


Epoch 2/3 - Training:  85%|████████▍ | 3078/3625 [04:20<00:37, 14.54it/s]

Iteration 6700, Training Loss: 3.5894, Perplexity: 36.2125


Epoch 2/3 - Training:  88%|████████▊ | 3178/3625 [04:28<00:24, 18.61it/s]

Iteration 6800, Training Loss: 3.5556, Perplexity: 35.0099


Epoch 2/3 - Training:  90%|█████████ | 3278/3625 [04:37<00:27, 12.68it/s]

Iteration 6900, Training Loss: 3.5242, Perplexity: 33.9264


Epoch 2/3 - Training:  93%|█████████▎| 3378/3625 [04:45<00:17, 14.16it/s]

Iteration 7000, Training Loss: 3.5091, Perplexity: 33.4174


Epoch 2/3 - Training:  96%|█████████▌| 3478/3625 [04:54<00:14, 10.42it/s]

Iteration 7100, Training Loss: 3.4865, Perplexity: 32.6716


Epoch 2/3 - Training:  99%|█████████▊| 3578/3625 [05:03<00:02, 17.17it/s]

Iteration 7200, Training Loss: 3.4686, Perplexity: 32.0918


Epoch 2/3 - Training: 100%|██████████| 3625/3625 [05:07<00:00, 11.79it/s]
Epoch 2/3 - Validation: 100%|██████████| 907/907 [00:48<00:00, 18.59it/s]


Validation Accuracy: 41.07%


Epoch 3/3 - Training:   1%|▏         | 52/3625 [00:04<04:16, 13.91it/s]

Iteration 7300, Training Loss: 1.6378, Perplexity: 5.1439


Epoch 3/3 - Training:   4%|▍         | 150/3625 [00:12<03:19, 17.45it/s]

Iteration 7400, Training Loss: 3.2711, Perplexity: 26.3397


Epoch 3/3 - Training:   7%|▋         | 252/3625 [00:20<04:02, 13.90it/s]

Iteration 7500, Training Loss: 3.2559, Perplexity: 25.9442


Epoch 3/3 - Training:  10%|▉         | 352/3625 [00:29<02:58, 18.37it/s]

Iteration 7600, Training Loss: 3.2362, Perplexity: 25.4358


Epoch 3/3 - Training:  12%|█▏        | 452/3625 [00:37<04:40, 11.31it/s]

Iteration 7700, Training Loss: 3.2225, Perplexity: 25.0909


Epoch 3/3 - Training:  15%|█▌        | 552/3625 [00:45<02:54, 17.59it/s]

Iteration 7800, Training Loss: 3.2231, Perplexity: 25.1049


Epoch 3/3 - Training:  18%|█▊        | 653/3625 [00:54<04:35, 10.79it/s]

Iteration 7900, Training Loss: 3.1961, Perplexity: 24.4377


Epoch 3/3 - Training:  21%|██        | 752/3625 [01:02<02:43, 17.62it/s]

Iteration 8000, Training Loss: 3.1928, Perplexity: 24.3574


Epoch 3/3 - Training:  24%|██▎       | 852/3625 [01:11<04:59,  9.25it/s]

Iteration 8100, Training Loss: 3.1701, Perplexity: 23.8103


Epoch 3/3 - Training:  26%|██▋       | 952/3625 [01:18<03:29, 12.76it/s]

Iteration 8200, Training Loss: 3.1493, Perplexity: 23.3189


Epoch 3/3 - Training:  29%|██▉       | 1052/3625 [01:26<02:18, 18.58it/s]

Iteration 8300, Training Loss: 3.1443, Perplexity: 23.2024


Epoch 3/3 - Training:  32%|███▏      | 1152/3625 [01:35<02:41, 15.32it/s]

Iteration 8400, Training Loss: 3.1253, Perplexity: 22.7667


Epoch 3/3 - Training:  35%|███▍      | 1252/3625 [01:43<02:10, 18.17it/s]

Iteration 8500, Training Loss: 3.1243, Perplexity: 22.7434


Epoch 3/3 - Training:  37%|███▋      | 1352/3625 [01:52<03:02, 12.42it/s]

Iteration 8600, Training Loss: 3.1083, Perplexity: 22.3833


Epoch 3/3 - Training:  40%|████      | 1452/3625 [02:00<02:29, 14.54it/s]

Iteration 8700, Training Loss: 3.1003, Perplexity: 22.2054


Epoch 3/3 - Training:  43%|████▎     | 1553/3625 [02:08<02:46, 12.46it/s]

Iteration 8800, Training Loss: 3.0754, Perplexity: 21.6593


Epoch 3/3 - Training:  46%|████▌     | 1653/3625 [02:17<01:51, 17.70it/s]

Iteration 8900, Training Loss: 3.0568, Perplexity: 21.2592


Epoch 3/3 - Training:  48%|████▊     | 1752/3625 [02:25<04:04,  7.67it/s]

Iteration 9000, Training Loss: 3.0531, Perplexity: 21.1802


Epoch 3/3 - Training:  51%|█████     | 1853/3625 [02:33<01:42, 17.27it/s]

Iteration 9100, Training Loss: 3.0316, Perplexity: 20.7302


Epoch 3/3 - Training:  54%|█████▍    | 1951/3625 [02:42<06:01,  4.63it/s]

Iteration 9200, Training Loss: 3.0308, Perplexity: 20.7130


Epoch 3/3 - Training:  57%|█████▋    | 2053/3625 [02:50<01:38, 15.88it/s]

Iteration 9300, Training Loss: 3.0062, Perplexity: 20.2111


Epoch 3/3 - Training:  59%|█████▉    | 2153/3625 [02:58<01:20, 18.40it/s]

Iteration 9400, Training Loss: 3.0005, Perplexity: 20.0959


Epoch 3/3 - Training:  62%|██████▏   | 2253/3625 [03:06<01:32, 14.89it/s]

Iteration 9500, Training Loss: 2.9816, Perplexity: 19.7190


Epoch 3/3 - Training:  65%|██████▍   | 2353/3625 [03:15<01:12, 17.58it/s]

Iteration 9600, Training Loss: 2.9699, Perplexity: 19.4897


Epoch 3/3 - Training:  68%|██████▊   | 2453/3625 [03:23<01:55, 10.12it/s]

Iteration 9700, Training Loss: 2.9500, Perplexity: 19.1063


Epoch 3/3 - Training:  70%|███████   | 2553/3625 [03:31<01:01, 17.45it/s]

Iteration 9800, Training Loss: 2.9364, Perplexity: 18.8478


Epoch 3/3 - Training:  73%|███████▎  | 2651/3625 [03:41<02:14,  7.23it/s]

Iteration 9900, Training Loss: 2.9329, Perplexity: 18.7826


Epoch 3/3 - Training:  76%|███████▌  | 2753/3625 [03:49<00:49, 17.45it/s]

Iteration 10000, Training Loss: 2.9261, Perplexity: 18.6541


Epoch 3/3 - Training:  79%|███████▊  | 2849/3625 [03:57<00:42, 18.26it/s]

Iteration 10100, Training Loss: 2.9066, Perplexity: 18.2942


Epoch 3/3 - Training:  81%|████████▏ | 2953/3625 [04:06<00:49, 13.54it/s]

Iteration 10200, Training Loss: 2.8800, Perplexity: 17.8141


Epoch 3/3 - Training:  84%|████████▍ | 3051/3625 [04:13<00:32, 17.53it/s]

Iteration 10300, Training Loss: 2.8831, Perplexity: 17.8701


Epoch 3/3 - Training:  87%|████████▋ | 3153/3625 [04:22<00:31, 14.87it/s]

Iteration 10400, Training Loss: 2.8606, Perplexity: 17.4720


Epoch 3/3 - Training:  90%|████████▉ | 3253/3625 [04:30<00:20, 18.30it/s]

Iteration 10500, Training Loss: 2.8573, Perplexity: 17.4136


Epoch 3/3 - Training:  92%|█████████▏| 3353/3625 [04:39<00:19, 13.61it/s]

Iteration 10600, Training Loss: 2.8409, Perplexity: 17.1306


Epoch 3/3 - Training:  95%|█████████▌| 3451/3625 [04:47<00:12, 13.70it/s]

Iteration 10700, Training Loss: 2.8229, Perplexity: 16.8255


Epoch 3/3 - Training:  98%|█████████▊| 3553/3625 [04:55<00:06, 10.67it/s]

Iteration 10800, Training Loss: 2.8127, Perplexity: 16.6546


Epoch 3/3 - Training: 100%|██████████| 3625/3625 [05:02<00:00, 11.99it/s]
Epoch 3/3 - Validation: 100%|██████████| 907/907 [00:48<00:00, 18.62it/s]

Validation Accuracy: 49.74%
Training complete.





In [8]:
vocab = list(vocab)

In [9]:
vocab[0]

'下论'

In [10]:
torch.save(model.state_dict(), 'model.pth')

In [23]:
import torch
import random
from torch.nn import functional as F

model.eval()

org = [
    "唉哟 P.Q我来自上海的兄弟",
    "What’s up",
     "我的新疆homie",
     "Are you ready",
     "I’m ready",
    "Let’s go",
    "初次去上海只为见心爱的她",
    "南京路上相拥",
    "听跨年钟声嘀嗒",
    "忍不住狂甩对方嘴唇在那一刹",
    "更幸福的是",
    "现在看我们孩子长大",
    "街上都是迪丽热巴和古力娜扎",
    "在乌鲁木齐每一天的太阳",
    "So hot",
    "忘不了那抓饭的香和大盘鸡的辣",
     "我喝了十六次的石榴汁在大巴扎",
     "那年上海夺了冠",
     "狗哥微博发过赞",
     "P.Q带我吃的串",
    "象哥的酒量赢一半"
]

# Function to convert generated indices back to words
def index_to_word(index, vocab):
    return vocab[index]

# Function to generate sentences
def generate_sentence(model, initial_word, vocab, max_length=5):
    model.eval()
    # model.to('cpu')
    # Convert the initial word to an index
    current_word = torch.tensor([word_to_index[initial_word]], dtype=torch.long).unsqueeze(0).to(device)

    generated_sentence = [initial_word]

    with torch.no_grad():
        for _ in range(max_length):
            output = model(current_word)
            probabilities = F.softmax(output, dim=1).squeeze(0)
            predicted_index = torch.multinomial(probabilities, 1).item()
            predicted_word = index_to_word(predicted_index, vocab)

            if 'EOS' in predicted_word:
                break

            generated_sentence.append(predicted_word)
            current_word = torch.tensor([predicted_index], dtype=torch.long).unsqueeze(0).to(device)

    return ' '.join(generated_sentence)

# Test sentence generation
generated_sentences = []
for org_sentence in org:
    initial_word = org_sentence[0]  # Use the first character of each sentence as the initial word
    generated_sentence = generate_sentence(model, initial_word, vocab)
    generated_sentences.append(generated_sentence)

# Print the generated sentences
for org_sentence, gen_sentence in zip(org, generated_sentences):
    print(f"Original: {org_sentence}")
    print(f"Generated: {gen_sentence}")
    print("-" * 50)


Original: 唉哟 P.Q我来自上海的兄弟
Generated: 唉 千奇百怪 相处 阴晴圆 浓郁 Whiskey
--------------------------------------------------
Original: What’s up
Generated: W koralmaysan
--------------------------------------------------
Original: 我的新疆homie
Generated: 我 握紧 卡卡 先人 四合
--------------------------------------------------
Original: Are you ready
Generated: A 喷气式 我梦里 思绪 太过火
--------------------------------------------------
Original: I’m ready
Generated: I 胡言
--------------------------------------------------
Original: Let’s go
Generated: L 笨重
--------------------------------------------------
Original: 初次去上海只为见心爱的她
Generated: 初 脑洞 鲜 静静的 Do GogoBoi
--------------------------------------------------
Original: 南京路上相拥
Generated: 南 走險 里路 淫荡 照料 视频
--------------------------------------------------
Original: 听跨年钟声嘀嗒
Generated: 听 牺牲品 走私犯 fendi 盘膝 全班
--------------------------------------------------
Original: 忍不住狂甩对方嘴唇在那一刹
Generated: 忍 rolling 摧残 DIZZY 除病 不择手段
--------------------------------------------------
Or

In [25]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Tokenize the original and generated sentences
tokenized_org = [nltk.word_tokenize(sentence.lower()) for sentence in org]
tokenized_gen = [nltk.word_tokenize(sentence.lower()) for sentence in generated_sentences]

# Calculate BLEU score with lower n-gram order (e.g., 1-gram)
bleu_scores = [sentence_bleu([tokens], generated_tokens, weights=(4, 0, 0, 0)) for tokens, generated_tokens in zip(tokenized_org, tokenized_gen)]

# Print BLEU scores
for org_sentence, gen_sentence, bleu_score in zip(org, generated_sentences, bleu_scores):
    print(f"Original: {org_sentence}")
    print(f"Generated: {gen_sentence}")
    print(f"BLEU Score: {bleu_score:.4f}")
    print("-" * 50)


Original: 唉哟 P.Q我来自上海的兄弟
Generated: 唉 千奇百怪 相处 阴晴圆 浓郁 Whiskey
BLEU Score: 0.0000
--------------------------------------------------
Original: What’s up
Generated: W koralmaysan
BLEU Score: 0.0000
--------------------------------------------------
Original: 我的新疆homie
Generated: 我 握紧 卡卡 先人 四合
BLEU Score: 0.0000
--------------------------------------------------
Original: Are you ready
Generated: A 喷气式 我梦里 思绪 太过火
BLEU Score: 0.0000
--------------------------------------------------
Original: I’m ready
Generated: I 胡言
BLEU Score: 0.0230
--------------------------------------------------
Original: Let’s go
Generated: L 笨重
BLEU Score: 0.0000
--------------------------------------------------
Original: 初次去上海只为见心爱的她
Generated: 初 脑洞 鲜 静静的 Do GogoBoi
BLEU Score: 0.0000
--------------------------------------------------
Original: 南京路上相拥
Generated: 南 走險 里路 淫荡 照料 视频
BLEU Score: 0.0000
--------------------------------------------------
Original: 听跨年钟声嘀嗒
Generated: 听 牺牲品 走私犯 fendi 盘膝 全班
BLEU Score: 0.

In [22]:
tokenized_org

[['唉哟', 'p.q我来自上海的兄弟'],
 ['what', '’', 's', 'up'],
 ['我的新疆homie'],
 ['are', 'you', 'ready'],
 ['i', '’', 'm', 'ready'],
 ['let', '’', 's', 'go'],
 ['初次去上海只为见心爱的她'],
 ['南京路上相拥'],
 ['听跨年钟声嘀嗒'],
 ['忍不住狂甩对方嘴唇在那一刹'],
 ['更幸福的是'],
 ['现在看我们孩子长大'],
 ['街上都是迪丽热巴和古力娜扎'],
 ['在乌鲁木齐每一天的太阳'],
 ['so', 'hot'],
 ['忘不了那抓饭的香和大盘鸡的辣'],
 ['我喝了十六次的石榴汁在大巴扎'],
 ['那年上海夺了冠'],
 ['狗哥微博发过赞'],
 ['p.q带我吃的串'],
 ['象哥的酒量赢一半']]

In [12]:
from sklearn.model_selection import train_test_split

# 假设 X_padded 和 y 是你的完整数据集
# X_padded: 输入序列
# y: 目标标签

# 设置切分比例，例如 80% 训练数据，20% 验证数据
train_size = 0.8

# 切分数据
X_train, X_val, y_train, y_val = train_test_split(X_padded, y, train_size=train_size, random_state=42)

# 打印切分后数据的大小
print("训练集大小:", X_train.shape)
print("验证集大小:", X_val.shape)


NameError: ignored

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

class LyricsDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

# LSTM 模型
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        output, (hidden, cell) = self.lstm(x)
        output = self.fc(output[:, -1, :])
        return output

In [None]:
vocab_size = len(word_to_index)
embedding_dim = 512  # 嵌入维度
hidden_dim = 256  # LSTM 隐藏层维度
output_dim = vocab_size  # 输出维度与词汇表大小相同

In [None]:
vocab_size

In [None]:
# 准备数据加载器
batch_size = 1024
# 准备数据加载器
train_dataset = LyricsDataset(X_train, y_train)
val_dataset = LyricsDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 实例化模型
model = LSTMModel(vocab_size+1, embedding_dim, hidden_dim, output_dim)
model.to(device)

# 定义损失函数和优化器
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10

# 训练和评估模型
def evaluate_model(model, data_loader, loss_function):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = loss_function(outputs, targets)
            total_loss += loss.item()
    return total_loss / len(data_loader)

# from tqdm import tqdm

# 在训练循环外部初始化 tqdm 进度条
# epoch_bar = tqdm(range(num_epochs), desc="Epochs")

from tqdm import tqdm
# epoch_bar = tqdm(range(num_epochs), total=num_epochs,desc="Epochs", position=0)
# 在训练循环外部初始化 tqdm 进度条
for epoch in range(num_epochs):
    model.train()

    # 在 train_loader 外部初始化 tqdm 进度条
    train_iterator = tqdm(enumerate(train_loader), total=len(train_loader), desc="Training Iterations", position=0)

    for i,(inputs, targets) in train_iterator:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            # 打印信息的部分保留，如果需要的话
            train_iterator.set_postfix({'desc':f"Epoch {epoch+1}, Iteration {i}, Train Loss: {loss},  Perplexity: {np.exp(loss.item())}"})

    # 关闭 train_iterator 进度条
#     train_iterator.close()

    # 在验证集上评估模型
    # train_loss = evaluate_model(model, train_loader, loss_function)
    val_loss = evaluate_model(model, val_loader, loss_function)

    # 打印信息的部分保留，如果需要的话
    print(f"Epoch {epoch+1}/{num_epochs}, Val Loss: {val_loss} ,Perplexity: {np.exp(val_loss)}")

In [None]:
!pip install tqdm


In [None]:
text[:30]

### LSTM

In [None]:
import jieba
import torch
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader

def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

def preprocess_text(text):
    words = list(jieba.cut(text))
    return words

def create_vocab(words):
    word_counts = Counter(words)
    vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}
    return word_to_idx, idx_to_word

def create_sequences(words, word_to_idx, seq_length=20):
    sequences = []
    labels = []
    for i in range(len(words) - seq_length):
        seq = words[i:i+seq_length]
        label = words[i+seq_length]
        sequences.append([word_to_idx[word] for word in seq])
        labels.append(word_to_idx[label])
    return sequences, labels

# 读取和预处理文本
file_path = 'word.txt'  # 替换为你的文件路径
text = read_file(file_path)
words = preprocess_text(text)

# 创建词汇表和索引
word_to_idx, idx_to_word = create_vocab(words)

# 创建序列和标签
seq_length = 20  # 可根据需要调整序列长度
sequences, labels = create_sequences(words, word_to_idx, seq_length)

# 将序列和标签转换为 PyTorch 张量
sequences = torch.tensor(sequences, dtype=torch.long)
labels = torch.tensor(labels, dtype=torch.long)


In [None]:
labels,sequences

In [None]:
vocab_size = len(idx_to_word)
print("Vocabulary size:", vocab_size)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

# 假设您已经有了处理过的数据：'sequences' 和 'labels'
# sequences: 输入序列的集合
# labels: 每个序列的下一个词的标签

class LyricsDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

# LSTM 模型
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        output, (hidden, cell) = self.lstm(x)
        output = self.fc(output[:, -1, :])
        return output

# 设置超参数
# vocab_size = vocab_size  # 假设词汇表大小为 10000
embedding_dim = 128  # 嵌入维度
hidden_dim = 256  # LSTM 隐藏层维度
output_dim = vocab_size  # 输出维度与词汇表大小相同

# 实例化模型
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)

# 定义损失函数和优化器
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 准备数据加载器
batch_size = 64
dataset = LyricsDataset(sequences, labels)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 训练模型
num_epochs = 10
for epoch in range(num_epochs):
    for inputs, targets in data_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
