In [None]:
import re
from collections import defaultdict

class Tokenizer:
    """Word-level tokenizer that maps lowercase words to integer ids.

    Ids are assigned in order of first appearance, starting at 1.

    Attributes:
        word_index: dict mapping each known word to its id.
        index_word: dict mapping each id back to its word.
        current_index: id that the next previously unseen word will receive.
    """

    def __init__(self):
        self.current_index = 1
        self.index_word = {}
        self.word_index = {}

    def tokenize(self, text):
        """Lowercase ``text`` and return its runs of word characters as a list."""
        return re.findall(r'\b\w+\b', text.lower())

    def fit_on_texts(self, texts):
        """Extend the vocabulary with every unseen word found in ``texts``.

        Args:
            texts: iterable of strings.

        Returns:
            None. The vocabulary attributes are updated in place.
        """
        for text in texts:
            for word in self.tokenize(text):
                if word in self.word_index:
                    continue  # already has an id
                new_id = self.current_index
                self.word_index[word] = new_id
                self.index_word[new_id] = word
                self.current_index = new_id + 1

    def texts_to_sequences(self, texts):
        """Encode each text as a list of word ids.

        Words missing from the vocabulary are skipped, so a sequence may be
        shorter than its text (or empty).

        Args:
            texts: iterable of strings.

        Returns:
            A list with one list of ids per input text.
        """
        return [
            [self.word_index[word] for word in self.tokenize(text) if word in self.word_index]
            for text in texts
        ]

    def sequences_to_texts(self, sequences):
        """Decode each id sequence into a space-separated string.

        Args:
            sequences: iterable of iterables of ids.

        Returns:
            A list with one string per input sequence.

        Raises:
            KeyError: if an id is not present in ``index_word``.
        """
        return [
            ' '.join(self.index_word[word_id] for word_id in sequence)
            for sequence in sequences
        ]

In [None]:
# fit_on_texts only updates the tokenizer in place and has no return value,
# so this prints "None".
print(
    Tokenizer().fit_on_texts([
        "I love machine learning",
        "Machine learning is fun",
    ])
)

None


In [None]:
class BigramLanguageModel:
    """Bigram model built from raw successor counts.

    Attributes:
        bigram_counts: nested mapping; ``bigram_counts[a][b]`` is how many
            times ``b`` directly followed ``a`` in the training sequences.
        total_counts: ``total_counts[a]`` is how many bigrams start with ``a``.
    """

    def __init__(self):
        self.total_counts = defaultdict(int)
        self.bigram_counts = defaultdict(lambda: defaultdict(int))

    def train(self, sequences):
        """Accumulate counts for every adjacent pair in each sequence.

        Args:
            sequences: iterable of indexable sequences of hashable tokens.
                Sequences with fewer than two items contribute nothing.
        """
        for sequence in sequences:
            for position in range(1, len(sequence)):
                previous_token = sequence[position - 1]
                following_token = sequence[position]
                self.total_counts[previous_token] += 1
                self.bigram_counts[previous_token][following_token] += 1

    def predict_next_word(self, current_word):
        """Return the most frequent successor of ``current_word``.

        Ties go to the successor that was recorded first. Returns None when
        ``current_word`` has no recorded successor.
        """
        # .get() does not trigger the defaultdict factory, so an unseen
        # word is not inserted as a side effect of the lookup.
        followers = self.bigram_counts.get(current_word)
        if followers is None:
            return None
        return max(followers, key=lambda candidate: followers[candidate])

In [None]:
# Example corpus
texts = [
    "I love machine learning",
    "Machine learning is fun",
    "I love coding",
    "Coding is great"
]

# Initialize the Tokenizer and fit it on the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Convert the texts into sequences of word ids
sequences = tokenizer.texts_to_sequences(texts)

# Initialize the language model and train it on the sequences
model = BigramLanguageModel()
model.train(sequences)

# Test the model: look up the id of 'i', ask the model for the id that most
# often follows it, and map that id back to its word.
current_word = tokenizer.word_index['i']
next_word_index = model.predict_next_word(current_word)
next_word = tokenizer.index_word[next_word_index]

print(f"The next word after 'i' is '{next_word}'")

The next word after 'i' is 'love'


In [None]:
import random

class BigramLanguageModel:
    """Count-based bigram model with greedy text generation.

    Tokens may be any hashable values (this notebook uses integer word ids).

    Attributes:
        bigram_counts: nested mapping; ``bigram_counts[a][b]`` is how many
            times ``b`` directly followed ``a`` in the training sequences.
        total_counts: ``total_counts[a]`` is how many bigrams start with ``a``.
    """

    def __init__(self):
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.total_counts = defaultdict(int)

    def train(self, sequences):
        """Accumulate counts for every adjacent pair in each sequence.

        Args:
            sequences: iterable of indexable sequences of tokens. Sequences
                with fewer than two items contribute nothing.
        """
        for sequence in sequences:
            for i in range(len(sequence) - 1):
                current_word = sequence[i]
                next_word = sequence[i + 1]
                self.bigram_counts[current_word][next_word] += 1
                self.total_counts[current_word] += 1

    def predict_next_word(self, current_word):
        """Return the most frequent successor of ``current_word``.

        Ties go to the successor that was recorded first.

        Returns:
            The successor token, or None when ``current_word`` never appeared
            as the first item of a bigram.
        """
        # Membership test first: indexing the defaultdict directly would
        # insert an empty entry for an unseen word.
        if current_word not in self.bigram_counts:
            return None
        next_words = self.bigram_counts[current_word]
        return max(next_words, key=next_words.get)

    def generate_text(self, start_word, max_length=10):
        """Greedily extend ``start_word`` with its most likely successors.

        Generation stops when ``max_length`` tokens have been produced or
        when the current token has no recorded successor.

        Args:
            start_word: token to start from; it is the first item returned.
            max_length: upper bound on the number of tokens returned.

        Returns:
            A list of at most ``max_length`` tokens; empty when
            ``max_length`` is zero or negative.
        """
        # Without this guard the start word would be returned even though
        # the caller asked for no tokens at all.
        if max_length <= 0:
            return []
        text = [start_word]
        current_word = start_word
        for _ in range(max_length - 1):
            next_word = self.predict_next_word(current_word)
            if next_word is None:
                break
            text.append(next_word)
            current_word = next_word
        return text

In [None]:
def chat(model, tokenizer, input_text, max_length=10):
    """Produce a reply by continuing from the last known word of the input.

    The input is encoded with ``tokenizer.texts_to_sequences``; the last id
    of that encoding seeds ``model.generate_text``, and the generated ids
    are decoded with ``tokenizer.sequences_to_texts``.

    Args:
        model: object exposing ``generate_text(start_word, max_length)``.
        tokenizer: object exposing ``texts_to_sequences`` and
            ``sequences_to_texts``.
        input_text: the user's message.
        max_length: passed through to ``model.generate_text``.

    Returns:
        The decoded reply, or a fixed apology string when the encoded input
        is empty.
    """
    encoded_input = tokenizer.texts_to_sequences([input_text])[0]
    if not encoded_input:
        return "Sorry, I didn't understand that."
    start_word = encoded_input[-1]
    generated_sequence = model.generate_text(start_word, max_length)
    return tokenizer.sequences_to_texts([generated_sequence])[0]

# Training corpus for the chat demo.
texts = [
    "I love machine learning",
    "Machine learning is fun",
    "fun genshin impact",
    "I love coding",
    "Coding is great"
]

# Build the vocabulary from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Encode each text as a list of word ids.
sequences = tokenizer.texts_to_sequences(texts)

# Count bigrams over the encoded corpus.
model = BigramLanguageModel()
model.train(sequences)

# Ask for a reply seeded with the last known word of the user input.
user_input = "I"
response = chat(model, tokenizer, user_input)
print(f"User: {user_input}")
print(f"Bot: {response}")

User: I
Bot: i love machine learning is fun genshin impact
