In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Download the Wikipedia article and pull the Sheng/English word pairs out of
# its first table with the "wikitable" class.
url = "https://en.wikipedia.org/wiki/Sheng_slang#Etymology_and_history"
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()

soup = BeautifulSoup(response.content, "html.parser")

# find() returns None when nothing matches; check explicitly so the failure
# is a clear message rather than an AttributeError on the next line.
sheng_table = soup.find("table", {"class": "wikitable"})
if sheng_table is None:
    raise ValueError(f"No table with class 'wikitable' found at {url}")

sheng_data = []

for row in sheng_table.find_all("tr"):
    cols = row.find_all("td")
    # Keep only rows with exactly two <td> cells (word, translation); rows
    # with any other number of <td> cells are skipped.
    if len(cols) == 2:
        sheng_word = cols[0].text.strip()
        english_translation = cols[1].text.strip()
        sheng_data.append((sheng_word, english_translation))

print(sheng_data)

In [None]:
# Split the scraped (sheng, english) pairs into two parallel lists.
sheng_words = [sheng for sheng, _ in sheng_data]
english_translations = [english for _, english in sheng_data]

# Pair the two columns back up element by element for the later cells.
labeled_data = [*zip(sheng_words, english_translations)]


In [None]:
def preprocess_text(text):
    """Normalize a word or phrase before it is vectorized.

    Every character listed in ``string.punctuation`` is deleted, the remainder
    is split on whitespace, each token is lowercased, and the tokens are
    joined back together with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; empty when the input holds nothing but
        punctuation and/or whitespace.
    """
    without_punctuation = "".join(
        char for char in text if char not in string.punctuation
    )
    return " ".join(word.lower() for word in without_punctuation.split())

In [None]:
import string
# Run both halves of every labeled pair through preprocess_text so the Sheng
# side and the English side are normalized the same way.
preprocessed_data = [
    (preprocess_text(sheng_word), preprocess_text(english_translation))
    for sheng_word, english_translation in labeled_data
]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the pairs for testing. random_state is fixed so the
# shuffle is seeded the same way on every run.
X_train, X_test, y_train, y_test = train_test_split(
    [sheng for sheng, _ in preprocessed_data],      # inputs: Sheng text
    [english for _, english in preprocessed_data],  # targets: English text
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: learn the vocabulary from the training inputs only,
# then encode both splits with that same fitted vectorizer.
# NOTE: X_train / X_test are rebound from text lists to count matrices here,
# so re-run the split cell before re-running this one.
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train, X_test = vectorizer.transform(X_train), vectorizer.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model that maps the bag-of-words features to
# the English translation strings used as class labels.
classifier = MultinomialNB().fit(X_train, y_train)
classifier  # bare last expression: the notebook still displays the fitted estimator

In [None]:
from sklearn.metrics import accuracy_score
# Evaluate the classifier on the held-out testing data.
# Predict one translation label per test input.
y_pred = classifier.predict(X_test)
# Score the predictions against the expected translation strings in y_test.
# NOTE(review): every distinct translation string acts as its own class, so a
# held-out translation that never occurs in y_train presumably cannot be
# predicted; expect a low score and confirm this evaluation is intended.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Inference helper: wraps preprocessing, vectorizing and prediction in one call.
def translate_sheng_to_english(sheng_word):
    """Predict the English translation for a Sheng word or phrase.

    The input is cleaned with ``preprocess_text``, encoded with the
    module-level ``vectorizer`` and classified with the module-level
    ``classifier``.

    Args:
        sheng_word: Raw Sheng text to translate.

    Returns:
        The single label that ``classifier`` predicts for the input.
    """
    # NOTE(review): input whose tokens are all missing from the fitted
    # vocabulary presumably encodes to an all-zero row, so the prediction
    # would not depend on the input; confirm and consider flagging it.
    features = vectorizer.transform([preprocess_text(sheng_word)])
    return classifier.predict(features)[0]

In [None]:
# Interactive prompt: keep translating whatever the user types until they
# enter 'quit' (case-insensitive).
while True:
    # Strip surrounding whitespace so input such as " quit " still exits and
    # stray spaces do not end up in the echoed word.
    sheng_word = input("Enter a Sheng word (or 'quit' to exit): ").strip()
    if sheng_word.lower() == "quit":
        break
    if not sheng_word:
        # Nothing was typed; ask again instead of classifying an empty string.
        continue
    english_translation = translate_sheng_to_english(sheng_word)
    print(f"{sheng_word} means '{english_translation}' in English.")