In [16]:
import pandas as pd

# Load the word / syllable-count table and, in the same expression, discard
# any row that is missing either field.
# NOTE(review): pandas' default NA parsing converts literal strings such as
# 'null', 'nan' or 'NA' to NaN, so real words spelled that way would be
# dropped by this filter — confirm whether that is intended.
df = pd.read_csv('../syllable_data.csv').dropna(subset=['word', 'syllable_count'])

In [17]:
# Summarise column dtypes and non-null counts of the filtered frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 370102 entries, 0 to 370103
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   word            370102 non-null  object
 1   syllable_count  370102 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 8.5+ MB


In [19]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

# Feature extraction function
def extract_features(word):
    """Turn a word into a small numeric feature vector.

    Classification is case-insensitive, so 'AI' and 'ai' produce the same
    vowel/consonant counts. Only alphabetic characters can count as
    consonants; hyphens, digits, apostrophes and other symbols contribute to
    the length but to neither letter count.

    Args:
        word: The word to describe, as a string.

    Returns:
        A list of three ints: [total length, vowel count, consonant count],
        where vowels are the letters in 'aeiouy'.
    """
    vowels = 'aeiouy'
    # Fold case once up front; the vowel set is lowercase only.
    lowered = word.lower()
    vowel_count = sum(1 for char in lowered if char in vowels)
    # Require isalpha() so punctuation/digits are not mistaken for consonants.
    consonant_count = sum(
        1 for char in lowered if char.isalpha() and char not in vowels
    )
    return [
        len(word),        # Length of the word (all characters)
        vowel_count,      # Count of vowels
        consonant_count,  # Count of consonants (letters that are not vowels)
    ]

# Fixed seed so both the train/test split and the network's weight
# initialisation are repeatable across "Restart Kernel -> Run All".
RANDOM_STATE = 42

# Prepare features and labels: one feature row per word, target is the
# syllable count column.
X = np.array([extract_features(word) for word in df['word']])
y = df['syllable_count'].to_numpy()

# Split into training and testing data (20% of the rows are held out)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Train a neural network model: a single hidden layer of 10 units
model = MLPRegressor(
    hidden_layer_sizes=(10,),
    max_iter=1000,
    random_state=RANDOM_STATE,
)
model.fit(X_train, y_train)

# Test model on unseen data. This is a regressor, so the predictions are
# continuous values and must be rounded before comparing with integer counts.
predictions = model.predict(X_test)

Predicted: 3, Actual: 3
Predicted: 4, Actual: 4
Predicted: 2, Actual: 2
Predicted: 3, Actual: 2
Predicted: 3, Actual: 3
Predicted: 1, Actual: 2
Predicted: 4, Actual: 3
Predicted: 3, Actual: 3
Predicted: 3, Actual: 2
Predicted: 3, Actual: 3
Predicted: 5, Actual: 5
Predicted: 3, Actual: 3
Predicted: 2, Actual: 3
Predicted: 4, Actual: 4
Predicted: 4, Actual: 5
Predicted: 2, Actual: 2
Predicted: 2, Actual: 1
Predicted: 5, Actual: 5
Predicted: 3, Actual: 3
Predicted: 2, Actual: 2
Predicted: 4, Actual: 4
Predicted: 1, Actual: 1
Predicted: 3, Actual: 3
Predicted: 4, Actual: 4
Predicted: 2, Actual: 2
Predicted: 3, Actual: 3
Predicted: 5, Actual: 5
Predicted: 2, Actual: 3
Predicted: 4, Actual: 4
Predicted: 3, Actual: 2
Predicted: 3, Actual: 3
Predicted: 2, Actual: 2
Predicted: 3, Actual: 3
Predicted: 2, Actual: 3
Predicted: 3, Actual: 2
Predicted: 3, Actual: 3
Predicted: 4, Actual: 4
Predicted: 6, Actual: 6
Predicted: 1, Actual: 1
Predicted: 3, Actual: 3
Predicted: 3, Actual: 3
Predicted: 3, Ac

In [None]:
import pickle

# Save the trained model to a file using the standard-library pickle format
with open('syllable_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# To load the model later, read it back with pickle to match the dump above
# (joblib is not imported in the cells shown here). Unpickling can execute
# arbitrary code, so only load files you trust.
# with open('syllable_model.pkl', 'rb') as model_file:
#     model = pickle.load(model_file)

In [None]:
# Demo: predict syllable counts for a few new words with the in-memory
# `model` (it is not reloaded from the pickle file here).
new_words = ['programming', 'science', 'AI']

for word in new_words:
    # predict() expects a 2-D array, so wrap the single feature list in a row
    feature_row = np.array([extract_features(word)])
    predicted = model.predict(feature_row)[0]
    print(f"Word: {word}, Predicted Syllables: {round(predicted)}")
