In [1]:
import pandas as pd

df = pd.read_csv('../syllable_data.csv')

# Remove rows where 'word' or 'syllable_count' is NaN
df = df.dropna(subset=['word', 'syllable_count'])

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 370102 entries, 0 to 370103
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   word            370102 non-null  object
 1   syllable_count  370102 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 8.5+ MB


In [3]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

# Feature extraction function
def extract_features(word):
    return [
        len(word),                         # Length of the word
        sum(1 for char in word if char in 'aeiouy'),  # Count of vowels
        sum(1 for char in word if char not in 'aeiouy')  # Count of consonants
    ]

# Prepare features and labels
X = np.array([extract_features(word) for word in df['word']])
y = np.array(df["syllable_count"])


In [5]:

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train a neural network model
model = MLPRegressor(
    activation='tanh',
    alpha=0.001,
    hidden_layer_sizes=(100,),
    learning_rate='adaptive',
    max_iter=2000,
    solver='lbfgs'
)
model.fit(X_train, y_train)

# Test model on unseen data
predictions = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [3]:
# Define the hyperparameter grid
param_grid = {
    'hidden_layer_sizes': [(10,), (50,), (100,), (50, 50)],   # Different layer sizes
    'activation': ['relu', 'tanh'],  # Activation functions to test
    'solver': ['adam', 'lbfgs'],     # Solvers to test
    'learning_rate': ['constant', 'adaptive'],  # Learning rate strategies
    'max_iter': [1000, 2000],        # Maximum iterations
    'alpha': [0.0001, 0.001],        # Regularization parameter
}

grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters
print("Best parameters found:", grid_search.best_params_)


Fitting 3 folds for each of 128 candidates, totalling 384 fits
Best parameters found: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'max_iter': 2000, 'solver': 'lbfgs'}


In [4]:

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error on Test Set: {mse}")

Mean Squared Error on Test Set: 0.28939086719730184


In [8]:
import pickle

# Save the model to a file
with open('syllable_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# To load the model later
# model = joblib.load('syllable_model.pkl')

In [6]:
# Example of using the saved model to predict syllables for new words
new_words = ['programming', 'science', 'AI']

for word in new_words:
    features = np.array([extract_features(word)])
    syllables = model.predict(features)
    print(f"Word: {word}, Predicted Syllables: {round(syllables[0])}")


Word: programming, Predicted Syllables: 3
Word: science, Predicted Syllables: 2
Word: AI, Predicted Syllables: 1
