In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from keras import Model
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.svm import SVR
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.layers import Input, LayerNormalization, MultiHeadAttention
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.models import Sequential


In [None]:

# Read the raw cycle dataset from disk; every later cell derives from `data`.
file_path = 'Datasets/FedCycleData071012 (2).csv'
data = pd.read_csv(filepath_or_buffer=file_path)


In [None]:

# Keep only the columns whose share of missing values is below `threshold`.
threshold = 0.5
missing_fraction = data.isna().mean()
data_cleaned = data.loc[:, missing_fraction < threshold]


In [None]:

# Work on an independent copy: `data_cleaned` was produced by slicing `data`,
# so writing into it directly can raise SettingWithCopyWarning.
data_cleaned = data_cleaned.copy()

# Fill missing values: most frequent value for text columns, median otherwise.
# The result is assigned back to the frame instead of calling
# `data_cleaned[column].fillna(..., inplace=True)`: that form mutates a
# temporary Series (chained assignment), which pandas warns about and which
# does not update the frame at all under copy-on-write.
# NOTE(review): if the CSV encodes missing entries as blank strings rather
# than empty cells, numeric columns will load as `object` and be
# label-encoded below -- confirm the dtypes of the raw frame.
for column in data_cleaned.columns:
    if data_cleaned[column].dtype == 'object':
        fill_value = data_cleaned[column].mode()[0]
    else:
        fill_value = data_cleaned[column].median()
    data_cleaned[column] = data_cleaned[column].fillna(fill_value)

# Encode categorical variables; the fitted encoders are kept per column so the
# same mapping can be reused later.
label_encoders = {}
for column in data_cleaned.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data_cleaned[column] = le.fit_transform(data_cleaned[column])
    label_encoders[column] = le

# Normalize numerical features (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on every float64/int64 column of the full
# dataset, i.e. before any train/test split, and this includes the prediction
# target if it is numeric -- confirm that this is intended.
scaler = StandardScaler()
numerical_columns = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
data_cleaned[numerical_columns] = scaler.fit_transform(data_cleaned[numerical_columns])

In [None]:
# Predictor columns and the regression target, both taken from the preprocessed frame
features = ['Age', 'BMI', 'MeanCycleLength', 'LengthofMenses', 'UnusualBleeding', 'MeanBleedingIntensity']
X = data_cleaned.loc[:, features]
y = data_cleaned.loc[:, 'LengthofCycle']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Models and evaluation: candidate regressors keyed by the display name that is
# used in the printed report.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'SVR': SVR(kernel='rbf'),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42),
    'KNN': KNeighborsRegressor(n_neighbors=5)
}

# Fit every model on the training split and report MAE / RMSE on the test split.
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    # RMSE is computed explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated in recent scikit-learn releases and
    # rejected by the newest ones, while the square root of the MSE works on
    # every version.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print(f'{name} MAE: {mae}, RMSE: {rmse}')


In [None]:
# LSTM Model
def create_sequences(features, target, time_steps=5):
    """Build sliding-window samples for sequence models.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` of ``features`` and is
    paired with ``target[i + time_steps]``, i.e. the value that immediately
    follows the window.

    Args:
        features: Array-like with one row per observation (NumPy array, list
            or pandas object).
        target: Array-like aligned row-by-row with ``features``.
        time_steps: Number of consecutive rows in each window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(n_windows, time_steps, *features.shape[1:])`` and ``y`` has shape
        ``(n_windows, *target.shape[1:])``, where
        ``n_windows = max(len(features) - time_steps, 0)``.

    Raises:
        ValueError: If windows can be formed but ``target`` has fewer rows
            than ``features``.
    """
    # Convert up front so that indexing is always positional; a pandas Series
    # with a non-default index would otherwise be looked up by label.
    features = np.asarray(features)
    target = np.asarray(target)

    n_rows = len(features)
    n_windows = max(n_rows - time_steps, 0)
    if n_windows > 0 and len(target) < n_rows:
        raise ValueError(
            f"target has {len(target)} rows but features has {n_rows}; "
            "they must be aligned row-by-row"
        )

    if n_windows == 0:
        # Too few rows for a single window: return an empty batch that still
        # carries the window and feature dimensions.
        X = np.empty((0, time_steps) + features.shape[1:], dtype=features.dtype)
    else:
        X = np.stack([features[start:start + time_steps] for start in range(n_windows)])

    # The targets of consecutive windows are simply the rows after the first
    # `time_steps` ones; copy so the result does not alias the input.
    y = target[time_steps:time_steps + n_windows].copy()
    return X, y


# Use a dedicated scaler for the sequence features. Refitting the shared
# `scaler` here would overwrite the statistics it learned from the raw
# numerical columns, and `scaler` is the object that is pickled later on.
feature_scaler = StandardScaler()
features_scaled = feature_scaler.fit_transform(X)

# Squash the target into [0, 1] for the neural networks; `target_scaler` is
# kept so that predictions can be mapped back with `inverse_transform`.
target_scaler = MinMaxScaler()
target_scaled = target_scaler.fit_transform(y.values.reshape(-1, 1))

In [None]:
X_seq, y_seq = create_sequences(features_scaled, target_scaled)

# Split chronologically (no shuffling): consecutive windows overlap in all but
# one row, so a shuffled split would place near-identical windows in both the
# training and the test set and inflate the test score. Keeping the order also
# means the test samples stay in sequence when they are plotted.
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, shuffle=False)

In [None]:
# Stacked LSTM regressor: two 50-unit recurrent layers, each followed by
# dropout, and a single-unit dense output.
n_time_steps, n_features = X_train.shape[1], X_train.shape[2]
lstm_model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(n_time_steps, n_features)),
    Dropout(0.2),
    LSTM(units=50, return_sequences=False),
    Dropout(0.2),
    Dense(units=1),
])

lstm_model.compile(loss='mean_squared_error', optimizer='adam')


In [None]:
# Train the LSTM, holding back 20% of the training samples for validation
history = lstm_model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
)

# Loss (MSE, as configured in compile) on the held-out test sequences
loss = lstm_model.evaluate(X_test, y_test)
print(f'LSTM Test Loss: {loss}')

# Undo the min-max scaling on both predictions and ground truth
y_pred = lstm_model.predict(X_test)
y_pred_rescaled, y_test_rescaled = (
    target_scaler.inverse_transform(scaled_values)
    for scaled_values in (y_pred, y_test.reshape(-1, 1))
)


In [None]:

# Overlay the actual and predicted series, both after inverting target_scaler
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(y_test_rescaled, label='Actual Cycle Length')
ax.plot(y_pred_rescaled, label='Predicted Cycle Length')
ax.set_title('LSTM Model Predictions vs Actual Values')
ax.set_xlabel('Time Step')
ax.set_ylabel('Cycle Length')
ax.legend()
plt.show()


In [None]:
# Transformer Model
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one pre-norm transformer encoder block to ``inputs``.

    The block is self-attention followed by a two-layer feed-forward network,
    each wrapped in a residual connection.

    Args:
        inputs: Keras tensor to transform.
        head_size: ``key_dim`` passed to ``MultiHeadAttention``.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout: Dropout rate used inside attention and after each sub-layer.

    Returns:
        Keras tensor whose last dimension equals ``inputs.shape[-1]``.
    """
    # Self-attention sub-layer with residual connection
    normed = LayerNormalization(epsilon=1e-6)(inputs)
    attended = MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(normed, normed)
    attended = Dropout(dropout)(attended)
    attention_out = attended + inputs

    # Feed-forward sub-layer, projected back to the input width so the second
    # residual addition is shape-compatible
    hidden = LayerNormalization(epsilon=1e-6)(attention_out)
    hidden = Dense(ff_dim, activation="relu")(hidden)
    hidden = Dropout(dropout)(hidden)
    projected = Dense(inputs.shape[-1])(hidden)
    return projected + attention_out

In [None]:

def build_model(input_shape, head_size, num_heads, ff_dim, num_transformer_blocks, mlp_units, dropout=0, mlp_dropout=0):
    """Build a transformer-encoder regressor that emits one value per sequence.

    Args:
        input_shape: Shape of one sample, passed to ``Input(shape=...)``.
        head_size: ``key_dim`` of every attention layer.
        num_heads: Number of attention heads per block.
        ff_dim: Hidden width of each block's feed-forward network.
        num_transformer_blocks: How many encoder blocks to stack.
        mlp_units: Width of each of the two dense layers in the output head.
        dropout: Dropout rate inside the encoder blocks.
        mlp_dropout: Dropout rate inside the output head.

    Returns:
        An uncompiled Keras ``Model`` whose output has shape ``(batch, 1)``.
    """
    inputs = Input(shape=input_shape)
    x = inputs
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)
    x = LayerNormalization(epsilon=1e-6)(x)
    # Collapse the time axis before the dense head. Dense layers act on the
    # last axis only, so without pooling the model would output one value per
    # time step, (batch, time_steps, 1), which does not line up with one
    # target per sequence.
    x = GlobalAveragePooling1D()(x)
    x = Dense(mlp_units, activation="relu")(x)
    x = Dropout(mlp_dropout)(x)
    x = Dense(mlp_units, activation="relu")(x)
    x = Dropout(mlp_dropout)(x)
    outputs = Dense(1, activation="linear")(x)
    return Model(inputs, outputs)


In [None]:
input_shape = (X_train.shape[1], X_train.shape[2])
lstm_model = build_model(input_shape, head_size=256, num_heads=4, ff_dim=4, num_transformer_blocks=4, mlp_units=128,
                         dropout=0.1, mlp_dropout=0.1)

lstm_model.compile(optimizer='adam', loss='mean_squared_error')
lstm_model.summary()
history = lstm_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

loss = lstm_model.evaluate(X_test, y_test)
print(f'Transformer Test Loss: {loss}')

y_pred = lstm_model.predict(X_test)
y_pred_rescaled = target_scaler.inverse_transform(y_pred.reshape(-1, 1))
y_test_rescaled = target_scaler.inverse_transform(y_test.reshape(-1, 1))

In [None]:
# Save models
for name, model in models.items():
    joblib.dump(model, f'../AIFinalProject/Models/{name}_model.pkl')

joblib.dump(scaler, 'Models/scaler.pkl')
joblib.dump(target_scaler, 'Models/target_scaler.pkl')
joblib.dump(label_encoders,'Models/label_encoders.pkl')
# Save LSTM lstm_model
lstm_model.save('../AIFinalProject/Models/lstm_model.keras')

print("Models and feature names saved successfully.")