In [4]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.3-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---- ----------------------------------- 1.3/11.1 MB 9.6 MB/s eta 0:00:02
   -------------- ------------------------- 3.9/11.1 MB 11.2 MB/s eta 0:00:01
   ---------------------- ----------------- 6.3/11.1 MB 11.4 MB/s eta 0:00:01
   ------------------------------- -------- 8.7/11.1 MB 11.7 MB/s eta 0:00:01
   ---------------------------------------  11.0/11.1 MB 11.7 MB/s eta 0:00:01
   -----------------------------

In [72]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
import joblib

In [73]:
df = pd.read_csv('C:\\Users\\i7\\Desktop\\archive\\recommended_plans_dataset.csv')

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   type              1000 non-null   object 
 1   Pay cycle         1000 non-null   object 
 2   client budget     1000 non-null   float64
 3   modules needed    947 non-null    object 
 4   recommended plan  1000 non-null   object 
dtypes: float64(1), object(4)
memory usage: 39.2+ KB


In [75]:
df.columns

Index(['type', 'Pay cycle', 'client budget', 'modules needed',
       'recommended plan'],
      dtype='object')

In [76]:
df.head()

Unnamed: 0,type,Pay cycle,client budget,modules needed,recommended plan
0,hosting,monthly,10.98,"wordpress, SSL",Premium wordpress hosting month subscription
1,custom,monthly,17.21,"SMTP, IMAP",Professional Email Hosting monthly
2,domain,monthly,22.1,"WHOIS Privacy, DNS",Domain & privacy bundle monthly
3,hosting,monthly,17.35,"wordpress, SSL",Premium wordpress hosting month subscription
4,license,yearly,714.64,API access,Project Manager SaaS-Team year


In [77]:
df['recommended plan'].unique()

array(['Premium wordpress hosting month subscription',
       'Professional Email Hosting monthly',
       'Domain & privacy bundle monthly',
       'Project Manager SaaS-Team year', 'Domain & privacy bundle yearly',
       'Professional Email Hosting yearly',
       'starter web hosting year subscription',
       'starter web hosting month subscription',
       'Premium wordpress hosting year subscription',
       'Project Manager SaaS-Team month'], dtype=object)

In [80]:
df.shape

(947, 5)

In [79]:
df = df.dropna()

In [61]:
df.shape

(947, 5)

In [40]:
!pip install pandas scikit-learn tensorflow




In [81]:
df["modules needed"] = df["modules needed"].fillna("unknown")
df["modules needed"] = df["modules needed"].apply(lambda x: [i.strip() for i in x.split(",")])

In [82]:
mlb = MultiLabelBinarizer()
modules_encoded = mlb.fit_transform(df["modules needed"])
modules_df = pd.DataFrame(modules_encoded, columns=mlb.classes_)
df = df.drop("modules needed", axis=1)
df = pd.concat([df.reset_index(drop=True), modules_df], axis=1)

In [83]:
# Separate features and target
X = df.drop("recommended plan", axis=1)
y = df["recommended plan"]

In [84]:
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [85]:
# Define preprocessing
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
], remainder="passthrough")


In [86]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit and transform
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# One-hot encode labels
encoder = OneHotEncoder(sparse_output=False)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))

# Inject noise into training data
X_train_noisy = X_train_preprocessed.copy()
mask = np.random.binomial(1, 0.1, X_train_noisy.shape).astype(bool)
X_train_noisy[mask] = 0


In [89]:
# Define the model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_noisy.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(y_train_encoded.shape[1], activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_noisy, y_train_encoded, epochs=30, validation_split=0.2, batch_size=32, verbose=1)


Epoch 1/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.1412 - loss: 2.2837 - val_accuracy: 0.6184 - val_loss: 2.0609
Epoch 2/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.3503 - loss: 2.0700 - val_accuracy: 0.6974 - val_loss: 1.8286
Epoch 3/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5246 - loss: 1.8476 - val_accuracy: 0.8618 - val_loss: 1.5393
Epoch 4/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5955 - loss: 1.6087 - val_accuracy: 0.9079 - val_loss: 1.2284
Epoch 5/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6796 - loss: 1.3222 - val_accuracy: 0.9474 - val_loss: 0.9497
Epoch 6/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7629 - loss: 1.0800 - val_accuracy: 0.9474 - val_loss: 0.7226
Epoch 7/30
[1m19/19[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x246812cf8b0>

In [91]:
# Save everything
joblib.dump(preprocessor, "preprocessor.joblib")
joblib.dump(mlb, "mlb.joblib")
joblib.dump(encoder, "label_encoder.joblib")
model.save("fnn_model.keras")

In [92]:
model.input_shape  # e.g., (None, 45)


(None, 15)