<a href="https://colab.research.google.com/github/Ihimanshudhar/Automatic-Form-Filler/blob/main/neuralnetwork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [100]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [35]:
train_data = pd.DataFrame({
    "review": [
         "I absolutely loved it!",
         "Terrible, would not recommend.",
         "It was okay, not the best but not the worst.",
         "Amazing film with superb acting.",
         "Disappointing, a waste of time.",
         "An outstanding masterpiece!",
         "Mediocre at best.",
         "A brilliant and engaging experience.",
         "Not my cup of tea.",
         "Simply fantastic."
    ],
    "label": [5.0, 1.0, 3.0, 4.5, 1.5, 5.0, 3.0, 4.0, 2.0, 5.0]
})
train_data.to_csv("train.csv", index=False)

# Step 2: Generate Sample Test Data
test_data = pd.DataFrame({
    "review": [
         "I absolutely loved it!",
         "I loved the cinematography.",
         "Worst experience ever.",
         "Not bad, could be better.",
         "Absolutely mind-blowing performance!",
         "I did not like it."
    ]
})
test_data.to_csv("test.csv", index=False)

In [94]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df["review"].fillna("", inplace=True)
test_df["review"].fillna("", inplace=True)

X = train_df["review"]
y = train_df["label"]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df["review"].fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df["review"].fillna("", inplace=True)


In [95]:

# Normalize labels to range [0,1]
y = (y - 1) / 4.0


In [101]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_test_sequences = tokenizer.texts_to_sequences(test_df["review"])

In [102]:
max_length = max(len(seq) for seq in X_sequences)
X_padded = pad_sequences(X_sequences, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post')

In [103]:
sentences = [review.split() for review in X]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 100))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size=0.2, random_state=42)

In [112]:
models = {
    "model1": keras.Sequential([
        layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, weights=[embedding_matrix], input_length=max_length, trainable=False),
        layers.Conv1D(128, 5, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.LSTM(64, return_sequences=True),
        layers.LSTM(32),
        layers.Dense(1, activation="tanh")
    ]),
    "model2": keras.Sequential([
        layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, weights=[embedding_matrix], input_length=max_length, trainable=False),
        layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
        layers.Bidirectional(layers.LSTM(32)),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation="tanh")
    ]),
    "model3": keras.Sequential([
        layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, weights=[embedding_matrix], input_length=max_length, trainable=True),
        layers.Conv1D(64, 3, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.LSTM(32, return_sequences=True),
        layers.LSTM(16),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation="tanh")
    ])
}



In [113]:
best_model = None
best_mae = float("inf")
optimizers = ["adam", "rmsprop", "sgd"]

In [114]:
for idx, (name, model) in enumerate(models.items()):
    model.compile(optimizer=optimizers[idx], loss="mse", metrics=["mae"])
    print(f"Training {name}...")
    model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_val, y_val), verbose=1)
    val_mae = model.evaluate(X_val, y_val, verbose=0)[1]
    print(f"Validation MAE for {name}: {val_mae}")

    if val_mae < best_mae:
        best_mae = val_mae
        best_model = model

print(f"Best model selected with Validation MAE: {best_mae}")

Training model1...
Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: 0.6053 - mae: 0.7186 - val_loss: 0.0287 - val_mae: 0.1245
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 233ms/step - loss: 0.5915 - mae: 0.7089 - val_loss: 0.0265 - val_mae: 0.1243
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - loss: 0.5773 - mae: 0.6987 - val_loss: 0.0243 - val_mae: 0.1242
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - loss: 0.5620 - mae: 0.6876 - val_loss: 0.0221 - val_mae: 0.1240
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step - loss: 0.5455 - mae: 0.6753 - val_loss: 0.0201 - val_mae: 0.1239
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step - loss: 0.5274 - mae: 0.6617 - val_loss: 0.0183 - val_mae: 0.1238
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step - 

In [115]:
predictions = best_model.predict(X_test_padded)
predictions = (predictions * 4) + 1  # Convert back to range [1,5]

submission = pd.DataFrame({"review": test_df["review"], "rating": predictions.flatten()})
submission.to_csv("submission.csv", index=False)
print("Submission file generated: submission.csv")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 407ms/step
Submission file generated: submission.csv
