In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense,Dropout
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [2]:
df=pd.read_csv('/content/drive/MyDrive/clothes.csv')

In [3]:
combined_df=df[0:290000]

In [4]:
combined_df['class'].value_counts()

0    145010
1    144990
Name: class, dtype: int64

In [5]:
X = combined_df['reviewText'].values
y = combined_df['class'].values

In [6]:
def preprocess_text(text):
    # Remove stopwords
    cleaned_text = ' '.join([word for word in text.split() if word.lower() not in ENGLISH_STOP_WORDS])
    # Clean the text further as needed (e.g., removing punctuation, lowering, stemming, etc.)
    cleaned_text = cleaned_text.lower()
    # Return cleaned text
    return cleaned_text

In [7]:
X = [preprocess_text(text) for text in X]

In [8]:
# Tokenization
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X= pad_sequences(X, maxlen=max_len)

In [9]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Define your model
embedding_dim = 50  # Dimension of the word embeddings
gru_units = 64  # Number of units in the GRU layer

model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1,output_dim=128, input_length=max_len),
    GRU(gru_units, return_sequences=True),
    GRU(gru_units),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [11]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          13190400  
                                                                 
 gru (GRU)                   (None, 100, 64)           37248     
                                                                 
 gru_1 (GRU)                 (None, 64)                24960     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 13256833 (50.57 MB)
Trainable params: 1325

In [13]:
# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x794c8a7b2500>

In [14]:
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.8573275804519653


In [15]:
x1=df['reviewText']
x1=x1[295070:295071]
x1

295070    I purchased these based on the reviews and des...
Name: reviewText, dtype: object

In [16]:
x1=[preprocess_text(text) for text in x1]
tokenizer.fit_on_texts(x1)
x1 = tokenizer.texts_to_sequences(x1)
x1 = pad_sequences(x1, maxlen=max_len)

In [17]:
predictions=model.predict(x1)
predictions



array([[0.0020034]], dtype=float32)

In [18]:
binary_predictions = [1 if pred > 0.5 else 0 for pred in predictions]

# Print predicted labels
print("Predicted Labels:", binary_predictions)

Predicted Labels: [0]


In [18]:
model.save('gru_model.h5')
# Download the saved model file from Google Colab to your local machine
from google.colab import files
files.download('gru_model.h5')