In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from keras.utils import pad_sequences


In [2]:
# Read the raw ratings table.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
df = pd.read_csv('C:/Users/User/MLProjectTrails/grocery_ratings.csv')
# 'date' is interpreted as a count of seconds (unit='s') and converted to
# datetimes, then rows are ordered per reviewer from oldest to newest.
df = df.assign(date=pd.to_datetime(df['date'], unit='s'))
df = df.sort_values(by=['reviewerID', 'date'])

In [3]:
def process_data(df, seqlen, progress_every=10000):
    """Split every user's chronological history into an input prefix and a target suffix.

    Rows are grouped by ``reviewerID`` and each group is ordered by ``date``.
    A user is kept only when they have strictly more than ``seqlen``
    interactions, so every kept user has at least one interaction left over
    for the target part.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``reviewerID``, ``date``, ``product_id``
        and ``rating``.
    seqlen : int
        Number of leading interactions per user used as the input sequence.
    progress_every : int, optional
        Print ``"Processed <k> users"`` each time the number of *kept* users
        reaches a multiple of this value. A falsy value (``0`` / ``None``)
        disables progress output. Defaults to 10000.

    Returns
    -------
    (list, list)
        Two parallel lists with one entry per kept user. Each entry of the
        first list is ``(product_ids[:seqlen], ratings[:seqlen])``; each entry
        of the second is ``(product_ids[seqlen:], ratings[seqlen:])``. The
        tuple members are the arrays returned by ``Series.values``.
    """
    X_train1, y_train1 = [], []
    kept_users = 0

    for _, user_data in df.groupby('reviewerID'):
        # Users with at most `seqlen` interactions have nothing left to predict.
        if len(user_data) <= seqlen:
            continue

        user_data = user_data.sort_values(by='date')
        product_sequence = user_data['product_id'].values
        rating_sequence = user_data['rating'].values

        # First `seqlen` interactions are the model input ...
        X_train1.append((product_sequence[:seqlen], rating_sequence[:seqlen]))
        # ... and everything after them is the target.
        y_train1.append((product_sequence[seqlen:], rating_sequence[seqlen:]))

        # Report progress only at the moment the counter changes; checking it
        # for skipped users as well would repeat the same line (including a
        # spurious "Processed 0 users") once per skipped user.
        kept_users += 1
        if progress_every and kept_users % progress_every == 0:
            print(f"Processed {kept_users} users")

    return X_train1, y_train1

In [4]:
# from sklearn.model_selection import train_test_split
# train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
from sklearn.model_selection import GroupShuffleSplit

# Split by reviewer rather than by row: a single shuffle split with a 20% test
# share, seeded so the partition is repeatable.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# With n_splits=1 the generator yields exactly one (train, test) pair of
# positional index arrays; the grouping key is 'reviewerID'.
train_idx, test_idx = next(gss.split(df, groups=df['reviewerID']))

# Select the two partitions by position.
train_df, test_df = df.iloc[train_idx], df.iloc[test_idx]

# Sanity check: no reviewer may appear on both sides of the split.
train_reviewers = set(train_df['reviewerID'])
test_reviewers = set(test_df['reviewerID'])
assert train_reviewers.isdisjoint(test_reviewers), "There is overlap between train and test reviewerIDs!"

In [6]:
# Build per-user (input, target) pairs from the training reviewers; the first
# 2 interactions of each kept user form the input sequence.
X_train, y_train = process_data(train_df,2)

Processed 10000 users
Processed 20000 users
Processed 30000 users
Processed 40000 users
Processed 50000 users
Processed 60000 users
Processed 70000 users
Processed 80000 users
Processed 90000 users
Processed 100000 users
Processed 110000 users
Processed 120000 users


In [7]:
# Give every distinct product in the full dataset a positive integer code.
# Codes start at 1, so 0 is never assigned to a product.
product_ids = df['product_id'].unique()
product_id_map = {product_id: code for code, product_id in enumerate(product_ids, start=1)}

In [8]:
def finallprocess(X, y, seqlen):
    """Turn the (input, target) pairs into arrays ready for the model.

    Parameters
    ----------
    X : sequence
        One entry per user; only element ``[0]`` of each entry (the product
        ID sequence) is used. The IDs are looked up in the module-level
        ``product_id_map``.
    y : sequence
        One entry per user; only ``entry[1][0]`` is used, i.e. the first
        element of the second member of each entry.
    seqlen : int
        Passed to ``pad_sequences`` as ``maxlen``.

    Returns
    -------
    (X_padded, targets)
        ``X_padded`` is the result of ``pad_sequences`` on the integer-coded
        product sequences; ``targets`` is a ``float32`` NumPy array holding
        one value per entry of ``y``.
    """
    # pad_sequences needs integers, so replace each product ID by its code.
    encoded_sequences = []
    for entry in X:
        encoded_sequences.append([product_id_map[item] for item in entry[0]])

    X_padded = pad_sequences(encoded_sequences, maxlen=seqlen)

    # Keep a single scalar target per user.
    first_targets = [target[1][0] for target in y]
    return X_padded, np.array(first_targets, dtype=np.float32)

In [9]:
# Encode the training inputs and collapse each target to one float rating.
# NOTE(review): y_train is rebound here from the list returned by process_data
# to a float32 array, so this cell cannot be re-run without first re-running
# the process_data cell.
X_train_padded,y_train  = finallprocess(X_train, y_train,2)

In [10]:
# Rating regressor: product-code embedding -> LSTM -> single linear output.
# input_dim is len(product_id_map) + 1 because product codes start at 1,
# leaving index 0 for padding; input_length matches the sequence length of 2
# used when building X_train_padded.
model = Sequential([
    Embedding(input_dim=len(product_id_map) + 1, output_dim=50, input_length=2),
    LSTM(128, activation='relu'),
    Dense(1),  # one unit: the predicted rating
])

# Optimise mean squared error; additionally report MAE and RMSE per epoch.
model.compile(
    optimizer='nadam',
    loss=tensorflow.keras.losses.MeanSquaredError(),
    metrics=['mae', tensorflow.keras.metrics.RootMeanSquaredError()],
)

# Fit on the training sequences.
model.fit(X_train_padded, y_train, epochs=8, batch_size=32)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x1e942284fd0>

In [20]:
# Build per-user (input, target) pairs from the held-out reviewers with the
# same input length (2) that was used for training.
X_test, y_test = process_data(test_df,2)

Processed 10000 users
Processed 20000 users
Processed 30000 users


In [21]:
# Encode the held-out inputs and collapse each target to one float rating.
# NOTE(review): y_test is rebound from a list to a float32 array, so this cell
# cannot be re-run without first re-running the process_data cell for test_df.
X_test_padded,y_test  = finallprocess(X_test, y_test,2)

In [22]:
# Score the held-out reviewers with the trained model.
predicted_ratings = model.predict(X_test_padded)

# Mean absolute error between the true and the predicted ratings.
mae = mean_absolute_error(y_true=y_test, y_pred=predicted_ratings)
print(mae)


0.91787994


In [23]:
# Root-mean-squared error on the held-out reviewers. The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6; np.sqrt of the MSE gives
# the same value on every version.
rmse = np.sqrt(mean_squared_error(y_test, predicted_ratings))
print("RMSE:", rmse)

RMSE: 1.2851907


In [26]:
# (rows, columns) of the held-out split.
test_df.shape

(296604, 4)

In [27]:
# (rows, columns) of the training split.
train_df.shape

(1188893, 4)

In [28]:
# Number of distinct reviewers in the held-out split.
test_df['reviewerID'].unique().shape

(32261,)

In [29]:
# Number of distinct reviewers in the training split.
train_df['reviewerID'].unique().shape

(129041,)

In [30]:
# Number of distinct reviewers in the full dataset (train + held-out).
df['reviewerID'].unique().shape

(161302,)