In [4]:
import torch
import json
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

def load_data(filename):
    with open(filename, 'r') as file:
        data = json.load(file)

    ratings = [item['rating'] for item in data]
    texts = [item['text'] for item in data]
    helpful_votes = [item['helpful_vote'] for item in data]
    verified_purchases = [item['verified_purchase'] for item in data]
    titles = [item['title'] for item in data]

    return ratings, texts, helpful_votes, verified_purchases, titles

def load_test_data(filename):
    with open(filename, 'r') as file:
        data = json.load(file)

    texts = [item['text'] for item in data]
    helpful_votes = [item['helpful_vote'] for item in data]
    verified_purchases = [item['verified_purchase'] for item in data]
    titles = [item['title'] for item in data]

    return texts, helpful_votes, verified_purchases, titles

def preprocess_data(texts, ratings, helpful_votes, verified_purchases, tokenizer, max_length=128):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            padding='max_length',
                            truncation=True,
                            return_attention_mask=True,
                            return_tensors='pt'
                        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    ratings = torch.tensor(ratings)  
    helpful_votes = torch.tensor(helpful_votes) 
    verified_purchases = torch.tensor([1 if vp else 0 for vp in verified_purchases]) 

    return input_ids, attention_masks, ratings, helpful_votes, verified_purchases

train_ratings, train_texts, train_helpful_votes, train_verified_purchases, train_titles = load_data('train.json')

test_texts, test_helpful_votes, test_verified_purchases, test_titles= load_test_data('test.json')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5) 

train_input_ids, train_attention_masks, train_ratings, train_helpful_votes, train_verified_purchases = preprocess_data(
    train_texts, train_ratings, train_helpful_votes, train_verified_purchases, tokenizer
)

train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_ratings, train_helpful_votes, train_verified_purchases)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)
loss_fn = torch.nn.CrossEntropyLoss()

model.train()
for epoch in range(3): 
    total_loss = 0
    count = 0
    for batch in train_loader:
        count += 1
        input_ids, attention_masks, labels, helpful_votes, verified_purchases = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks)
        logits = outputs.logits
        labels = labels - 1  
        labels = labels.long()  
        loss = loss_fn(logits, labels)  
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        print(total_loss / (count*16))
    print(f"Epoch {epoch+1}, Average Loss: {total_loss/len(train_loader)}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.10017567873001099
0.09921937063336372
0.10029702136913936
0.10065323486924171
0.10085852593183517
0.10048254082600276
0.1009276243192809
0.10103102121502161
0.1013155256708463
0.10150677636265755
0.10100048577243631
0.10101575714846452
0.1010179938032077
0.10122972301074437
0.10132989237705867
0.10143430205062032
0.10122619744609385
0.1013249001569218
0.10133004894382075
0.10131327025592327
0.10125654545568284
0.10123817893591794
0.1012849924357041
0.10124614586432774
0.10109599322080612
0.10116082659134498
0.10117557700033541
0.10127756531749453
0.10113654388435955
0.1011369469265143
0.10114507160840495
0.10102364816702902
0.10089328794768362
0.10084113434833639
0.10100941785744258
0.10106786236994797
0.10098973460294106
0.1010677222358553
0.1009896592451976
0.10104407519102096
0.10101618508740169
0.10104117542505264
0.10106166122957717
0.10111749155277555
0.10112986266613007
0.10105103416287381
0.10115236804840412
0.1011641351506114
0.10115701568369963
0.10117505386471748
0.1010978

KeyboardInterrupt: 

In [5]:
def predict_and_save(test_texts, tokenizer, model, filename, test_helpful_votes, test_verified_purchases, batch_size=16):
    with open(filename, 'w') as file:
        file.write("index,rating\n")

        for i in range(0, len(test_texts), batch_size):
            print(i, len(test_texts))
            batch_texts = test_texts[i:i+batch_size]
            batch_helpful_votes = test_helpful_votes[i:i+batch_size]
            batch_verified_purchases = test_verified_purchases[i:i+batch_size]

            input_ids, attention_masks, _, _, _ = preprocess_data(batch_texts, [0]*len(batch_texts), batch_helpful_votes, batch_verified_purchases, tokenizer)

            model.eval()
            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_masks)
                predicted_ratings = torch.argmax(outputs.logits, dim=1)

            for j, rating in enumerate(predicted_ratings.tolist()):
                file.write(f"index_{i+j},{rating}\n")

predict_and_save(test_texts, tokenizer, model, 'predictions.csv', test_helpful_votes, test_verified_purchases)


0 35000
16 35000
32 35000
48 35000
64 35000
80 35000
96 35000
112 35000
128 35000
144 35000
160 35000
176 35000
192 35000
208 35000
224 35000
240 35000
256 35000
272 35000
288 35000
304 35000
320 35000
336 35000
352 35000
368 35000
384 35000
400 35000
416 35000
432 35000
448 35000
464 35000
480 35000
496 35000
512 35000
528 35000
544 35000
560 35000
576 35000
592 35000
608 35000
624 35000
640 35000
656 35000
672 35000
688 35000
704 35000
720 35000
736 35000
752 35000
768 35000
784 35000
800 35000
816 35000
832 35000
848 35000
864 35000
880 35000
896 35000
912 35000
928 35000
944 35000
960 35000
976 35000
992 35000
1008 35000
1024 35000
1040 35000
1056 35000
1072 35000
1088 35000
1104 35000
1120 35000
1136 35000
1152 35000
1168 35000
1184 35000
1200 35000
1216 35000
1232 35000
1248 35000
1264 35000
1280 35000
1296 35000
1312 35000
1328 35000
1344 35000
1360 35000
1376 35000
1392 35000
1408 35000
1424 35000
1440 35000
1456 35000
1472 35000
1488 35000
1504 35000
1520 35000
1536 35000
1552

In [6]:
import csv

with open('/content/predictions.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    rows = list(reader)

for row in rows:
    if row[1]=='rating':
      continue
    value = int(row[1])
    if value == 0:
        row[1] = '1'
    elif value == 1:
        row[1] = '2'
    elif value == 2:
        row[1] = '3'
    elif value == 3:
        row[1] = '4'
    elif value == 4:
        row[1] = '5'

with open('modified_csv_file.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(rows)


In [None]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

def load_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

def preprocess_data(data, with_rating=True):
    X = [(d['title'] + ' ' + d['text']) for d in data]
    if with_rating:
        y = [d['rating'] for d in data]
        y = torch.tensor(y, dtype=torch.float32)
        return X, y
    else:
        return X

def text_to_tensor(text, word_to_index, max_length=50):
    tensor = torch.zeros(max_length, dtype=torch.long)
    for i, word in enumerate(text.split()):
        if i >= max_length:
            break
        tensor[i] = word_to_index.get(word, 0)
    return tensor

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        embedded = self.embedding(input)
        lstm_out, _ = self.lstm(embedded.view(len(input), 1, -1))
        output = self.fc(lstm_out[-1])
        return output

def train_model(model, X_train, y_train, epochs=100, lr=0.001):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item()}')

train_data = load_data('train.json')
test_data = load_data('test.json')

X_train, y_train = preprocess_data(train_data)
X_test = preprocess_data(test_data, with_rating=False)

all_words = set(' '.join(X_train).split())
word_to_index = {word: i + 1 for i, word in enumerate(all_words)} 

input_size = len(word_to_index) + 1
hidden_size = 64  
output_size = 1  
model = LSTMModel(input_size, hidden_size, output_size)

X_train = [text_to_tensor(text, word_to_index) for text in X_train]
X_test = [text_to_tensor(text, word_to_index) for text in X_test]

train_model(model, X_train, y_train)

with torch.no_grad():
    predicted_ratings = model(X_test)
    predicted_ratings = predicted_ratings.squeeze().numpy()

print("Predicted ratings:", predicted_ratings)


TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list

In [None]:
import json
import pandas as pd
import numpy as np
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def load_test_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    texts = [item['title'] + ' ' + item['text'] for item in data]

    return texts

max_words = 1000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

model = load_model('ratings_prediction_model.h5')

def predict_ratings(model, test_texts):
    test_sequences = tokenizer.texts_to_sequences(test_texts)
    test_data = pad_sequences(test_sequences, maxlen=100)

    predictions = model.predict(test_data)
    predictions = np.round(predictions).astype(int).flatten().tolist()

    return predictions

def save_predictions_to_csv(predictions, output_file):
    results = []
    results.append(['index', 'rating']) 

    for i, rating in enumerate(predictions):
        results.append([f'index_{i}', min(5, max(1, rating))])

    df = pd.DataFrame(results[1:], columns=results[0]) 
    df.to_csv(output_file, index=False)

test_texts = load_test_data('test.json')

predictions = predict_ratings(model, test_texts)

save_predictions_to_csv(predictions, 'ratings.csv')


