In [2]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import pandas as pd

In [3]:

#Separar el dataset en train y test (80% train, 20% test) Finaltrain.csv

df = pd.read_csv('data/Finaltrain.csv')
df = df.sample(frac=1).reset_index(drop=True)
train_data = df[:int(0.8*len(df))]
test_data = df[int(0.8*len(df)):]

train_data.to_csv('train.csv', index=False)
test_data.to_csv('test.csv', index=False)


In [4]:
train_data.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,text_length,word_count,number_count,punctuation_count,stopword_count
0,7434c9fa6e04,39c16e,It must involve a change in fortune from good ...,-0.639882,-1.382738,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to ...,173,33,0,3,13
1,5a77a1f31be5,3b9047,Different social classes are involved in the g...,2.72367,0.956654,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structure...,1825,304,0,55,125
2,84c15531acf0,814d6b,Because it was at a school and when something ...,-0.890893,0.215294,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave ex...,170,33,0,2,21
3,dde8cc7e479f,814d6b,"Once the Third Wave was created, more and more...",0.467952,2.353112,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave ex...,582,111,0,10,58
4,6fc1871b199d,3b9047,In ancient Egypt the system of governent w...,-0.39331,0.627128,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structure...,237,47,0,3,26


In [5]:
train_data['text'].tolist()

['It must involve a change in fortune from good to bad, the change in fortune must come from a great error, and the plot should focus on one single issue rather than multiple.',
 'Different social classes are involved in the government by contributing to the economy. To start off, the high-class a.k.a. the pharaohs they were there for leadership. For example, " Their leaders, called pharaohs, were believed to be gods in human form. They had absolute power over their subjects." Then the pharaohs have powerful people who are right under them, they are the visors. The visors kept track of everything happening in the kingdom and they knew how to read and write. For example, " Working with the vizier were scribes who kept government records. These high-level employees had mastered a rare skill in ancient Egypt — they could read and write." Next are the Nobles and the Priests, They were known for pleasing the gods. For example, " Only nobles could hold government posts; in these positions, t

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer.batch_encode_plus(
    train_data['text'].tolist(),
    truncation=True,
    padding=True
)

test_encodings = tokenizer.batch_encode_plus(
    test_data['text'].tolist(),
    truncation=True,
    padding=True
)

train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_data['content'].tolist()),
    torch.tensor(train_data['wording'].tolist())
)

test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask'])
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [7]:
class BERTModel(nn.Module):
  def __init__(self):
      super(BERTModel, self).__init__()
      self.bert = BertModel.from_pretrained('bert-base-uncased')

      self.dropout = nn.Dropout(0.1)
      self.linear1 = nn.Linear(768, 256)
      self.linear2 = nn.Linear(256, 2)

  def forward(self, input_ids, attention_mask):
      outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
      pooled_output = outputs.pooler_output
      pooled_output = self.dropout(pooled_output)
      output = self.linear1(pooled_output)
      output = nn.ReLU()(output)
      output = self.linear2(output)
      return output

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

model = BERTModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = nn.MSELoss()

cuda


In [9]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

In [10]:
# Splitting training data into train and validation sets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

# Creating validation loader
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False)

In [11]:
# Training loop
model.train()
for epoch in range(3):
    running_loss = 0.0
    cont = 0
    for step, (input_ids, attention_mask, content, wording) in enumerate(train_loader):
        print(cont)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        content = content.to(device)
        wording = wording.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs[:, 0], content) + criterion(outputs[:, 1], wording)
        loss.backward()
        optimizer.step()
        if step % 10 == 0:
            print("Epoch {}, Step {}, Loss: {}".format(epoch+1, step, loss.item()))

        running_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {running_loss / len(train_loader)}")

    # Validation loop
    model.eval()
    with torch.no_grad():
        val_loss = 0.0
        for val_step, (input_ids, attention_mask, content, wording) in enumerate(val_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            content = content.to(device)
            wording = wording.to(device)

            val_outputs = model(input_ids, attention_mask)
            val_loss += criterion(val_outputs[:, 0], content) + criterion(val_outputs[:, 1], wording)

        print(f"Validation Loss: {val_loss / len(val_loader)}")
    model.train()

0


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1, Step 0, Loss: 1.3684093952178955
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 10, Loss: 2.3249573707580566
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 20, Loss: 1.376314401626587
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 30, Loss: 2.511798858642578
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 40, Loss: 1.3290950059890747
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 50, Loss: 0.9070420265197754
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 60, Loss: 0.8700929880142212
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 70, Loss: 1.8549578189849854
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 80, Loss: 1.4519319534301758
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 90, Loss: 1.2197247743606567
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 100, Loss: 0.6549285650253296
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 110, Loss: 0.7331451177597046
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 120, Loss: 1.104358434677124
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 130, Loss: 0.6218050122261047
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 140, Loss: 0.4548178017139435
0
0
0
0
0
0
0
0
0
0
Epoch 1, Step 150, Loss: 0.5572052001953125
0
0
0
0
0


In [12]:
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

In [13]:
model.eval()
predictions = []
with torch.no_grad():
    for input_ids, attention_mask in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = model(input_ids, attention_mask)
        predictions.extend(outputs.cpu().numpy())

In [14]:
submission_df = pd.DataFrame({
    'student_id': test_data['student_id'],
    'content': [pred[0] for pred in predictions],
    'wording': [pred[1] for pred in predictions]
})

submission_df.to_csv('submission.csv', index=False)

In [15]:
submission_df

Unnamed: 0,student_id,content,wording
5732,9fe9daa0df1b,2.702838,2.071508
5733,527493fb8edd,-1.246794,-1.464631
5734,bb2151d5c8e3,0.398543,0.846880
5735,a1be154cb8fb,-0.598291,-0.381143
5736,f9ae7f23131d,-0.251131,-0.267918
...,...,...,...
7160,48a54d762dc6,-1.236920,-0.950904
7161,123842ec7d81,-1.211532,-1.397556
7162,1ba342966c3b,0.531721,-0.109286
7163,7c990d713bb5,0.894329,1.237045


In [23]:
from sklearn.metrics import mean_squared_error

mean_squared_error(test_data['content'], submission_df['content']), mean_squared_error(test_data['wording'], submission_df['wording'])
# calcular r2
from sklearn.metrics import r2_score

r2_score(test_data['content'], submission_df['content']), r2_score(test_data['wording'], submission_df['wording'])

print("MSE wording: ", mean_squared_error(test_data['wording'], submission_df['wording']))
print("MSE content: ", mean_squared_error(test_data['content'], submission_df['content']))
print("R2 wording: ", r2_score(test_data['wording'], submission_df['wording']))
print("R2 content: ", r2_score(test_data['content'], submission_df['content']))

MSE wording:  0.3833820349315146
MSE content:  0.23599822184882996
R2 wording:  0.6289597056557734
R2 content:  0.7808776999608971


In [24]:
torch.save(model.state_dict(), 'bert.pth')