<a href="https://colab.research.google.com/github/HarryLatThinkCerca/ExploreData/blob/main/metals-model1-regression_jenny_march_21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model1 Regression Version 5
- updated on 3/21 by Jenny Song
<br></br>
- validation evaluation for each rubric item at each epoch:
    - confusion matrix 
    - agreement accuracy
<br></br>
- regression 
    - mse loss
<br></br>
- model:
    - bert 
<br></br>
- text features:
    - text-length
 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the dataset directory and print the full path of every file it contains,
# so the hard-coded CSV paths used below can be checked against what is mounted.
for dirname, _, filenames in os.walk('/kaggle/input/thinkcerca'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/thinkcerca/group_1_final_writing_for_lesson_26025.xlsx - Result 1.csv
/kaggle/input/thinkcerca/Group_1_all_rubric_subscores_26025_14757_scores.xlsx - Result 1.csv


In [None]:
# Read each export, keep only the columns used downstream, and drop rows with
# a missing value in any of those columns.
group1_final_writing = pd.read_csv(
    '/kaggle/input/thinkcerca/group_1_final_writing_for_lesson_26025.xlsx - Result 1.csv',
    skipinitialspace=True,
)[['sa_id', 'sr_final_writing']].dropna()

group1_rubric_score = pd.read_csv(
    '/kaggle/input/thinkcerca/Group_1_all_rubric_subscores_26025_14757_scores.xlsx - Result 1.csv',
    skipinitialspace=True,
)[['sa_id', 'a_rubric_id', 'rc_id', 'ai_score']].dropna()


In [None]:
# 0.15	rubric.categories.standard.first	Claim (Central Idea)
# 0.1	rubric.categories.standard.second	Reasons (Support for Central Idea)
# 0.2	rubric.categories.standard.third	Evidence (Facts and Details)
# 0.2	rubric.categories.standard.fourth	Reasoning (Explanation & Analysis)
# 0.25	rubric.categories.standard.fifth	Organization
# 0.05	rubric.categories.standard.sixth	Audience Appropriate Language (Style)
# 0.05	rubric.categories.standard.seventh	Conventions Of English

In [None]:
import collections 
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torchvision.datasets import MNIST
from torchvision import transforms
import pytorch_lightning as pl
from transformers import BertTokenizer, BertModel
from torchmetrics import ConfusionMatrix



class WritingDataset(torch.utils.data.Dataset):
    """Pairs each final writing with its rubric subscores and a weighted overall score.

    Every item is ``(final_writing, scores)`` where ``scores`` is a tensor of
    length 8: seven rubric subscores followed by their weighted sum.  A writing
    is kept only if all seven rubric slots received a score.

    Args:
        final_writing: DataFrame with columns ``sa_id`` and ``sr_final_writing``.
            Defaults to the notebook-level ``group1_final_writing`` frame.
        rubric_score: DataFrame with columns ``sa_id``, ``rc_id`` and ``ai_score``.
            Defaults to the notebook-level ``group1_rubric_score`` frame.
    """

    # Weight of each rubric slot in the overall score, in slot order.
    RUBRIC_WEIGHTS = (0.15, 0.1, 0.2, 0.2, 0.25, 0.05, 0.05)
    # A score row is stored in slot ``rc_id - FIRST_RC_ID``.
    # NOTE(review): assumes rc_ids 1143..1149 follow the rubric order of the weights - confirm.
    FIRST_RC_ID = 1143
    # Placeholder stored in a slot that has not received a score yet.
    MISSING = -1

    def __init__(self, final_writing=None, rubric_score=None):
        # Fall back to the notebook globals so ``WritingDataset()`` keeps working.
        if final_writing is None:
            final_writing = group1_final_writing
        if rubric_score is None:
            rubric_score = group1_rubric_score

        rubric_weight = np.array(self.RUBRIC_WEIGHTS)
        num_rubric_items = len(rubric_weight)

        data = {}
        for _, row in final_writing.iterrows():
            data[row['sa_id']] = {
                'final_writing': row['sr_final_writing'],
                'scores': [self.MISSING] * (num_rubric_items + 1),
            }

        for _, row in rubric_score.iterrows():
            sa_id = row['sa_id']
            if sa_id not in data:
                continue
            # ``iterrows`` upcasts an all-numeric row to float, so rc_id may arrive
            # as a NumPy float; a float cannot index a list, hence the int() cast.
            index = int(row['rc_id']) - self.FIRST_RC_ID
            # Ignore rubric ids outside the seven known slots instead of raising
            # IndexError or silently wrapping around through a negative index.
            if not 0 <= index < num_rubric_items:
                continue
            data[sa_id]['scores'][index] = row['ai_score']

        self.data = []
        for entry in data.values():
            rubric_scores = entry['scores'][:-1]
            # Drop writings that are missing at least one rubric subscore.
            if self.MISSING in rubric_scores:
                continue
            # Only complete score vectors get an overall (weighted) score.
            entry['scores'][-1] = sum(rubric_weight * np.array(rubric_scores))
            self.data.append(entry)

    def __len__(self):
        """Number of writings that have a complete set of rubric scores."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(final_writing, scores_tensor)`` for the writing at ``idx``."""
        return self.data[idx]['final_writing'], torch.tensor(self.data[idx]['scores'])
        

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

class Trainer(pl.LightningModule):
    """Regression head on top of frozen BERT features for rubric scoring.

    The model maps a batch of texts to 8 outputs per text.  The validation code
    treats the first seven columns as rubric-item scores and leaves the last
    column out of the agreement accuracy.  BERT is used purely as a feature
    extractor (no gradients, evaluation mode); only ``self.regressor`` is trained
    with an MSE loss.

    Text length is not used as an input feature: the regressor takes exactly the
    768-dimensional pooled BERT embedding.
    """

    # Display names used for the per-item accuracy metrics, in output-column order.
    RUBRIC_NAMES = ("Claim", "Reasons", "Evidence", "Reasoning", "Organization", "Audience", "English")
    # Integer score levels are clamped to [MIN_LEVEL, MAX_LEVEL] -> 6 classes.
    MIN_LEVEL = 0
    MAX_LEVEL = 5

    def __init__(self):
        super().__init__()
        self.regressor = nn.Sequential(
            nn.Linear(768, 250),
            nn.ReLU(),
            nn.Linear(250, 8),
        )
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # BERT is only ever called under no_grad, so mark it frozen explicitly.
        for param in self.bert.parameters():
            param.requires_grad = False
        self.nlp = self.text_feature_extraction

    def train(self, mode=True):
        """Switch train/eval mode but keep the frozen BERT encoder in eval mode.

        Lightning calls ``train()`` before every training epoch; without this
        override BERT's dropout would be active while features are extracted.
        """
        super().train(mode)
        self.bert.eval()
        return self

    def text_feature_extraction(self, text):
        """Return one 768-d embedding per text: the mean of BERT's last hidden
        states over real (non-padding) tokens.

        Args:
            text: list of strings.
        Returns:
            Float tensor of shape B x 768 on ``self.device``.
        """
        with torch.no_grad():
            inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            # Follow the module's device instead of assuming CUDA.
            inputs = {key: value.to(self.device) for key, value in inputs.items()}
            outputs = self.bert(**inputs)
            hidden = outputs.last_hidden_state.float()  # B x T x 768
            # Exclude padding positions from the average; otherwise the embedding
            # of a text depends on the longest text it happens to be batched with.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)  # B x T x 1
            token_counts = mask.sum(dim=1).clamp(min=1.0)  # B x 1
            return (hidden * mask).sum(dim=1) / token_counts  # B x 768

    def forward(self, text):
        """Predict the 8 scores for each text in ``text`` (list of strings) -> B x 8."""
        text_feature = self.nlp(text)  # B x 768
        return self.regressor(text_feature)

    def configure_optimizers(self):
        """Adam over the trainable parameters only (the regressor head)."""
        trainable = [param for param in self.parameters() if param.requires_grad]
        optimizer = torch.optim.Adam(trainable, lr=1e-3)
        return optimizer

    def training_step(self, train_batch, batch_idx):
        """Compute and log the MSE between predictions and target scores."""
        sentence, scores = train_batch
        prediction = self.forward(sentence)
        loss = F.mse_loss(prediction, scores.float())

        self.log('train_mse_loss', loss, batch_size=len(sentence))
        return loss

    @classmethod
    def _to_levels(cls, scores, prediction):
        """Round targets and predictions to integer levels; predictions are
        additionally clamped to the valid level range."""
        scores_int = torch.round(scores).int()
        prediction_int = torch.round(prediction).int().clamp(cls.MIN_LEVEL, cls.MAX_LEVEL)
        return scores_int, prediction_int

    def validation_step(self, val_batch, batch_idx):
        """Log validation MSE and exact-agreement accuracy over the rubric items,
        and return raw targets/predictions for the epoch-level summary."""
        sentence, scores = val_batch
        prediction = self.forward(sentence)

        loss = F.mse_loss(prediction, scores.float())
        self.log('val_mse_loss', loss, batch_size=len(sentence))

        scores_int, prediction_int = self._to_levels(scores, prediction)

        # The last column is excluded from the agreement accuracy.
        self.log('val_acc', (prediction_int[:, :-1] == scores_int[:, :-1]).float().mean(), batch_size=len(sentence))

        return {"scores": scores, "pred": prediction}

    def validation_epoch_end(self, validation_step_outputs):
        """Log one confusion matrix and one agreement accuracy per rubric item,
        computed over the whole validation epoch."""
        scores = torch.cat([step["scores"] for step in validation_step_outputs])
        prediction = torch.cat([step["pred"] for step in validation_step_outputs])

        scores_int, prediction_int = self._to_levels(scores, prediction)

        for i in range(7):
            confusion_matrix = ConfusionMatrix(num_classes=6)(prediction_int[:, i].cpu(), scores_int[:, i].cpu())

            df_cm = pd.DataFrame(confusion_matrix.numpy(), index=range(6), columns=range(6))
            fig_, ax = plt.subplots(figsize=(10, 7))
            sns.heatmap(df_cm, annot=True, cmap='Blues', ax=ax)

            self.logger.experiment.add_figure(f"Confusion matrix {i}", fig_, self.current_epoch)
            # Release the figure only after it has been handed to the logger.
            plt.close(fig_)

        acc_per_rubric_item = torch.mean((prediction_int == scores_int).float(), dim=0)

        # One metric per rubric item, matching the seven confusion matrices above.
        for index, name in enumerate(self.RUBRIC_NAMES):
            self.log(f'val_{name}_acc', acc_per_rubric_item[index])

        

In [None]:
# Seed every RNG so the random split and the weight initialisation are repeatable.
pl.utilities.seed.seed_everything(seed=1)

# data: 80% of the examples for training, the remainder for validation
dataset = WritingDataset()
train_size = int(len(dataset) * 0.8)
val_size = len(dataset) - train_size
mnist_train, mnist_val = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(mnist_train, batch_size=32, shuffle=True)
val_loader = DataLoader(mnist_val, batch_size=64, shuffle=False)

# model
model = Trainer()

# Keep the two checkpoints with the highest validation agreement accuracy.
checkpoint_callback = pl.callbacks.model_checkpoint.ModelCheckpoint(
    monitor="val_acc",
    mode="max",
    save_top_k=2,
    dirpath="checkpoints/",
    filename="cerca-rubric-agreement-acc-{epoch:02d}-{val_acc:.2f}",
)

# training
trainer = pl.Trainer(
    gpus=1,
    precision=16,
    max_epochs=20,
    callbacks=[checkpoint_callback],
)
trainer.fit(model, train_loader, val_loader)
    


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Validation sanity check: 0it [00:00, ?it/s]

  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [None]:
def score(text, model):
    """Given a model and text, evaluate the model's scores for that text.

    The model is switched to evaluation mode and moved to the GPU when one is
    available (CPU otherwise); the forward pass runs without gradient tracking.

    Args:
        text: input forwarded unchanged to ``model(text)``; the notebook passes
            a list of essay strings.
        model: a ``torch.nn.Module``.

    Returns:
        Whatever ``model(text)`` returns, detached from the autograd graph.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.eval().to(device)
    # Inference only: no need to build an autograd graph.
    with torch.no_grad():
        return model(text)


In [None]:
# Show one validation example: the essay text, then its score tensor.
i = 24
t, s = mnist_val[i]
print(t, s, sep="\n")

In [None]:
# Reload the best checkpoint recorded by `checkpoint_callback` rather than a
# hard-coded file name: the previous literal did not match the
# "cerca-rubric-agreement-acc-{epoch}-{val_acc}" pattern the callback writes.
best_checkpoint_path = checkpoint_callback.best_model_path
if not best_checkpoint_path:
    raise FileNotFoundError(
        "checkpoint_callback has not saved a checkpoint yet; run the training cell first."
    )
loaded_model = Trainer.load_from_checkpoint(best_checkpoint_path)

score([t], loaded_model)

In [None]:
# trainer.save_checkpoint("model1_50e.ckpt")
# new_model = MyModel.load_from_checkpoint(checkpoint_path="example.ckpt")

# visualization

In [None]:
from tqdm.notebook import tqdm

# Per-example statistics over the validation split: character count, word count,
# the last (overall) target score and the model's prediction for that score.
text_lengths = []
word_count = []
scores = []
scores_pred = []
for t, s in tqdm(mnist_val):
    text_lengths.append(len(t))
    word_count.append(len(t.split()))
    scores.append(s[-1].item())
    predicted_row = score([t], loaded_model)[0]
    scores_pred.append(predicted_row[-1].item())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
# Distribution of essay length (in characters) over the validation split.
fig, ax = plt.subplots()
ax.hist(text_lengths, bins=50)
ax.set(
    title="Validation set: essay length",
    xlabel="Text length (characters)",
    ylabel="Number of essays",
)
plt.show()

In [None]:
# Distribution of the overall (last-column) target score over the validation split.
fig, ax = plt.subplots()
ax.hist(scores, bins=5)
ax.set(
    title="Validation set: overall score",
    xlabel="Overall score",
    ylabel="Number of essays",
)
plt.show()

In [None]:
# Distribution of essay length (in whitespace-separated words) over the validation split.
fig, ax = plt.subplots()
ax.hist(word_count, bins=50)
ax.set(
    title="Validation set: word count",
    xlabel="Word count",
    ylabel="Number of essays",
)
plt.show()

In [None]:
# Scatter plot of final score vs. log text length for the validation set.
fig, ax = plt.subplots()
ax.scatter(np.log(text_lengths), scores)
ax.set(
    title="Validation set: overall score vs. text length",
    xlabel="log(text length in characters)",
    ylabel="Overall score",
)
plt.show()

In [None]:
# Scatter plot of predicted final score vs. log text length for the validation set.
fig, ax = plt.subplots()
ax.scatter(np.log(text_lengths), scores_pred)
ax.set(
    title="Validation set: predicted overall score vs. text length",
    xlabel="log(text length in characters)",
    ylabel="Predicted overall score",
)
plt.show()

# Ignore below

In [None]:
ls "checkpoints/"

'cerca-rubric-agreement-acc-epoch=13-val_acc=0.40.ckpt'
'cerca-rubric-agreement-acc-epoch=15-val_acc=0.41.ckpt'


In [None]:
ls "lightning_logs"

[0m[01;34mversion_0[0m/


In [None]:
cp checkpoints/* .

In [None]:
rm -rf checkpoints/

In [None]:
cp lightning_logs/version_0/* .

In [None]:
rm -rf lightning_logs/

In [None]:
rm *