<a href="https://colab.research.google.com/github/Mmian0125/ai/blob/main/tweet_disaster_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import pandas as pd
from transformers import BertTokenizer,BertForSequenceClassification
import torch
import os
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

In [4]:
model_checkpoint="bert-base-uncased"
model_root="/content/drive/MyDrive/tweet_disaster_model"
data_root="/content/drive/MyDrive/tweet_disaster_dataset"
train_path=os.path.join(data_root,"train.csv")
test_path=os.path.join(data_root,"test.csv")
submission_path=os.path.join(data_root,"sample_submission.csv")

In [7]:
train_csv=pd.read_csv(train_path)
test_csv=pd.read_csv(test_path)
print(train_csv[1:100])

     id   keyword              location  \
1     4       NaN                   NaN   
2     5       NaN                   NaN   
3     6       NaN                   NaN   
4     7       NaN                   NaN   
5     8       NaN                   NaN   
..  ...       ...                   ...   
95  137  accident             Charlotte   
96  138  accident       Baton Rouge, LA   
97  139  accident        Hagerstown, MD   
98  141  accident  Gloucestershire , UK   
99  143  accident                   NaN   

                                                 text  target  
1              Forest fire near La Ronge Sask. Canada       1  
2   All residents asked to 'shelter in place' are ...       1  
3   13,000 people receive #wildfires evacuation or...       1  
4   Just got sent this photo from Ruby #Alaska as ...       1  
5   #RockyFire Update => California Hwy. 20 closed...       1  
..                                                ...     ...  
95  9 Mile backup on I-77 South...a

In [10]:
class MyDataset(Dataset):
    def __init__(self, data_root, model_checkpoint,is_train=True):
        super().__init__()
        self.is_train=is_train
        data_path=os.path.join(data_root,"train.csv") if is_train==True else os.path.join(data_root,"test.csv")
        self.df_data=pd.read_csv(data_path)
        self.tokenizer=BertTokenizer.from_pretrained(model_checkpoint)

    def __len__(self):
        return len(self.df_data)

    def __getitem__(self,index):
        text=self.df_data['text'][index]
        if self.is_train == True:
            label=self.df_data['target'][index]
        token=self.tokenizer(text=text, padding='max_length', max_length=128, truncation=True, return_tensors='pt')
        input_ids, attention_mask, token_type_ids=token['input_ids'].squeeze(0), token['attention_mask'].squeeze(0), token['token_type_ids'].squeeze(0)

        return (input_ids,attention_mask,token_type_ids, label) if self.is_train==True else (input_ids,attention_mask,token_type_ids)

In [11]:
train_dataset=MyDataset(data_root,model_checkpoint,True)
test_dataset=MyDataset(data_root,model_checkpoint,False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [12]:
class Params:
    batch_size=16
    shuffle=True
    num_workers=0
    lr=2e-5
    weight_decay=0.01
    epochs=25

In [26]:
train_iter=DataLoader(dataset=train_dataset,batch_size=Params.batch_size, shuffle=Params.shuffle, num_workers=Params.num_workers)
test_iter=DataLoader(dataset=test_dataset,batch_size=Params.batch_size, shuffle=False, num_workers=Params.num_workers)

In [15]:
model=BertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
loss=nn.CrossEntropyLoss()
optimizer=torch.optim.AdamW(model.parameters(),lr=Params.lr,weight_decay=Params.weight_decay )

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [17]:
model=model.to(device=device)
if torch.cuda.device_count()>1:
    model=nn.DataParallel(model)
loss=loss.to(device=device)

In [21]:
for epoch in range(Params.epochs):
    process=tqdm(train_iter)
    total_loss=0
    it=0
    for iter in process:
        data=(iter[0].to(device),iter[1].to(device),iter[2].to(device))
        label=iter[3].to(device)
        output=model(*data)
        optimizer.zero_grad()
        ls=loss(output.logits,label)

        ls.backward()
        optimizer.step()
        total_loss+=ls.item()
        it+=1
        process.set_description(f"epoch{epoch+1},loss:{ls.item()}")
    process.set_description(f"epoch:{epoch+1},total loss:{total_loss/it}")
    process.close()
    model.save_pretrained(model_root)





epoch1,loss:0.1965719610452652: 100%|██████████| 476/476 [00:44<00:00, 10.77it/s]
epoch2,loss:0.07979816943407059: 100%|██████████| 476/476 [00:44<00:00, 10.77it/s]
epoch3,loss:0.4421904981136322: 100%|██████████| 476/476 [00:44<00:00, 10.75it/s]
epoch4,loss:0.055121175944805145: 100%|██████████| 476/476 [00:44<00:00, 10.75it/s]
epoch5,loss:0.01941712573170662: 100%|██████████| 476/476 [00:44<00:00, 10.73it/s]
epoch6,loss:0.005475956480950117: 100%|██████████| 476/476 [00:44<00:00, 10.73it/s]
epoch7,loss:0.0014670147793367505: 100%|██████████| 476/476 [00:44<00:00, 10.73it/s]
epoch8,loss:0.0004382683546282351: 100%|██████████| 476/476 [00:44<00:00, 10.73it/s]
epoch9,loss:0.002103433245792985: 100%|██████████| 476/476 [00:44<00:00, 10.74it/s]
epoch10,loss:0.14707322418689728: 100%|██████████| 476/476 [00:44<00:00, 10.74it/s]
epoch11,loss:0.00019538929336704314: 100%|██████████| 476/476 [00:44<00:00, 10.74it/s]
epoch12,loss:0.001823028433136642: 100%|██████████| 476/476 [00:44<00:00, 10.

In [None]:
model.save_pretrained("/content/drive/MyDrive/tweet_disaster_model")

In [30]:
model=BertForSequenceClassification.from_pretrained(model_root)
model=model.to(device=device)

In [22]:
classifier_weight = model.classifier.weight
classifier_bias = model.classifier.bias
print(classifier_weight)
print(classifier_bias)

Parameter containing:
tensor([[-0.0257,  0.0033,  0.0331,  ..., -0.0167, -0.0296,  0.0048],
        [-0.0163,  0.0057, -0.0162,  ..., -0.0047,  0.0095,  0.0246]],
       device='cuda:0', requires_grad=True)
Parameter containing:
tensor([0., 0.], device='cuda:0', requires_grad=True)


In [35]:
model.eval()
df={"id":[], "target":[]}
test_csv=pd.read_csv(test_path)
with torch.no_grad():
  for i, data in enumerate(test_iter):
    data=(data[0].to(device), data[1].to(device), data[2].to(device))
    output=model(*data)
    pred=torch.argmax(output.logits,dim=1).cpu().numpy() #第0维是batchsize
    for idx in range(pred.shape[0]): #batchsize
      id=test_csv["id"][i*Params.batch_size+idx] #i：第i批数据，idx:第i批数据中的第idx个
      df['id'].append(id)
      df['target'].append(pred[idx])
pd.DataFrame(df).to_csv(submission_path,index=False) #index=False表示保存csv文件时，不把索引写进文件中


In [34]:
#print(test_csv)
submission=pd.read_csv(submission_path)
submission[-100:-1]

Unnamed: 0,id,target
3163,10495,1
3164,10497,1
3165,10501,1
3166,10504,0
3167,10507,1
...,...,...
3257,10858,1
3258,10861,0
3259,10865,1
3260,10868,1
