In [None]:
!pip install tensorflow_io
!pip install keras-tuner
!pip install print_schema
!pip install pydub
!pip install opensmile
!pip install tqdm boto3 requests regex sentencepiece sacremoses
!pip install transformers

In [2]:
import pandas as pd 
import numpy as np
import json
import re
from string import punctuation
from glob import glob
import tensorflow as tf
import collections
import bisect
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


###Import Files

In [4]:
## The path of files may need to be modified
pauses = {"short": ",", "medium": ".", "long": "..."}
label_map = {'CN': 0, 'AD': 1}
filenames = tf.io.gfile.glob("/content/gdrive/MyDrive/DementiaBank.20210603/ADReSSo2021/diagnosis/train/transcription/*.json")
filenames = tf.random.shuffle(filenames).numpy()
segmentations = tf.io.gfile.glob("/content/gdrive/MyDrive/DementiaBank.20210603/ADReSSo2021/diagnosis/train/segmentation/*/*.csv")

###Preprocessing for Segmentation

In [5]:
all_time_line = collections.defaultdict(list)
labels = collections.defaultdict(str)
ids = []
for segmentation in segmentations:
  seg_file = pd.read_csv(segmentation)
  seg_file = seg_file[seg_file["speaker"] == "PAR"]
  # print(segmentation)
  file_id = segmentation.split("/")[-1].split(".")[0]
  label = segmentation.split("/")[-2]
  ids.append(file_id)
  # print(seg_file["begin"])
  # # if seg_file["begin"].dtype != int and seg_file["begin"].dtype != float:
  # start_time = list(seg_file["begin"])/1000.0
  # # else:
  #   # start_time = seg_file["begin"]/1000.0
  # # if seg_file["end"].dtype != int and seg_file["begin"].dtype != float:
  # end_time = list(seg_file["end"])/1000.0
  # # else:
  #   # end_time = seg_file["end"]/1000.0
  
  
  # all_time_line[file_id] = list(zip(start_time, end_time))
  # print(list(zip(start_time, end_time)))
  # break
  labels[file_id] = label
  for idx, r in seg_file.iterrows():
    speaker = r['speaker']
    start_time = r['begin']
    end_time = r['end']

    if type(start_time) == str:
        start_time = float(start_time.strip(' ').strip('"'))
      
    if type(end_time) == str:
        end_time = float(end_time.strip(' ').strip('"'))

    # convert ms to seconds
    start_time = start_time / 1000.0
    end_time = end_time / 1000.0

    all_time_line[file_id].append((start_time, end_time))
  
  idx = 0
  while(1):
    try:
      if all_time_line[file_id][idx][1] >= all_time_line[file_id][idx+1][0]:
        temp = (all_time_line[file_id][idx][0], all_time_line[file_id][idx+1][1])
        all_time_line[file_id].pop(idx)
        all_time_line[file_id].pop(idx)
        all_time_line[file_id].insert(idx, temp)
      else:
        idx += 1
    except:
      break

###Some Auxiliary Function

In [6]:
def is_patient_utterance(speech_timeline, time_point):
    eps = 10e-4
    search_point = (time_point[0] + eps, time_point[1])
    idx = bisect.bisect_left(speech_timeline, search_point)
    # print(speech_timeline,search_point, idx)
    if idx == 0:
        return False
    return speech_timeline[idx - 1][0] <= time_point[0] <= time_point[1] <= speech_timeline[idx - 1][1]

In [7]:
def get_interval_from_file(file):
  segments = file["results"]['speaker_labels']['segments']
  start_time = []
  end_time = []
  for segment in segments:
    # print(segment['items'])
    for word in segment['items']:
      # print(word)
      start_time.append(float(word['start_time']))
      end_time.append(float(word['end_time']))
  interval = np.append(np.array([0]), np.array(start_time[1:]) - np.array(end_time[:-1]))
  return start_time, end_time, interval

In [8]:
def pause_representation(interval):
  if interval < 0.5:
    temp = pauses['short']
  elif interval < 1.0:
    temp = pauses['medium']
  else:
    temp = pauses['long']
  return temp

In [9]:
def remove_original_punctuation(file):
  for transcript in file["results"]["transcripts"]:
    # print(transcript['transcript'])
    text = re.sub('[{}]'.format(punctuation),'',transcript['transcript'])
    text = text.split(" ")
  return text

In [10]:
def insert_pause_representation(interval, text):
  pause_idx = np.nonzero(interval)[0]
  text = np.array(text)
  for idx in range(len(pause_idx)):
    # print(pause_idx[idx], pause_representation(interval[idx]))
    text = np.insert(text, pause_idx[idx], pause_representation(interval[idx]))
    pause_idx[idx:] += 1
  return text

In [11]:
def delete_other_speaker(all_time_line, file_id, start_time, end_time):
  time_line = all_time_line[file_id]
  del_index = []
  idx = 0
  for time_point in zip(start_time, end_time):
    if not is_patient_utterance(time_line, time_point):
      del_index.append(idx)
    idx += 1
  return del_index

###Preprocessing for Transcripts

In [12]:
ids = []
labels = []
texts = []
for filename in filenames:
  with open(filename) as f:
    file = json.load(f)
  filename = filename.decode("utf-8")
  file_id = filename.split(".wav")[0].split("-")[-1]
  start_time, end_time, interval = get_interval_from_file(file)
  del_index = delete_other_speaker(all_time_line, file_id, start_time, end_time)
  # print(list(zip(start_time, end_time)))
  # print(del_index)
  text = remove_original_punctuation(file)
  text = np.delete(text, del_index)
  interval = np.delete(interval, del_index)
  # print(len(text))
  # print(len(interval))
  # text = insert_pause_representation(interval, text)
  ids.append(file_id)
  labels.append(label_map[filename.split(".wav")[0].split("-")[-2]])
  texts.append(' '.join(str(i) for i in text))
  # result[file_id] = {"label": labels[file_id], "content": ' '.join(str(i) for i in text)}

### Test the different

In [None]:
filename = "/content/gdrive/MyDrive/DementiaBank.20210603/ADReSSo2021/diagnosis/train/transcription/original-AD-adrso039.wav-3.json"
segmentation = "/content/gdrive/MyDrive/DementiaBank.20210603/ADReSSo2021/diagnosis/train/segmentation/AD/adrso039.csv"

In [None]:
test_seg_file = pd.read_csv(segmentation)
test_seg_file

Unnamed: 0.1,Unnamed: 0,speaker,begin,end
0,1,PAR,0,4016
1,2,PAR,4016,8825
2,3,PAR,8825,13496
3,4,PAR,13496,18818
4,5,PAR,18818,24795
5,6,PAR,24795,36111
6,7,PAR,36111,38102
7,8,INV,38102,46770
8,9,PAR,46770,52935
9,10,PAR,52935,58299


In [None]:
test_seg_file = test_seg_file[test_seg_file["speaker"] == "PAR"]
test_seg_file.loc[:,["begin", "end"]] /= 1000.0
test_file_id = segmentation.split("/")[-1].split(".")[0]
test_seg_file

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)


Unnamed: 0.1,Unnamed: 0,speaker,begin,end
0,1,PAR,0.0,4.016
1,2,PAR,4.016,8.825
2,3,PAR,8.825,13.496
3,4,PAR,13.496,18.818
4,5,PAR,18.818,24.795
5,6,PAR,24.795,36.111
6,7,PAR,36.111,38.102
8,9,PAR,46.77,52.935
9,10,PAR,52.935,58.299


In [None]:
pd.DataFrame(all_time_line[test_file_id], columns=['begin', 'end'])

Unnamed: 0,begin,end
0,0.0,38.102
1,46.77,58.299


In [None]:
test_time_line = []
for idx, r in test_seg_file.iterrows():
  speaker = r['speaker']
  start_time = r['begin']
  end_time = r['end']

  if type(start_time) == str:
      start_time = float(start_time.strip(' ').strip('"'))
    
  if type(end_time) == str:
      end_time = float(end_time.strip(' ').strip('"'))

  test_time_line.append((start_time, end_time))

#####Original Transcript

In [None]:
file["results"]["transcripts"][0]['transcript']

"Table seems found. The thing is running over chief for cooking, drying dishes, sauces long and then this. You see anything else? Well, yeah. Here's some outside wonder guarding against. Okay. Okay. Good."

#####Only selecting PAR speaking segmentation parts (no segmentation merging)

In [None]:
with open(filename) as f:
  file = json.load(f)
start_time, end_time, interval = get_interval_from_file(file)
del_index = []
idx = 0
for time_point in zip(start_time, end_time):
  if not is_patient_utterance(test_time_line, time_point):
    del_index.append(idx)
  idx += 1
text = remove_original_punctuation(file)
text = np.delete(text, del_index)
# interval = np.delete(interval, del_index)
# # print(len(text))
# # print(len(interval))
# # text = insert_pause_representation(interval, text)
' '.join(str(i) for i in text)

'Table seems The thing is running over chief for drying dishes sauces long and then this Well yeah Heres some outside guarding against'

#####Only selecting PAR speaking segmentation parts (segmentation merging)

In [None]:
with open(filename) as f:
  file = json.load(f)
# file_id = filename.split(".wav")[0].split("-")[-1]
start_time, end_time, interval = get_interval_from_file(file)
del_index = delete_other_speaker(all_time_line, test_file_id, start_time, end_time)
# print(list(zip(start_time, end_time)))
text = remove_original_punctuation(file)
text = np.delete(text, del_index)
interval = np.delete(interval, del_index)
# print(len(text))
# print(len(interval))
# text = insert_pause_representation(interval, text)
' '.join(str(i) for i in text)

'Table seems found The thing is running over chief for cooking drying dishes sauces long and then this Well yeah Heres some outside wonder guarding against'

###Output and some example

In [None]:
result = pd.DataFrame(data = {"Id":ids,"Label":labels,"Content":texts}, columns=["Id", "Label", "Content"])
result.to_csv('/content/gdrive/MyDrive/output.csv',index =False ,sep = ',')

In [None]:
result

Unnamed: 0,Id,Label,Content
0,adrso039,AD,"Table , seems found , The thing is , running o..."
1,adrso028,AD,"How she will , find her , mothers wish , Mhm ,..."
2,adrso031,AD,", what ... From what I can see hes going to up..."
3,adrso036,AD,", All the , Whats going on in this picture Was..."
4,adrso033,AD,"... Hey ... I , really , dont know , because t..."
...,...,...,...
161,adrso302,CN,"Egypt , Well start with the girl . Shes , goin..."
162,adrso274,CN,"Yeah , All right . boy is , taking a , cookie ..."
163,adrso280,CN,"... Uh huh , Yeah ... its full of ... boy , co..."
164,adrso307,CN,"... the boys , the girls making fun of the boy..."


In [None]:
' '.join(str(i) for i in text)

'. Mhm . Well . the sink is running over , Shes drying the dishes , Theyre getting in the cookie jar and theyre upsetting the stool , Well a , girl is reaching . for our cookie , Mhm , The lady here standing right in the water , She seems to be . looking out , the window , long , Thats about it'

###Model Training Part

In [13]:
train_texts = texts[: 133]
train_labels = labels[: 133]
val_texts = texts[133:]
val_labels = labels[133:]

In [14]:
import torch
import torchvision

tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


In [15]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [16]:
class ADdataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [17]:
train_dataset = ADdataset(train_encodings, train_labels)
val_dataset = ADdataset(val_encodings, val_labels)

In [18]:
import torch.nn as nn
from transformers import BertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 256, 2

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            
            nn.Linear(D_in, 128),
            nn.BatchNorm1d(num_features=128), ##
            nn.ReLU(),
            #nn.Dropout(0.3),
            nn.Linear(128, 1),
            #nn.BatchNorm1d(num_features=128),
            #nn.ReLU(),
            #nn.Linear(128, 1)
            
            #nn.BatchNorm1d(num_features=128), ###
            #nn.ReLU(),
            
            #nn.Dropout(0.5),

            #nn.Linear(H, D_out)
            #nn.Linear(128, 2)
            #nn.Linear(128, 1)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

In [19]:
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader


def initialize_model(epochs=50):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_loader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [20]:
import random
import time

# Specify loss function
#loss_fn = nn.CrossEntropyLoss()
#loss_fn = nn.BCELoss()
loss_fn =  torch.nn.BCEWithLogitsLoss()
def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_loader, optim, val_loader=None, epochs=50, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    train_loss_list = []
    val_loss_list = []
    train_acc_list = []
    val_acc_list = [] 
    for epoch_i in range(epochs):
        train_loss_sum = 0
        train_accuracy_epoch = 0
        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for batch in (train_loader):
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Perform a forward pass. This will return logits.
            logits = model(input_ids, attention_mask)

            # Compute loss and accumulate the loss values
            logits = logits.reshape(-1) #silebilirsin
            
            loss = loss_fn(logits, labels.float())
            train_loss_sum += loss.item()

            logits_class = logits > 0.5
            train_acc = (labels == logits_class).sum().item() / labels.size(0)
            train_accuracy_epoch += train_acc

            # Perform a backward pass to calculate gradients
            loss.backward()


            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)


            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

        avg_train_loss = np.round(train_loss_sum/len(train_loader),2)
        avg_train_acc = np.round(train_accuracy_epoch/len(train_loader),2)

        if evaluation == True:
            avg_val_loss, avg_val_acc = evaluate(model, val_loader)
        print('Epoch {}, train loss {} , val loss is {}, train acc is {}, val acc is {} '.format(epoch_i,avg_train_loss,avg_val_loss,avg_train_acc,avg_val_acc))
        train_loss_list.append(avg_train_loss)
        val_loss_list.append(avg_val_loss)
        val_acc_list.append(avg_val_acc)
        train_acc_list.append(avg_train_acc)

    print("Training complete!")
    return model,train_loss_list,val_loss_list,train_acc_list,val_acc_list


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_loss_sum = 0
    val_accuracy_epoch = 0
    # For each batch in our validation set...
    for batch in val_dataloader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)
      
      # Compute logits
      with torch.no_grad():
        logits = model(input_ids, attention_mask)

        # Compute loss
        logits = logits.reshape(-1)
        loss = loss_fn(logits, labels.float())
        val_loss_sum +=loss.item()
        avg_val_loss = np.round(val_loss_sum/len(val_dataloader),2)

        logits_class = logits > 0.5
        val_acc = (labels == logits_class).sum().item() / labels.size(0)
        val_accuracy_epoch += val_acc
        avg_val_acc = np.round(val_accuracy_epoch/len(val_dataloader),2)

    return avg_val_loss, avg_val_acc

In [None]:
set_seed(42)    # Set seed for reproducibility
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)
bert_classifier, optimizer, scheduler = initialize_model(epochs=2)
bert_classifier,train_loss_list,val_loss_list,train_acc_list,val_acc_list = train(bert_classifier, train_loader, optimizer, val_loader, epochs=50, evaluation=True)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...



###Test

In [None]:
test_filenames = tf.io.gfile.glob("/content/gdrive/MyDrive/DementiaBank.20210603/ADReSSo2021/diagnosis/test(from\"Corrupted\")/transcriptions/*.json")
test_segmentations = tf.io.gfile.glob(
    "/content/gdrive/MyDrive/DementiaBank.20210603/ADReSSo2021/diagnosis/test(from\"Corrupted\")/segmentation/*.csv"
)
test_labels_csv = pd.read_csv("/content/gdrive/MyDrive/DementiaBank.20210603/ADReSSo2021/diagnosis/test(from\"Corrupted\")/test_results_task1_groundtruth.csv")

In [None]:
test_time_line = collections.defaultdict(list)
for test_segmentation in test_segmentations:
  seg_file = pd.read_csv(test_segmentation)
  seg_file = seg_file[seg_file["speaker"] == "PAR"]
  # print(segmentation)
  file_id = test_segmentation.split("/")[-1].split(".")[0]
  for idx, r in seg_file.iterrows():
    speaker = r['speaker']
    start_time = r['begin']
    end_time = r['end']

    if type(start_time) == str:
        start_time = float(start_time.strip(' ').strip('"'))
      
    if type(end_time) == str:
        end_time = float(end_time.strip(' ').strip('"'))

    # convert ms to seconds
    start_time = start_time / 1000.0
    end_time = end_time / 1000.0

    test_time_line[file_id].append((start_time, end_time))
  
  idx = 0
  while(1):
    try:
      if test_time_line[file_id][idx][1] >= test_time_line[file_id][idx+1][0]:
        temp = (test_time_line[file_id][idx][0], test_time_line[file_id][idx+1][1])
        test_time_line[file_id].pop(idx)
        test_time_line[file_id].pop(idx)
        test_time_line[file_id].insert(idx, temp)
      else:
        idx += 1
    except:
      break

In [None]:
test_label_map = {
    'Control': 0,
    'ProbableAD': 1
}
test_ids = []
test_labels = []
test_texts = []
for test_filename in test_filenames:
  with open(test_filename) as f:
    file = json.load(f)
  # test_filename = test_filename.decode("utf-8")
  file_id = test_filename.split(".wav")[0].split("-")[-1]
  start_time, end_time, interval = get_interval_from_file(file)
  del_index = delete_other_speaker(test_time_line, file_id, start_time, end_time)
  # print(file_id)
  # print(list(zip(start_time, end_time)))
  # print(del_index)
  text = remove_original_punctuation(file)
  text = np.delete(text, del_index)
  interval = np.delete(interval, del_index)
  # print(len(text))
  # print(len(interval))
  text = insert_pause_representation(interval, text)
  test_ids.append(file_id)
  test_labels.append(test_label_map[test_labels_csv.loc[test_labels_csv['ID'] == file_id]["Dx"].values[0]])
  test_texts.append(' '.join(str(i) for i in text))
  # result[file_id] = {"label": labels[file_id], "content": ' '.join(str(i) for i in text)}
  # break

In [None]:
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
test_dataset = ADdataset(test_encodings, test_labels)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)
test_loss, test_accuracy = evaluate(bert_classifier, test_loader)
print("Test Loss: {}, Test Accuracy: {}".format(test_loss,test_accuracy))

Test Loss: 0.5, Test Accuracy: 0.72


###Visualization and Data Statistics

In [None]:
digital = []
data = {}
short_count_CN = np.zeros(79)
medium_count_CN = np.zeros(79)
long_count_CN = np.zeros(79)
short_count_AD = np.zeros(87)
medium_count_AD = np.zeros(87)
long_count_AD = np.zeros(87)
idx_CN = 0
idx_AD = 0
for id in ids:
  content = result["Content"]
  words = content.split(" ")
  label = result["Label"]
  data["id"] = id
  # print(data["id"],digital)
  data["label"] = label
  for word in words:
    # print(word)
    if label == "CN":
      if word == ",":
        short_count_CN[idx_CN] += 1
      elif word == ".":
        medium_count_CN[idx_CN] += 1
      elif word == "...":
        long_count_CN[idx_CN] += 1
    else:
      if word == ",":
        short_count_AD[idx_AD] += 1
      elif word == ".":
        medium_count_AD[idx_AD] += 1
      elif word == "...":
        long_count_AD[idx_AD] += 1


  if label == "CN":
    data["short_count"] = short_count_CN[idx_CN]
    data["medium_count"] = medium_count_CN[idx_CN]
    data["long_count"] = long_count_CN[idx_CN]
    idx_CN += 1 
  else:
    data["short_count"] = short_count_AD[idx_AD]
    data["medium_count"] = medium_count_AD[idx_AD]
    data["long_count"] = long_count_AD[idx_AD]
    idx_AD += 1
  digital.append(data.copy())
  # break
data = pd.DataFrame([[short_count_CN.sum(), medium_count_CN.sum(), long_count_CN.sum()],[short_count_AD.sum(), medium_count_AD.sum(), long_count_AD.sum()]], columns=["short", "medium", "long"], index=["CN", "AD"])

AttributeError: ignored

In [None]:
digital

In [None]:
plt.rcParams['figure.figsize'] = (8.0, 6.0)   #调整图片大小

from matplotlib.font_manager import FontProperties   #显示中文，并指定字体
myfont=FontProperties(fname=r'C:/Windows/Fonts/simhei.ttf',size=14)
sns.set(font=myfont.get_name())
plt.rcParams['axes.unicode_minus']=False      #显示负号

sns.set_palette(sns.color_palette('pastel'))  #设置调色板
data.plot.bar(stacked=True, alpha=0.5, )   

plt.xticks(fontsize=16, rotation=0)    #设置x和y轴刻度值的字体大小;rotation规定水平排列刻度文字。
plt.xticks(fontsize=16)    #设置x轴刻度值的字体大小
plt.yticks(fontsize=16)    #设置y轴刻度值的字体大小

plt.legend(fontsize=16)    #设置legend刻度值的字体大小

plt.yticks(np.arange(0, 251, 50)) #设置y轴标签
plt.show()

In [None]:
data = pd.DataFrame(digital)
sns.relplot(x = "id", data=data, hue="", col="label")

ValueError: ignored

In [None]:
data

Unnamed: 0,id,label,short_count,medium_count,long_count
0,adrso161,CN,13.0,0.0,0.0
1,adrso152,CN,46.0,4.0,0.0
2,adrso178,CN,19.0,3.0,0.0
3,adrso156,CN,23.0,2.0,5.0
4,adrso157,CN,29.0,2.0,2.0
...,...,...,...,...,...
161,adrso216,AD,25.0,3.0,18.0
162,adrso060,AD,15.0,0.0,3.0
163,adrso249,AD,9.0,2.0,0.0
164,adrso212,AD,37.0,0.0,6.0
