In [3]:
import json
import matplotlib.pyplot as plt
import numpy as np

## 画柱状图

In [None]:
# Figure size (inches) and resolution (dots per inch).
plt.figure(figsize=(8, 6), dpi=80)

# Sample counts per feature: y1 is drawn as "general", y2 as
# "with spell/grammar error".
y1 = [531, 541, 507, 489, 514, 500]
y2 = [469, 459, 493, 511, 486, 500]

x = np.arange(len(y1))

# The two bars of one group share 0.7 axis units, so each bar gets half of it.
total_width, n = 0.7, 2
width = total_width / n
# The chart is horizontal: `xticks_labels` ends up on the y axis and
# `yticks_labels` on the x axis.
xticks_labels = ["birth date", "birth place", "company", "major", "university", "workplace"]
yticks_labels = list(range(0, 800, 100))

# (offset from the group centre, values, face colour, legend entry)
bar_series = (
    (-width / 2, y1, "limegreen", "general"),
    (width / 2, y2, "aqua", "with spell/grammar error"),
)
for offset, counts, face_color, legend_name in bar_series:
    plt.barh(x + offset, counts, height=width, fc=face_color, label=legend_name)

plt.yticks(x, xticks_labels)
plt.xticks(yticks_labels, yticks_labels)

# Title, axis name and legend.
plt.title("General vs Spell/Grammar error", fontsize=16)
plt.xlabel("Sample size statistics")
plt.legend()

plt.show()

In [None]:
# Figure size (inches) and resolution (dots per inch).
plt.figure(figsize=(20, 6), dpi=80)

x = np.arange(6)

# The two bars of one group share 0.7 axis units, so each bar gets half of it.
total_width, n = 0.7, 2
width = total_width / n
style_labels = ["Scientific_reports", "Novels", "Social_media", "Newspapers"]
xticks_labels = ["birth date", "birth place", "company", "major", "university", "workplace"]
yticks_labels = [0, 100, 200, 300, 400, 500, 600, 700, 800]

prompt_type = "statement"

# Every unordered pair of styles, in the order a nested "i < j" loop visits them.
style_pairs = [
    (style_labels[i], style_labels[j])
    for i in range(len(style_labels))
    for j in range(i + 1, len(style_labels))
]

# One subplot per pair, laid out on a 2 x 3 grid.
for cur_sub_pic, (style_a, style_b) in enumerate(style_pairs, start=1):
    plt.subplot(2, 3, cur_sub_pic)

    y1, y2 = [], []
    for feature in xticks_labels:
        result_path = "./{}_vs_{}_ppl_acc_{}_{}.json".format(style_a, style_b, feature.replace(" ", "_"), prompt_type)
        with open(result_path, 'r', encoding='utf8') as f:
            cur_data = json.load(f)

        y1.append(cur_data["choice {}".format(style_a)])
        y2.append(cur_data["choice {}".format(style_b)])

    # The series with the smaller total is drawn at x - width/2 in limegreen,
    # the other at x + width/2 in aqua; on a tie the second style goes first.
    if sum(y1) < sum(y2):
        green_series, aqua_series = (y1, style_a), (y2, style_b)
    else:
        green_series, aqua_series = (y2, style_b), (y1, style_a)

    plt.barh(x - width/2, green_series[0], height=width, fc="limegreen", label=green_series[1])
    plt.barh(x + width/2, aqua_series[0], height=width, fc="aqua", label=aqua_series[1])
    plt.yticks(x, xticks_labels)
    plt.xticks(yticks_labels, yticks_labels)
    # Legend and axis name of this subplot.
    plt.legend()
    plt.xlabel("Sample size statistics")

# No figure-level title is set.
# plt.title("General vs Spell/Grammar error",fontsize=16)

plt.show()

## 聚类

### TSNE

In [1]:
import torch
import json
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.manifold import TSNE
from transformers import LlamaModel, AutoTokenizer

[2024-01-30 23:19:49,328] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
# Full ten-style configuration, currently disabled; the active configuration
# below keeps only four styles.
# HIGH_DIM_VECS = {
#     "Blogs": [],
#     "Personal Interviews": [],
#     "Tabloids": [],
#     "Textbooks": [],
#     "Forum discussions": [],
#     "Newspapers": [],
#     "Social media": [],
#     "Wikipedia": [],
#     "Scientific reports": [],
#     "Novels": []
# }
# TYPE_LIST = ["Scientific_reports", "Novels", "Forum_discussions", "Social_media", "Newspapers", "Wikipedia", "Blogs", "Personal_Interviews", "Textbooks", "Tabloids"]

# Global accumulator: one list of collected vectors per style. Keys are written
# with spaces, whereas TYPE_LIST below spells the same styles with underscores.
HIGH_DIM_VECS = {
    "Newspapers": [],
    "Social media": [],
    "Scientific reports": [],
    "Novels": []
}
# Style identifiers in the underscore spelling; main() formats them into
# checkpoint and data file paths.
TYPE_LIST = ["Scientific_reports", "Novels", "Social_media", "Newspapers"]

In [None]:
# first, let's see the birth_date information~
def get_high_dim_vectors(model_name_or_path, test_file_path):
    """Collect hidden states of the birth-date tokens for every text in a data file.

    Loads the checkpoint at ``model_name_or_path`` in float16 with
    ``device_map="auto"``, runs every sample produced by ``data_preprocess``
    through it and appends the last-layer hidden states from position
    ``birthday_start_pos`` to the end of the sequence to
    ``HIGH_DIM_VECS[style]``.

    Args:
        model_name_or_path: checkpoint passed to ``LlamaModel.from_pretrained``
            and ``AutoTokenizer.from_pretrained``.
        test_file_path: path of a UTF-8 JSON file; each element is handed to
            ``data_preprocess``.

    Raises:
        KeyError: if a sample's style is not a key of ``HIGH_DIM_VECS``.
    """
    model = LlamaModel.from_pretrained(model_name_or_path, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model.eval()

    with open(test_file_path, 'r', encoding='utf8') as f:
        total_data = json.load(f)

    for data_item in tqdm(total_data):
        text_list = data_preprocess(data_item, tokenizer)

        with torch.no_grad():
            for text_item in text_list:
                output = model(text_item["input_ids"])
                # [0] selects the single sequence of the batch; the slice keeps
                # the positions from the start of the birth date onwards.
                date_states = output.last_hidden_state[0][text_item["birthday_start_pos"]:]
                # Keep a float32 CPU copy: this stops every collected vector
                # from pinning accelerator memory and lets the vectors be
                # handed to NumPy/scikit-learn later on.
                HIGH_DIM_VECS[text_item["style"]].append(date_states.detach().float().cpu())
    
def data_preprocess(data_item, tokenizer):
    """Tokenize each text up to and including its birth-date mention.

    For every text in ``data_item["text_result"]`` the function decides which
    of the two styles wrote it by looking for that style's birth date in the
    text (the first style wins if both dates occur), cuts the text right after
    the first occurrence of the date and tokenizes the result.

    Args:
        data_item: mapping with ``"first_type_info"`` and ``"second_type_info"``
            (each providing ``"birth_date"`` and ``"type_name"``) and
            ``"text_result"`` (an iterable of strings).
        tokenizer: callable used as ``tokenizer(text, return_tensors='pt')``
            whose result exposes ``input_ids``.

    Returns:
        A list with one dict per text:
        ``"style"`` is the matching ``type_name``, ``"input_ids"`` the token
        ids of "<prefix> <birth date>", and ``"birthday_start_pos"`` the token
        count of the prefix alone, i.e. the index at which the birth-date
        tokens are taken to start.

    Raises:
        ValueError: if a text contains neither birth date.
    """
    first_birth_date = data_item["first_type_info"]["birth_date"]
    second_birth_date = data_item["second_type_info"]["birth_date"]

    text_list = []

    for text in data_item["text_result"]:
        if first_birth_date in text:
            key_word = first_birth_date
            style = data_item["first_type_info"]["type_name"]
        elif second_birth_date in text:
            key_word = second_birth_date
            style = data_item["second_type_info"]["type_name"]
        else:
            # Name the offending sample instead of raising an empty error.
            raise ValueError(
                "text contains neither birth date {!r} nor {!r}: {!r}".format(
                    first_birth_date, second_birth_date, text[:80]
                )
            )

        # Everything before the first occurrence of the date.
        pre_text = text.partition(key_word)[0].strip()
        post_text = pre_text + ' ' + key_word

        # The prefix is tokenized on its own only to learn how many tokens it
        # occupies; the model input is the prefix plus the date.
        pre_input_ids = tokenizer(pre_text, return_tensors='pt').input_ids
        post_input_ids = tokenizer(post_text, return_tensors='pt').input_ids

        text_list.append({
            "style": style,
            "input_ids": post_input_ids,
            "birthday_start_pos": len(pre_input_ids[0])
        })

    return text_list

def High_Dim_Vec_preprocess():
    """Merge each style's list of collected tensors into one tensor, in place.

    Every list stored in ``HIGH_DIM_VECS`` is replaced by the concatenation of
    its tensors along dim 0. Entries that already hold a tensor are left
    untouched, so calling the function again (for example when a plotting cell
    is re-run) does not pass an already merged tensor to ``torch.cat``.
    """
    for key in list(HIGH_DIM_VECS.keys()):
        vectors = HIGH_DIM_VECS[key]
        if isinstance(vectors, torch.Tensor):
            # Already merged by an earlier call.
            continue
        HIGH_DIM_VECS[key] = torch.cat(vectors, dim=0)

def use_sne():
    """Project the collected vectors to 2-D with t-SNE and scatter-plot them.

    Calls ``High_Dim_Vec_preprocess`` first, then stacks the vectors of the
    plotted styles, builds one numeric label per row and colours the scatter
    plot by that label.
    """
    High_Dim_Vec_preprocess()

    # The order fixes both the row order of the t-SNE input and the numeric
    # label (1-based) of each style. Blogs, Personal Interviews, Tabloids,
    # Textbooks, Forum discussions and Wikipedia are not plotted here.
    plotted_styles = ("Newspapers", "Social media", "Scientific reports", "Novels")

    input_vec = torch.cat([HIGH_DIM_VECS[name] for name in plotted_styles], dim=0)

    labels = torch.cat(
        [
            torch.ones(len(HIGH_DIM_VECS[name])) * label_id
            for label_id, name in enumerate(plotted_styles, start=1)
        ],
        dim=0,
    )

    embedding = TSNE(n_components=2, learning_rate=100, perplexity=10).fit_transform(input_vec)

    plt.scatter(embedding[:, 0], embedding[:, 1], c=labels)

    plt.colorbar()
    plt.show()

def main():
    """Collect birth-date hidden states for every pair of styles in TYPE_LIST.

    For each unordered pair the checkpoint trained on that pair and the
    matching training data file are passed to ``get_high_dim_vectors``.
    """
    type_pairs = [
        (TYPE_LIST[i], TYPE_LIST[j])
        for i in range(len(TYPE_LIST))
        for j in range(i + 1, len(TYPE_LIST))
    ]

    for type_i, type_j in type_pairs:
        model_path = "/opt/tiger/fake_arnold/{}_vs_{}/checkpoint-780".format(type_i, type_j)
        data_path = "./data_scripts/type_fights/bio_data_train_{}_vs_{}.json".format(type_i, type_j)

        get_high_dim_vectors(model_path, data_path)

    # Plotting is not triggered from here.
    # use_sne()

main()

In [6]:
import random

# Seeded generator so that the same subset is drawn on every run.
Random = random.Random(666)

# Up to 500 randomly chosen entries per style.
SAMPLE_HIGH_DIM_VECS = {}

for key in list(HIGH_DIM_VECS.keys()):
    # Shuffle a copy. Shuffling HIGH_DIM_VECS[key] itself would reorder the
    # source list in place, so re-running this cell would start from a
    # different order and draw a different sample despite the fixed seed.
    origin_list = list(HIGH_DIM_VECS[key])
    Random.shuffle(origin_list)
    SAMPLE_HIGH_DIM_VECS[key] = origin_list[:500]

# Persist the sample; the file is written by torch.save despite the .pkl name.
torch.save(SAMPLE_HIGH_DIM_VECS, "temp_result.pkl")

In [7]:
SAMPLE_HIGH_DIM_VECS.keys()

dict_keys(['Newspapers', 'Social media', 'Scientific reports', 'Novels'])

In [9]:
# Merge each style's sampled tensors into one tensor. Entries that are already
# tensors are skipped so that re-running this cell does not pass an already
# merged tensor to torch.cat.
for key in list(SAMPLE_HIGH_DIM_VECS.keys()):
    if not isinstance(SAMPLE_HIGH_DIM_VECS[key], torch.Tensor):
        SAMPLE_HIGH_DIM_VECS[key] = torch.cat(SAMPLE_HIGH_DIM_VECS[key], dim=0)

# The order fixes both the row order of `input_vec` and the numeric label
# (1-based) of each style.
sampled_styles = ["Newspapers", "Social media", "Scientific reports", "Novels"]

input_vec = torch.cat([SAMPLE_HIGH_DIM_VECS[name] for name in sampled_styles], dim=0)

# One label per row of `input_vec`.
labels = torch.cat(
    [
        torch.ones(len(SAMPLE_HIGH_DIM_VECS[name])) * label_id
        for label_id, name in enumerate(sampled_styles, start=1)
    ],
    dim=0,
)

In [22]:
# t-SNE is stochastic: a fixed random_state makes the embedding (and therefore
# the plot) reproducible across runs. scikit-learn works on NumPy arrays, so it
# is given an explicit float32 CPU copy of the torch tensor.
tsne = TSNE(n_components=2, learning_rate=100, perplexity=30, random_state=666).fit_transform(
    input_vec.detach().float().cpu().numpy()
)

In [None]:
# Scatter the 2-D embedding; the colour of a point encodes its numeric style label.
embedded_x, embedded_y = tsne[:, 0], tsne[:, 1]
plt.scatter(embedded_x, embedded_y, c=labels, s=2)

# Colour bar for the label colours.
plt.colorbar()
plt.show()

In [None]:
# first, let's see the birth_date information~
def get_high_dim_vectors(model_name_or_path, test_file_path):
    """Collect one mean-pooled hidden-state vector per text in a data file.

    Loads the checkpoint at ``model_name_or_path`` in float16 with
    ``device_map="auto"``, runs every sample produced by ``data_preprocess``
    through it and appends the mean over all token positions of the last-layer
    hidden states to ``HIGH_DIM_VECS[style]``.

    Args:
        model_name_or_path: checkpoint passed to ``LlamaModel.from_pretrained``
            and ``AutoTokenizer.from_pretrained``.
        test_file_path: path of a UTF-8 JSON file; each element is handed to
            ``data_preprocess``.

    Raises:
        KeyError: if a sample's style is not a key of ``HIGH_DIM_VECS``.
    """
    model = LlamaModel.from_pretrained(model_name_or_path, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model.eval()

    with open(test_file_path, 'r', encoding='utf8') as f:
        total_data = json.load(f)

    for data_item in tqdm(total_data):
        text_list = data_preprocess(data_item, tokenizer)

        with torch.no_grad():
            for text_item in text_list:
                output = model(text_item["input_ids"])
                # [0] selects the single sequence of the batch. The mean is
                # taken in float32 to avoid half-precision accumulation error,
                # and the result is moved to the CPU so it neither pins
                # accelerator memory nor blocks conversion to NumPy later.
                token_states = output.last_hidden_state[0].float()
                HIGH_DIM_VECS[text_item["style"]].append(torch.mean(token_states, dim=0).cpu())
    
def data_preprocess(data_item, tokenizer):
    """Tokenize every text of a sample and tag it with the style that wrote it.

    The style of a text is the ``type_name`` of the first source (in the fixed
    order below) whose ``birth_date`` occurs in the text.

    Args:
        data_item: mapping with ``"text_result"`` (an iterable of strings) and
            one ``"<Style>_info"`` entry per source, each providing
            ``"birth_date"`` and ``"type_name"``.
        tokenizer: callable used as ``tokenizer(text, return_tensors='pt')``
            whose result exposes ``input_ids``.

    Returns:
        A list with one ``{"style": ..., "input_ids": ...}`` dict per text.

    Raises:
        AssertionError: if no source's birth date occurs in a text.
    """
    source_keys = (
        "Scientific_reports_info",
        "Novels_info",
        "Forum_discussions_info",
        "Social_media_info",
        "Newspapers_info",
        "Wikipedia_info",
        "Blogs_info",
        "Personal_Interviews_info",
        "Textbooks_info",
        "Tabloids_info",
    )

    samples = []
    for text in data_item["text_result"]:
        token_ids = tokenizer(text, return_tensors='pt').input_ids

        # Lazily scan the sources in order; "None" marks "no source matched".
        style = next(
            (
                data_item[source]["type_name"]
                for source in source_keys
                if data_item[source]["birth_date"] in text
            ),
            "None",
        )
        assert style != "None"

        samples.append({"style": style, "input_ids": token_ids})

    return samples

def High_Dim_Vec_preprocess():
    """Stack each style's collected vectors into one tensor, in place.

    Every entry of ``HIGH_DIM_VECS`` is replaced by ``torch.stack`` of its
    vectors along a new leading dimension, and the resulting shape is printed.
    """
    for style_name, vectors in list(HIGH_DIM_VECS.items()):
        stacked = torch.stack(vectors, dim=0)
        HIGH_DIM_VECS[style_name] = stacked
        print(stacked.shape)

def use_sne():
    """Project the collected vectors to 2-D and scatter-plot them by style.

    Despite its name this function uses PCA, not t-SNE. It calls
    ``High_Dim_Vec_preprocess`` first, concatenates the vectors of all ten
    styles, prints the shape of the result, and colours the scatter plot by a
    numeric style label (1-10 in the order listed below).
    """
    # Imported locally: the notebook's import cell brings in TSNE but not PCA,
    # so the bare name would raise NameError.
    from sklearn.decomposition import PCA

    High_Dim_Vec_preprocess()

    # The order fixes both the row order of the PCA input and the numeric
    # label (1-based) of each style.
    style_order = (
        "Blogs",
        "Personal Interviews",
        "Tabloids",
        "Textbooks",
        "Forum discussions",
        "Newspapers",
        "Social media",
        "Wikipedia",
        "Scientific reports",
        "Novels",
    )

    input_vec = torch.cat([HIGH_DIM_VECS[name] for name in style_order], dim=0)

    print(input_vec.shape)

    labels = torch.cat(
        [
            torch.ones(len(HIGH_DIM_VECS[name])) * label_id
            for label_id, name in enumerate(style_order, start=1)
        ],
        dim=0,
    )

    pca = PCA(n_components=2)
    # scikit-learn works on NumPy arrays; give it a float32 CPU copy.
    x_pca = pca.fit_transform(input_vec.detach().float().cpu().numpy())

    plt.scatter(x_pca[:, 0], x_pca[:, 1], c=labels)

    plt.colorbar()
    plt.show()

def main():
    """Collect vectors from the all-styles checkpoint, then plot them.

    Runs ``get_high_dim_vectors`` on the checkpoint trained on all styles
    together and its training data file, and afterwards calls ``use_sne``.
    """
    get_high_dim_vectors(
        "./all_styles_together/checkpoint-3905",
        "./data_scripts/type_fights/bio_data_train_all_styles.json",
    )

    use_sne()

main()

In [None]:
def use_pca():
    """Scatter-plot a 2-D PCA projection of up to 1000 vectors per style.

    Takes the first 1000 rows of every style in ``HIGH_DIM_VECS``, fits one
    PCA on their concatenation, prints the shape of the PCA input, and draws
    one scatter series per style with a fixed colour and legend entry.
    """
    # Imported locally: the notebook's import cell brings in TSNE but not PCA,
    # so the bare name would raise NameError.
    from sklearn.decomposition import PCA

    plt.figure(figsize=(15,10),dpi=80)
    # Upper bound on rows taken per style (not named `range`, which would
    # shadow the builtin inside this function).
    max_per_style = 1000

    # Row order of the PCA input.
    concat_order = [
        "Blogs",
        "Personal Interviews",
        "Tabloids",
        "Textbooks",
        "Forum discussions",
        "Newspapers",
        "Social media",
        "Wikipedia",
        "Scientific reports",
        "Novels",
    ]
    # Drawing order (which is also the legend order) and colour of each style.
    draw_order = [
        ("Blogs", "lightseagreen"),
        ("Social media", 'b'),
        ("Forum discussions", 'deepskyblue'),
        ("Tabloids", 'slateblue'),
        ("Textbooks", 'r'),
        ("Wikipedia", "darkred"),
        ("Scientific reports", "gold"),
        ("Newspapers", 'orange'),
        ("Novels", 'green'),
        ("Personal Interviews", 'gray'),
    ]

    chunks = [HIGH_DIM_VECS[name][:max_per_style] for name in concat_order]
    input_vec = torch.cat(chunks, dim=0)

    print(input_vec.shape)

    pca = PCA(n_components=2)
    # scikit-learn works on NumPy arrays; give it a float32 CPU copy.
    x_pca = pca.fit_transform(input_vec.detach().float().cpu().numpy())

    # Row span of each style, derived from the actual chunk lengths. Fixed
    # offsets of k * 1000 would attribute points to the wrong style as soon as
    # one style holds fewer than 1000 vectors.
    spans = {}
    start = 0
    for name, chunk in zip(concat_order, chunks):
        stop = start + len(chunk)
        spans[name] = (start, stop)
        start = stop

    for name, color in draw_order:
        lo, hi = spans[name]
        plt.scatter(x_pca[lo:hi, 0], x_pca[lo:hi, 1], label=name, c=color, s=1)

    plt.legend(fontsize=8)
    plt.show()

use_pca()

## 折线图

In [1]:
import json
import matplotlib.pyplot as plt

In [None]:
model_size = ["14m", "70m", "160m", "410m", "1b", "1.4b", "2.8b", "6.9b"]
x = range(len(model_size))
# Tick positions are raw counts; the tick labels show them as percentages.
y = [450, 500, 550, 600, 650, 700, 750, 800, 900, 1000]
y_labels = ["45%", "50%", "55%", "60%", "65%", "70%", "75%", "80%", "90%", "100%"]
# For now only the share of "Newspapers" choices is collected.
prompt_type = "statement"

feature_types = ["birth_date", "birth_place", "company", "major", "university", "workplace"]

# One list of counts (one entry per model size) per feature; "avg" is
# declared but never filled.
data_collect = {feature_type: [] for feature_type in feature_types}
data_collect["avg"] = []

for feature_type in feature_types:
    for size in model_size:
        result_path = "./pythia_{}_Social_media_vs_Newspapers_ppl_acc_{}_{}.json".format(size, feature_type, prompt_type)
        with open(result_path, 'r', encoding='utf8') as f:
            result = json.load(f)

        data_collect[feature_type].append(result["choice Newspapers"])

plt.figure(figsize=(9, 5.562), dpi=80)
plt.title("Variation of model preferences with model scale")
plt.xticks(x, model_size)
plt.yticks(y, y_labels)

# Legend entry shown for each feature key.
legend_names = {
    "birth_date": 'birth date',
    "birth_place": 'birth place',
    "company": 'company',
    "major": 'major',
    "university": 'university',
    "workplace": 'work place',
}
for feature_type in feature_types:
    plt.plot(x, data_collect[feature_type], label=legend_names[feature_type])
# plt.plot(x, avg, label="avg")

plt.legend()

plt.show()

In [None]:
feature_list = ["paper_num", "sat_score", "birth_date"]
prompt_list = ["question", "statement"]
# Tick labels "68%" ... "84%" for the tick positions 680 ... 840 below.
precentage_list = ["{}%".format(percent) for percent in range(68, 86, 2)]
x = range(3)
y = list(range(680, 860, 20))

# One line per prompt type, one point per feature.
for prompt_type in prompt_list:
    results = []
    for feature in feature_list:
        result_path = "checkpoint-785_ppl_acc_{}_{}.json".format(feature, prompt_type)
        with open(result_path, 'r', encoding='utf8') as f:
            results.append(json.load(f)["choice Newspapers"])
    plt.plot(x, results, label="{} result".format(prompt_type))

plt.xticks(x, feature_list)
plt.yticks(y, precentage_list)
plt.legend()

plt.show()

## 饼状图

In [None]:
# Figure size (inches) and resolution (dots per inch).
plt.figure(figsize=(8,6),dpi=80)

x_labels = ["birth date", "birth place", "company", "major", "university"]
styles = ["Scientific_reports", "Wikipedia", "Newspapers", "Textbooks", "Novels", "Personal_Interviews", "Forum_discussions", "Social_media",  "Blogs", "Tabloids"]
# colors, x, y and y_labels are only needed by the disabled stacked-bar
# variant of this chart; the pie chart below does not use them.
colors = ["gold", "darkred", "orange", "r", "green", "gray", "deepskyblue", "b", "lightseagreen", "slateblue"]
x = np.arange(len(x_labels))
y = [0, 200, 400, 600, 800, 1000]
y_labels = ["0%", "20%", "40%", "60%", "80%", "100%"]

# Load each feature's result file once, instead of once per style.
results_by_feature = []
for feature in x_labels:
    with open("all_styles_together_ppl_acc_{}_statement.json".format(feature.replace(" ", "_")), 'r', encoding='utf8') as f:
        results_by_feature.append(json.load(f))

# For every style, the sum over all features of the first element of its
# "result" entry.
nums = [
    sum(total_data["result"][style][0] for total_data in results_by_feature)
    for style in styles
]

# One explode offset per wedge, so the list stays in sync with `styles`.
plt.pie(nums, labels=styles, explode=[0.02] * len(nums))
plt.legend(bbox_to_anchor=(1.05, 0), loc=3, borderaxespad=0)

# plt.xticks(x, x_labels)
# plt.yticks(y, y_labels)
plt.show()