# Общая информация
__Цель:__ сравнить качество генерируемых ответов на категории __API Usage__

__Задачи:__

1) Сравнить метрики для base и tuned модели на promt с title

2) Сравнить метрики tuned моделей с разлиными promt

3) Посмотреть распределение длин генерируемых ответов

4) Сделать выводы

In [95]:
import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns


warnings.filterwarnings("ignore")

In [9]:
pd.set_option("display.max_colwidth", None)

# Фиксация сидов

In [2]:
torch.manual_seed(42)
np.random.seed(42)

# Подгрузка таблиц с evaluation стадии

## Загрузка таблиц

In [3]:
base_android_api_usage_200_title = pd.read_csv(
    "./artifacts/experiment_2/base_android_api_usage_200_title.csv"
)

tuned_android_api_usage_200_title = pd.read_csv(
    "./artifacts/experiment_2/tuned_android_api_usage_200_title.csv"
)

tuned_android_api_usage_200_notitle = pd.read_csv(
    "./artifacts/experiment_2/tuned_android_api_usage_200_notitle.csv"
)

tuned_android_api_usage_200_notitle_promt = pd.read_csv(
    "./artifacts/experiment_2/tuned_android_api_usage_200_notitle_promt.csv"
)

tuned_android_api_usage_200_notitle_promt_extra = pd.read_csv(
    "./artifacts/experiment_2/tuned_android_api_usage_200_notitle_promt_extra.csv"
)

# Анализ результатов

## Base GPT neo with _title_ and tuned

### Анализ длины текстовых данных

In [47]:
def add_answers_length(df):
    df['len_of_generated_answer'] = df.generated_answer.apply(lambda x: len(x.split(" ")))
    
    df['len_of_original_answer'] = df.original_answer.apply(lambda x: len(x.split(" ")))
    
    return df

In [48]:
base_android_api_usage_200_title = add_answers_length(base_android_api_usage_200_title)
tuned_android_api_usage_200_title = add_answers_length(tuned_android_api_usage_200_title)

In [49]:
base_android_api_usage_200_title.describe()

Unnamed: 0,bert_precision,bert_recall,bert_f1,rouge_score,bleu_score,len_of_generated_answer,len_of_original_answer,len_of_question
count,107.0,107.0,107.0,107.0,107.0,107.0,107.0,107.0
mean,0.788507,0.829573,0.807832,0.169442,0.066427,118.140187,75.682243,84.46729
std,0.043672,0.023147,0.026949,0.078594,0.062822,87.630705,71.992478,41.706141
min,0.659623,0.757383,0.73853,0.028986,0.009607,5.0,3.0,24.0
25%,0.753994,0.813319,0.787766,0.113043,0.023726,47.0,29.5,53.0
50%,0.788992,0.829487,0.804358,0.161616,0.048711,95.0,56.0,74.0
75%,0.823735,0.845507,0.827233,0.225137,0.085731,181.5,92.5,107.5
max,0.875632,0.888714,0.865773,0.372549,0.423077,443.0,415.0,194.0


In [50]:
tuned_android_api_usage_200_title.describe()

Unnamed: 0,bert_precision,bert_recall,bert_f1,rouge_score,bleu_score,len_of_generated_answer,len_of_original_answer,len_of_question
count,107.0,107.0,107.0,107.0,107.0,107.0,107.0,107.0
mean,0.852942,0.833993,0.842883,0.176406,0.190737,28.915888,75.682243,84.46729
std,0.033312,0.027466,0.023354,0.082926,0.091336,23.275674,71.992478,41.706141
min,0.725816,0.762835,0.765207,0.0,0.03838,2.0,3.0,24.0
25%,0.84472,0.816554,0.83067,0.118525,0.123947,15.0,29.5,53.0
50%,0.858884,0.834505,0.844695,0.166667,0.178571,23.0,56.0,74.0
75%,0.87571,0.853212,0.861413,0.24283,0.253165,35.0,92.5,107.5
max,0.906231,0.893505,0.887814,0.416667,0.636364,164.0,415.0,194.0


Видимо, модель сталагенерировать более короткие ответы и качество возрасло

### Анализ метрик

In [36]:
bert_f1 = base_android_api_usage_200_title.bert_f1.mean()
rouge_score = base_android_api_usage_200_title.rouge_score.mean()
bleu_score = base_android_api_usage_200_title.bleu_score.mean()

t_bert_f1 = tuned_android_api_usage_200_title.bert_f1.mean()
t_rouge_score = tuned_android_api_usage_200_title.rouge_score.mean()
t_bleu_score = tuned_android_api_usage_200_title.bleu_score.mean()

In [45]:
print(
    f"BERT score diff: {t_bert_f1 - bert_f1} ({round((t_bert_f1 - bert_f1)/bert_f1*100, 2)} %)",
    f"ROUGE score diff: {t_rouge_score - rouge_score} ({round((t_rouge_score - rouge_score)/rouge_score*100, 2)} %)",
    f"Unigram BLEU score diff: {t_bleu_score - bleu_score} ({round((t_bleu_score - bleu_score)/bleu_score*100, 2)} %)",
    sep="\n",
)

BERT score diff: 0.0350513413687733 (4.34 %)
ROUGE score diff: 0.0069643086243772745 (4.11 %)
Unigram BLEU score diff: 0.12430971742020101 (187.14 %)


## Сравнение качества tuned моделей в зависимости от promt

### Анализ длины текстовых данных

In [52]:
tuned_android_api_usage_200_notitle = add_answers_length(tuned_android_api_usage_200_notitle)
tuned_android_api_usage_200_notitle_promt = add_answers_length(tuned_android_api_usage_200_notitle_promt)
tuned_android_api_usage_200_notitle_promt_extra = add_answers_length(tuned_android_api_usage_200_notitle_promt_extra)

In [53]:
tuned_android_api_usage_200_notitle.describe()

Unnamed: 0,bert_precision,bert_recall,bert_f1,rouge_score,bleu_score,len_of_generated_answer,len_of_original_answer
count,107.0,107.0,107.0,107.0,107.0,107.0,107.0
mean,0.856318,0.832669,0.843705,0.18088,0.205926,28.242991,75.682243
std,0.039809,0.030661,0.027501,0.086484,0.101245,21.273973,71.992478
min,0.743945,0.74086,0.769436,0.0,0.040541,5.0,3.0
25%,0.84625,0.815447,0.831003,0.115525,0.120248,13.5,29.5
50%,0.865123,0.83588,0.846253,0.168831,0.188034,24.0,56.0
75%,0.881897,0.853167,0.860952,0.259259,0.27152,37.0,92.5
max,0.921064,0.903347,0.910454,0.384615,0.520833,122.0,415.0


In [54]:
tuned_android_api_usage_200_notitle_promt.describe()

Unnamed: 0,bert_precision,bert_recall,bert_f1,rouge_score,bleu_score,len_of_generated_answer,len_of_original_answer
count,107.0,107.0,107.0,107.0,107.0,107.0,107.0
mean,0.859578,0.836094,0.847218,0.190568,0.180078,29.803738,75.682243
std,0.03184,0.028183,0.022995,0.078888,0.089706,16.191915,71.992478
min,0.750759,0.768326,0.788356,0.0,0.05915,5.0,3.0
25%,0.849351,0.819898,0.833905,0.139482,0.128014,18.0,29.5
50%,0.86677,0.836763,0.849433,0.189189,0.159794,27.0,56.0
75%,0.877675,0.856771,0.860328,0.25,0.204452,39.0,92.5
max,0.934216,0.904736,0.918408,0.358974,0.526316,82.0,415.0


In [55]:
tuned_android_api_usage_200_notitle_promt_extra.describe()

Unnamed: 0,bert_precision,bert_recall,bert_f1,rouge_score,bleu_score,len_of_generated_answer,len_of_original_answer
count,107.0,107.0,107.0,107.0,107.0,107.0,107.0
mean,0.858334,0.83004,0.843645,0.18047,0.164922,38.233645,75.682243
std,0.022572,0.025682,0.018498,0.080014,0.083956,36.362841,71.992478
min,0.74603,0.746431,0.781868,0.0,0.011364,4.0,3.0
25%,0.850031,0.814378,0.832391,0.124462,0.111111,21.0,29.5
50%,0.860309,0.832973,0.845131,0.181818,0.147959,31.0,56.0
75%,0.871964,0.84651,0.856805,0.238314,0.200806,41.5,92.5
max,0.902353,0.889433,0.885226,0.385965,0.545455,306.0,415.0


In [81]:
tuned_android_api_usage_200_title.describe()

Unnamed: 0,bert_precision,bert_recall,bert_f1,rouge_score,bleu_score,len_of_generated_answer,len_of_original_answer,len_of_question
count,107.0,107.0,107.0,107.0,107.0,107.0,107.0,107.0
mean,0.852942,0.833993,0.842883,0.176406,0.190737,28.915888,75.682243,84.46729
std,0.033312,0.027466,0.023354,0.082926,0.091336,23.275674,71.992478,41.706141
min,0.725816,0.762835,0.765207,0.0,0.03838,2.0,3.0,24.0
25%,0.84472,0.816554,0.83067,0.118525,0.123947,15.0,29.5,53.0
50%,0.858884,0.834505,0.844695,0.166667,0.178571,23.0,56.0,74.0
75%,0.87571,0.853212,0.861413,0.24283,0.253165,35.0,92.5,107.5
max,0.906231,0.893505,0.887814,0.416667,0.636364,164.0,415.0,194.0


Boxplot и гистограммы для длины

In [88]:
fig = make_subplots(rows=1, cols=2)

fig.add_trace(
    go.Box(y=base_android_api_usage_200_title.len_of_generated_answer, name="Base, title, no promt"), row=1, col=2
)

fig.add_trace(
    go.Box(y=tuned_android_api_usage_200_notitle.len_of_generated_answer, name="No title, no promt"), row=1, col=1
)

fig.add_trace(
    go.Box(y=tuned_android_api_usage_200_notitle_promt.len_of_generated_answer, name="No title, asking promt"),
    row=1,
    col=1,
)

fig.add_trace(
    go.Box(y=tuned_android_api_usage_200_notitle_promt_extra.len_of_generated_answer, name="No title, asking promt, one example"),
    row=1,
    col=1,
)

fig.add_trace(
    go.Box(y=tuned_android_api_usage_200_title.len_of_generated_answer, name="Yes title, no promt"),
    row=1,
    col=1,
)



fig.update_layout(
    height=600, width=1000, title_text="Length boxplot", yaxis_title="Legnth"
)
fig.update_traces(boxpoints="all", jitter=0.3)
fig.show()

### Анализ метрик

#### No title, yes/no asking promt

In [78]:
bert_f1_delta = tuned_android_api_usage_200_notitle_promt.bert_f1.mean() - tuned_android_api_usage_200_notitle.bert_f1.mean()
rouge_delta = tuned_android_api_usage_200_notitle_promt.rouge_score.mean() - tuned_android_api_usage_200_notitle.rouge_score.mean()
bleu_delta = tuned_android_api_usage_200_notitle_promt.bleu_score.mean() - tuned_android_api_usage_200_notitle.bleu_score.mean()

In [80]:
print(
    f"BERT score diff for yes/no promt: {bert_f1_delta} ({round((bert_f1_delta)/tuned_android_api_usage_200_notitle_promt.bert_f1.mean()*100, 2)} %)",
    f"ROUGE score diff for yes/no promt: {rouge_delta} ({round((rouge_delta)/tuned_android_api_usage_200_notitle_promt.rouge_score.mean()*100, 2)} %)",
    f"Unigram BLEU score diff for yes/no promt: {bleu_delta} ({round((bleu_delta)/tuned_android_api_usage_200_notitle_promt.bleu_score.mean()*100, 2)} %)",
    sep="\n",
)

BERT score diff for yes/no promt: 0.003512784699413296 (0.41 %)
ROUGE score diff for yes/no promt: 0.009688781101792193 (5.08 %)
Unigram BLEU score diff for yes/no promt: -0.025847417386581223 (-14.35 %)


Наличие promt (e.g. Answer the next question предположительно __улучшает__ качество

#### Yes/No title, no promt

In [84]:
bert_f1_delta = tuned_android_api_usage_200_title.bert_f1.mean() - tuned_android_api_usage_200_notitle.bert_f1.mean()
rouge_delta = tuned_android_api_usage_200_title.rouge_score.mean() - tuned_android_api_usage_200_notitle.rouge_score.mean()
bleu_delta = tuned_android_api_usage_200_title.bleu_score.mean() - tuned_android_api_usage_200_notitle.bleu_score.mean()

In [202]:
print(
    f"BERT score diff for yes/no title: {bert_f1_delta} ({round((bert_f1_delta)/tuned_android_api_usage_200_title.bert_f1.mean()*100, 2)} %)",
    f"ROUGE score diff for yes/no title: {rouge_delta} ({round((rouge_delta)/tuned_android_api_usage_200_title.rouge_score.mean()*100, 2)} %)",
    f"Unigram BLEU score diff for yes/no title: {bleu_delta} ({round((bleu_delta)/tuned_android_api_usage_200_title.bleu_score.mean()*100, 2)} %)",
    sep="\n",
)

BERT score diff for yes/no title: -0.0035726077088685804 (-0.42 %)
ROUGE score diff for yes/no title: -0.010098292580958207 (-5.72 %)
Unigram BLEU score diff for yes/no title: -0.015156614711748284 (-7.95 %)


Наличие Title (e.g. Title: ...\n) предположительно __ухудшает__ качество

#### No title, yes asking promt, yes/no 1 example

In [86]:
bert_f1_delta = tuned_android_api_usage_200_notitle_promt_extra.bert_f1.mean() - tuned_android_api_usage_200_notitle_promt.bert_f1.mean()
rouge_delta = tuned_android_api_usage_200_notitle_promt_extra.rouge_score.mean() - tuned_android_api_usage_200_notitle_promt.rouge_score.mean()
bleu_delta = tuned_android_api_usage_200_notitle_promt_extra.bleu_score.mean() - tuned_android_api_usage_200_notitle_promt.bleu_score.mean()

In [203]:
print(
    f"BERT score diff for yes/no extra example: {bert_f1_delta} ({round((bert_f1_delta)/tuned_android_api_usage_200_notitle_promt.bert_f1.mean()*100, 2)} %)",
    f"ROUGE score diff for yes/no extra example: {rouge_delta} ({round((rouge_delta)/tuned_android_api_usage_200_notitle_promt.rouge_score.mean()*100, 2)} %)",
    f"Unigram BLEU score diff for yes/no extra example: {bleu_delta} ({round((bleu_delta)/tuned_android_api_usage_200_notitle_promt.bleu_score.mean()*100, 2)} %)",
    sep="\n",
)

BERT score diff for yes/no extra example: -0.0035726077088685804 (-0.42 %)
ROUGE score diff for yes/no extra example: -0.010098292580958207 (-5.3 %)
Unigram BLEU score diff for yes/no extra example: -0.015156614711748284 (-8.42 %)


Наличие одного примера (e.g. Question...\nAnswer:...\n: ...\n) предположительно __ухудшает__ качество

### Метрики на одном полотне

In [197]:
names = ['base, yes title, no promt', 'tuned, yes title, no promt',\
         'tuned, no title, no promt', 'tuned, no title, asking promt',
        'tuned, no title, asking promt, one example']

bert_scores = [base_android_api_usage_200_title.bert_f1.mean(),
              tuned_android_api_usage_200_title.bert_f1.mean(),
              tuned_android_api_usage_200_notitle.bert_f1.mean(),
              tuned_android_api_usage_200_notitle_promt.bert_f1.mean(),
              tuned_android_api_usage_200_notitle_promt_extra.bert_f1.mean()]

rouge_scores = [base_android_api_usage_200_title.rouge_score.mean(),
              tuned_android_api_usage_200_title.rouge_score.mean(),
              tuned_android_api_usage_200_notitle.rouge_score.mean(),
              tuned_android_api_usage_200_notitle_promt.rouge_score.mean(),
              tuned_android_api_usage_200_notitle_promt_extra.rouge_score.mean()]

bleu_scores = [base_android_api_usage_200_title.bleu_score.mean(),
              tuned_android_api_usage_200_title.bleu_score.mean(),
              tuned_android_api_usage_200_notitle.bleu_score.mean(),
              tuned_android_api_usage_200_notitle_promt.bleu_score.mean(),
              tuned_android_api_usage_200_notitle_promt_extra.bleu_score.mean()]

df = pd.DataFrame({'BERT score': bert_scores,
                  'ROUGE score': rouge_scores,
                  'BLEU score': bleu_scores,
                  }, index=names)

In [201]:
fig = px.scatter(df.T)

fig.update_layout(
    height=600, width=1000, title_text="Metrics",
)

fig.show()