In [1]:
from typing import Text, Generator, Tuple, List, Optional, Dict, Set
import pandas as pd
import numpy as np
from ast import literal_eval
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import json
import re
import ast
from tabulate import tabulate
sns.set_theme()

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 5000)
pd.set_option('display.max_colwidth', 5000)

# 1. Load Annotator Datasets

In [277]:
df_annotated_us_data_1 = pd.read_csv('data/user_stories/dataset/02_annotated_us_data/01_annotated_us_data.CSV', sep=";")
df_annotated_us_data_1 = df_annotated_us_data_1.rename(columns={"annotation": "annotation_1", "category": "category_1", "info": "info_1"})
df_annotated_us_data_1["annotation_1"] = df_annotated_us_data_1["annotation_1"].fillna("")
df_annotated_us_data_1["category_1"] = df_annotated_us_data_1["category_1"].fillna("")
df_annotated_us_data_1["info_1"] = df_annotated_us_data_1["info_1"].fillna("")

In [295]:
df_annotated_us_data_2 = pd.read_csv('data/user_stories/dataset/02_annotated_us_data/02_annotated_us_data.CSV', sep=";", encoding='latin')
df_annotated_us_data_2 = df_annotated_us_data_2.rename(columns={"Annotation": "annotation_2", "Category": "category_2"})
df_annotated_us_data_2["annotation_2"] = df_annotated_us_data_2["annotation_2"].fillna(0)
df_annotated_us_data_2["annotation_2"] = df_annotated_us_data_2["annotation_2"].astype(int)

In [299]:
print('Annotation_1 - Fraction of 0: {} ({})'.format(len(df_annotated_us_data_1[df_annotated_us_data_1['annotation_1']==0])/len(df_annotated_us_data_1), len(df_annotated_us_data_1[df_annotated_us_data_1['annotation_1']==0])))
print('Annotation_1 - Fraction of 1: {} ({})'.format(len(df_annotated_us_data_1[df_annotated_us_data_1['annotation_1']==1])/len(df_annotated_us_data_1), len(df_annotated_us_data_1[df_annotated_us_data_1['annotation_1']==1])))
print('Annotation_1 - Fraction of 2: {} ({})'.format(len(df_annotated_us_data_1[df_annotated_us_data_1['annotation_1']==2])/len(df_annotated_us_data_1), len(df_annotated_us_data_1[df_annotated_us_data_1['annotation_1']==2])))
print('Annotation_1 - Fraction of 0+0,4: {} ({})'.format(len(df_annotated_us_data_1[df_annotated_us_data_1["category_1"].str.contains("0|4")])/len(df_annotated_us_data_1[df_annotated_us_data_1['annotation_1']==0]), len(df_annotated_us_data_1[df_annotated_us_data_1["category_1"].str.contains("0|4")])))
print('--------------------------------------------------')
print('Annotation_2 - Fraction of 0: {} ({})'.format(len(df_annotated_us_data_2[df_annotated_us_data_2['annotation_2']==0])/len(df_annotated_us_data_2), len(df_annotated_us_data_1[df_annotated_us_data_2['annotation_2']==0])))
print('Annotation_2 - Fraction of 1: {} ({})'.format(len(df_annotated_us_data_2[df_annotated_us_data_2['annotation_2']==1])/len(df_annotated_us_data_2), len(df_annotated_us_data_1[df_annotated_us_data_2['annotation_2']==1])))
print('Annotation_2 - Fraction of 2: {} ({})'.format(len(df_annotated_us_data_2[df_annotated_us_data_2['annotation_2']==2])/len(df_annotated_us_data_2), len(df_annotated_us_data_1[df_annotated_us_data_2['annotation_2']==2])))

Annotation_1 - Fraction of 0: 0.41284403669724773 (135)
Annotation_1 - Fraction of 1: 0.5412844036697247 (177)
Annotation_1 - Fraction of 2: 0.045871559633027525 (15)
Annotation_1 - Fraction of 0+0,4: 0.3111111111111111 (42)
--------------------------------------------------
Annotation_2 - Fraction of 0: 0.25076452599388377 (82)
Annotation_2 - Fraction of 1: 0.6880733944954128 (225)
Annotation_2 - Fraction of 2: 0.06116207951070336 (20)


In [300]:
df_annotation_us_data_merged = pd.concat([df_annotated_us_data_1, df_annotated_us_data_2[['annotation_2', 'category_2', 'Goal', 'Unclear Function', 'Role', 'Improved Label', 'Too much on.', 'comments']]], axis=1)

# 2. Inter-Coder Realibility

In [302]:
from sklearn.metrics import cohen_kappa_score

In [305]:
y_1 = df_annotation_us_data_merged['annotation_1'].values.tolist()
y_2 = [int(elem) for elem in df_annotation_us_data_merged['annotation_2'].values.tolist()]
inter_annotator_ag_kappa = cohen_kappa_score(y_1, y_2)
print("Inter-Annotator Agreement (Cohen's Kappa): {}".format(inter_annotator_ag_kappa))

Inter-Annotator Agreement (Cohen's Kappa): 0.5482290881688019


# 3. Merge of Annotator Datasets

In [136]:
def agreement(anno_1, anno_2):
    if anno_1 == anno_2:
        if anno_1 == 1:
            return 1
        elif anno_1 == 2:
            return 2
        elif anno_1 == 0:
            return -1
    else:
        return 0

In [137]:
df_annotation_us_data_merged['agreement'] = df_annotation_us_data_merged.apply(lambda row: agreement(row['annotation_1'], row['annotation_2']), axis=1)

# 4. Resolution of Conflicts and Further Preprocessing

In [148]:
df_merged_annotation_resolution = pd.read_csv('data/user_stories/dataset/merged_annotation_data/merged_annotation_resolution.CSV', sep=";",  encoding='latin')

In [149]:
df_merged_annotation_resolution['resolution'] = df_merged_annotation_resolution.apply(lambda row: row['agreement'] if row['agreement']==1 else row['resolution'], axis=1)
df_merged_annotation_resolution["resolution"] = df_merged_annotation_resolution["resolution"].astype(int)

In [151]:
print('Resolution - Fraction of 0: {} ({})'.format(len(df_merged_annotation_resolution[df_merged_annotation_resolution['resolution']==0])/len(df_merged_annotation_resolution), len(df_merged_annotation_resolution[df_merged_annotation_resolution['resolution']==0])))
print('Resolution - Fraction of 1: {} ({})'.format(len(df_merged_annotation_resolution[df_merged_annotation_resolution['resolution']==1])/len(df_merged_annotation_resolution), len(df_merged_annotation_resolution[df_merged_annotation_resolution['resolution']==1])))
print('Resolution - Fraction of 2: {} ({})'.format(len(df_merged_annotation_resolution[df_merged_annotation_resolution['resolution']==2])/len(df_merged_annotation_resolution), len(df_merged_annotation_resolution[df_merged_annotation_resolution['resolution']==2])))

Resolution - Fraction of 0: 0.24464831804281345 (80)
Resolution - Fraction of 1: 0.7064220183486238 (231)
Resolution - Fraction of 2: 0.04892966360856269 (16)


In [153]:
df_merged_annotation_resolution_kept = df_merged_annotation_resolution[df_merged_annotation_resolution['resolution'] == 1].copy()
df_merged_annotation_resolution_kept["selected_comp_ids"] = df_merged_annotation_resolution_kept["selected_comp_ids"].fillna("")
df_merged_annotation_resolution_kept['comp_ids_final'] = df_merged_annotation_resolution_kept.apply(lambda row: row['selected_comp_ids'] if row['selected_comp_ids'] else row['comp_ids'], axis=1)
df_merged_annotation_resolution_kept_min = df_merged_annotation_resolution_kept[['user_story', 'rico_id', 'comp_ids_final']].copy()
df_merged_annotation_resolution_kept_min = df_merged_annotation_resolution_kept_min.rename(columns={"comp_ids_final": "comp_ids"})

In [174]:
df_merged_annotation_resolution_kept_min[:20]

Unnamed: 0,user_story,rico_id,comp_ids
0,"As a fitness enthusiast, I want to receive notifications about my daily calorie statistics so that I can stay informed and motivated throughout the day",203,151617
1,"As a regular user of the App, I want to get update reminders when a new version is available on google play so that I can take advantage of the latest features and improvements",203,121314
2,"As a data-driven dieter, I want to mark my days as 'complete' so that my information can be accurately included in weekly and monthly analyses.",203,678
4,"As a social media user, I want to connect the app with my facebook account so that i can share my progress and achievements with friends",203,012
5,As a User i want to see how much of the tutorials are left to estimate the progress already made,303,10730
6,"As a programming student, I want to see an overview of the programming course so that I can plan my learning path.",303,111210
7,As a user I want to skip tutorials to save time when I already know speciffic topics,303,6
11,"As a programming student, I want to have distinct learning modules so that I can select the right learning module for me.",303,"9,5,2, 8, 7, 4, 3, 1, 0, 11, 12, 10"
13,As a User I want to change the font size to make the text better reable,570,76
14,"As a student learning french, I want to be able to search for words both in english and french, so that I can learn new words more easily.",570,151614


In [175]:
len(df_merged_annotation_resolution_kept_min)

231

In [176]:
def preprocess_comp_ids(comp_ids_str):
    comp_ids = comp_ids_str.split(',')
    comp_ids = [elem.replace(' ', '') for elem in comp_ids if elem.replace(' ', '')]
    comp_ids = [elem for elem in comp_ids if elem.isdigit()]
    return comp_ids

In [177]:
df_merged_annotation_resolution_kept_min['comp_ids'] = df_merged_annotation_resolution_kept_min.apply(lambda row: preprocess_comp_ids(row['comp_ids']), axis=1)

In [179]:
df_merged_annotation_resolution_kept_min[:20]

Unnamed: 0,user_story,rico_id,comp_ids
0,"As a fitness enthusiast, I want to receive notifications about my daily calorie statistics so that I can stay informed and motivated throughout the day",203,"[15, 16, 17]"
1,"As a regular user of the App, I want to get update reminders when a new version is available on google play so that I can take advantage of the latest features and improvements",203,"[12, 13, 14]"
2,"As a data-driven dieter, I want to mark my days as 'complete' so that my information can be accurately included in weekly and monthly analyses.",203,"[6, 7, 8]"
4,"As a social media user, I want to connect the app with my facebook account so that i can share my progress and achievements with friends",203,"[0, 1, 2]"
5,As a User i want to see how much of the tutorials are left to estimate the progress already made,303,"[10, 7, 3, 0]"
6,"As a programming student, I want to see an overview of the programming course so that I can plan my learning path.",303,"[11, 12, 10]"
7,As a user I want to skip tutorials to save time when I already know speciffic topics,303,[6]
11,"As a programming student, I want to have distinct learning modules so that I can select the right learning module for me.",303,"[9, 5, 2, 8, 7, 4, 3, 1, 0, 11, 12, 10]"
13,As a User I want to change the font size to make the text better reable,570,"[7, 6]"
14,"As a student learning french, I want to be able to search for words both in english and french, so that I can learn new words more easily.",570,"[15, 16, 14]"


In [183]:
print('Fraction of single UIC US: {} ({})'.format(len(df_merged_annotation_resolution_kept_min[df_merged_annotation_resolution_kept_min['comp_ids'].apply(lambda x: len(x)>1)])/len(df_merged_annotation_resolution_kept_min), len(df_merged_annotation_resolution_kept_min[df_merged_annotation_resolution_kept_min['comp_ids'].apply(lambda x: len(x)>1)])))
print('Fraction of multiple UIC US: {} ({})'.format(len(df_merged_annotation_resolution_kept_min[df_merged_annotation_resolution_kept_min['comp_ids'].apply(lambda x: len(x)==1)])/len(df_merged_annotation_resolution_kept_min), len(df_merged_annotation_resolution_kept_min[df_merged_annotation_resolution_kept_min['comp_ids'].apply(lambda x: len(x)==1)])))

Fraction of single UIC US: 0.6060606060606061 (140)
Fraction of multiple UIC US: 0.3939393939393939 (91)


In [186]:
df_merged_annotation_resolution_kept_min.to_csv('data/user_stories/dataset/02_celaned_us_dataset/cleaned_us_dataset.csv', index=False)

# 5. Creating Train/Test Dataset

## 5.1 Split Train/Test Dataset

In [312]:
import random
random.seed(22)
rico_ids_unique = list(set(df_merged_annotation_resolution_kept_min['rico_id'].values.tolist()))
train_rico_ids = random.sample(rico_ids_unique, 6)

In [313]:
train_rico_ids

[10364, 3218, 4891, 3716, 4179, 6584]

In [315]:
df_train_rows = df_merged_annotation_resolution_kept_min[df_merged_annotation_resolution_kept_min['rico_id'].isin(train_rico_ids)]

In [316]:
len(df_train_rows)

21

In [317]:
df_train_rows

Unnamed: 0,user_story,rico_id,comp_ids
50,"As a student, I want to access the lesson again so that I can see what mistakes I made.",3218,"[8, 9]"
51,"As a student, I want to select the next lesson so that I can continue learning.",3218,"[6, 7]"
58,"As a veterinarian, I want to follow different dogs so that I can see if they show signs of illness.",3716,[25]
70,"i want to have a category section, where i can browse different shopping options.",4179,[26]
72,"I want to have a star icon on the product card, the remeber my store my favorite products",4179,"[14, 8]"
73,"As a user, I want to be able to go back so that I can always return and continue my previous search.",4891,[17]
74,"As a user, I want to see a picture of the food so that I can imagine what it will look like and judge whether I would like it.",4891,[15]
75,"As a user, I want to be able to save a particular meal for later, so that I don't have to search for it again and can just find it on my favourites page.",4891,[11]
76,As a user I want to know how long it will take me to cook the selected meal so that I can integrate it in my current week.,4891,[13]
77,As a user I want to add ingredients to a shopping list so that it is easier for me to go shopping and buy everything I need for the cooking.,4891,"[4, 2]"


In [226]:
df_test_rows = df_merged_annotation_resolution_kept_min[~df_merged_annotation_resolution_kept_min['rico_id'].isin(train_rico_ids)]

In [227]:
len(df_test_rows)

210

In [228]:
df_test_rows[:20]

Unnamed: 0,user_story,rico_id,comp_ids
0,"As a fitness enthusiast, I want to receive notifications about my daily calorie statistics so that I can stay informed and motivated throughout the day",203,"[15, 16, 17]"
1,"As a regular user of the App, I want to get update reminders when a new version is available on google play so that I can take advantage of the latest features and improvements",203,"[12, 13, 14]"
2,"As a data-driven dieter, I want to mark my days as 'complete' so that my information can be accurately included in weekly and monthly analyses.",203,"[6, 7, 8]"
4,"As a social media user, I want to connect the app with my facebook account so that i can share my progress and achievements with friends",203,"[0, 1, 2]"
5,As a User i want to see how much of the tutorials are left to estimate the progress already made,303,"[10, 7, 3, 0]"
6,"As a programming student, I want to see an overview of the programming course so that I can plan my learning path.",303,"[11, 12, 10]"
7,As a user I want to skip tutorials to save time when I already know speciffic topics,303,[6]
11,"As a programming student, I want to have distinct learning modules so that I can select the right learning module for me.",303,"[9, 5, 2, 8, 7, 4, 3, 1, 0, 11, 12, 10]"
13,As a User I want to change the font size to make the text better reable,570,"[7, 6]"
14,"As a student learning french, I want to be able to search for words both in english and french, so that I can learn new words more easily.",570,"[15, 16, 14]"


In [229]:
df_train_rows.to_csv('data/user_stories/dataset/05_test_train_split/train_us_data.csv', index=False)
df_test_rows.to_csv('data/user_stories/dataset/05_test_train_split/test_us_data.csv', index=False)

# 5.2 Create Binary Label

In [266]:
np.random.seed(0)

In [267]:
df_train_rows = df_train_rows.sample(frac=1).reset_index(drop=True)

# Calculate the midpoint index
midpoint = (len(df_train_rows) // 2)-1

# Assign 0s to the first half, and 1s to the second half
df_train_rows.loc[:midpoint, 'label'] = 0
df_train_rows.loc[midpoint+1:, 'label'] = 1

df_train_rows = df_train_rows.sample(frac=1).reset_index(drop=True)
df_train_rows["label"] = df_train_rows["label"].astype(int)

In [269]:
df_train_rows

Unnamed: 0,user_story,rico_id,comp_ids,label
0,As a user i want to be able to read news about transfer of player so I know which player changed to which team,10364,"[45, 46, 44]",0
1,"AS a enduser, i want to have a dispaly option at where i can see the total value of my current portfolio so that i have a complete overview over my finacial situation.",6584,"[51, 50, 54]",1
2,As a user I want to search for games or clubs to find the clubs of my choice quickly,10364,[48],1
3,As a user I want to add ingredients to a shopping list so that it is easier for me to go shopping and buy everything I need for the cooking.,4891,"[4, 2]",1
4,"AS a enduser, i want to have an add coin button, to add new coins to my portfolio.",6584,"[1, 0]",0
5,"As a user, I want to see a picture of the food so that I can imagine what it will look like and judge whether I would like it.",4891,[15],0
6,"As a user, I want to be able to save a particular meal for later, so that I don't have to search for it again and can just find it on my favourites page.",4891,[11],1
7,As a user I want to see the games of yesterday and tomorrow to anticipate future games and see the history of games of my clubs,10364,[49],1
8,As a user I want to know how long it will take me to cook the selected meal so that I can integrate it in my current week.,4891,[13],0
9,"As a student, I want to access the lesson again so that I can see what mistakes I made.",3218,"[8, 9]",1


In [271]:
df_test_rows = df_test_rows.sample(frac=1).reset_index(drop=True)

# Calculate the midpoint index
midpoint = (len(df_test_rows) // 2)-1

# Assign 0s to the first half, and 1s to the second half
df_test_rows.loc[:midpoint, 'label'] = 0
df_test_rows.loc[midpoint+1:, 'label'] = 1

df_test_rows = df_test_rows.sample(frac=1).reset_index(drop=True)

In [273]:
df_test_rows[:20]

Unnamed: 0,user_story,rico_id,comp_ids,label
0,"As a frequent app user, I want to refresh the content within the home screen, ensuring I have the latest information and resources available to me.",12750,[26],1
1,As a user I want to quickly see how much time a receipe takes for each step to quickyl know how time I woudl need to invest,23369,"[12, 11, 10, 9, 8, 7, 6, 5]",0
2,"As a parent of multiple children, I want to be able to enter the amount of children over or under the age of 12 I am traveling with, so the lower price for them is already included in the prices shown for my results.",16072,"[9, 8, 7, 6, 5, 4, 3, 2, 1, 0]",0
3,"As a PRI user, I want to mark episodes as favorite so that I can watch them again later.",25008,"[11, 5, 0]",1
4,As a user I want to be able to select a league of my choice so I can see the up to date football results,5474,"[60, 61]",1
5,As a User I want to change the screen turn on setting so I can the text remains readable even when I dont though my phone for a long time,11264,"[1, 2]",1
6,"As someone who wants to support first aid education, I want to see a donation button, so I can easily find out how to give money to first aid services that need it.",17686,[5],1
7,"As a user, I want to be able to quickly switch between the different functions of the app, such as exercises, workouts, logs and body statistics, so that I can optimise my time and have a comprehensive training experience.",9525,"[10, 9, 8, 7, 6]",0
8,i want to have a menue button to alter existing shopping list in the preferred way.,10428,[3],0
9,"As a busy professional, I want to quickly check the store hours of my nearest store so that i can plan my visit according to my schedule",12905,"[16, 5]",1


In [275]:
df_train_rows.to_csv('data/user_stories/dataset/05_test_train_split/train_us_data_label.csv', index=False)
df_test_rows.to_csv('data/user_stories/dataset/05_test_train_split/test_us_data_label.csv', index=False)