# Download Dataset

In [20]:
!wget https://tickettagger.blob.core.windows.net/datasets/dataset-labels-top3-30k-real.txt -P ./dataset/

--2022-02-28 16:40:46--  https://tickettagger.blob.core.windows.net/datasets/dataset-labels-top3-30k-real.txt
Resolving tickettagger.blob.core.windows.net (tickettagger.blob.core.windows.net)... 52.239.251.68
Connecting to tickettagger.blob.core.windows.net (tickettagger.blob.core.windows.net)|52.239.251.68|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22598414 (22M) [text/plain]
Saving to: ‘dataset-labels-top3-30k-real.txt’


2022-02-28 16:40:49 (8.77 MB/s) - ‘dataset-labels-top3-30k-real.txt’ saved [22598414/22598414]



# lmport Libs

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import neattext.functions as nfx
import re

# Prepare Dataset

In [2]:
df = pd.read_csv('./dataset/dataset-labels-top3-30k-real.txt', header=None)

In [3]:
df

Unnamed: 0,0
0,__label__enhancement Exception handling # Desc...
1,__label__enhancement Subpasses in MSL and HLSL...
2,__label__enhancement Collect new discovered no...
3,__label__enhancement Lock property : several p...
4,__label__bug Return default location type labe...
...,...
29995,__label__enhancement [TW-241] new column forma...
29996,__label__bug Error When using the example. ...
29997,__label__bug CorrelationId should be generated...
29998,__label__enhancement screenshot sidecar should...


In [4]:
# Cleaning the text
df = df[0].str.split(r'(__label__enhancement)|(__label__bug)|(__label__question)', expand=True)

In [5]:
df.head()

Unnamed: 0,0,1,2,3,4
0,,__label__enhancement,,,Exception handling # Description The methods...
1,,__label__enhancement,,,Subpasses in MSL and HLSL Sorry for all the i...
2,,__label__enhancement,,,Collect new discovered nodes Collect new adde...
3,,__label__enhancement,,,Lock property : several properties at once
4,,,__label__bug,,Return default location type label if custom ...


In [6]:
# retrieve data to separate dataframe for each labels
enh_df = df[df[1] == '__label__enhancement'][[1, 4]]
bug_df = df[df[2] == '__label__bug'][[2, 4]]
question_df = df[df[3] == '__label__question'][[3, 4]]

In [7]:
# assign column names to each df
enh_df.columns = ['labels', 'descriptions']
bug_df.columns = ['labels', 'descriptions']
question_df.columns = ['labels', 'descriptions']

print(enh_df.head())
print(bug_df.head())
print(question_df.head())

                 labels                                       descriptions
0  __label__enhancement   Exception handling # Description  The methods...
1  __label__enhancement   Subpasses in MSL and HLSL Sorry for all the i...
2  __label__enhancement   Collect new discovered nodes Collect new adde...
3  __label__enhancement        Lock property : several properties at once 
5  __label__enhancement                         Randomize: remove sliders 
          labels                                       descriptions
4   __label__bug   Return default location type label if custom ...
6   __label__bug   [TW-417] JSON export has extra commas (on 2.1...
7   __label__bug   System.AccessViolationException: my program t...
8   __label__bug   Topic mining fails with UnicodeEncodeError ``...
10  __label__bug   Preflight and non-batching requests throw an ...
                labels                                       descriptions
12   __label__question   Avoiding the default NotEmpty::IS_EMPTY val

In [8]:
# remove the prefix '__label__' string from column labels
enh_df['labels'] = enh_df['labels'].str.replace('__label__', '')
bug_df['labels'] = bug_df['labels'].str.replace('__label__', '')
question_df['labels'] = question_df['labels'].str.replace('__label__', '')

In [9]:
# strip away and leading & trailing spaces
enh_df['descriptions'] = enh_df['descriptions'].str.strip(' ')
bug_df['descriptions'] = bug_df['descriptions'].str.strip(' ')
question_df['descriptions'] = question_df['descriptions'].str.strip(' ')

In [10]:
# concatenate each df to single dataframe
df = pd.concat([enh_df, bug_df, question_df])

In [11]:
df

Unnamed: 0,labels,descriptions
0,enhancement,Exception handling # Description The methods ...
1,enhancement,Subpasses in MSL and HLSL Sorry for all the is...
2,enhancement,Collect new discovered nodes Collect new added...
3,enhancement,Lock property : several properties at once
5,enhancement,Randomize: remove sliders
...,...,...
29940,question,小白问题：监控页面配置了密码访问，如何通过http获取View JSON。 如题
29942,question,Possible to support 1.0.0-* and 2.0.0-rc2-0792...
29950,question,Input/output error when getattr() called when ...
29983,question,No example ? I don't see any application of p...


In [12]:
# remove any non-english samples
# reference: https://stackoverflow.com/a/150078/9960491
reg_obj = re.compile(r'[^\u0000-\u007F]+', re.UNICODE)
def is_english(desc):
    return (False if reg_obj.match(desc) else True)

df['is_english'] = df['descriptions'].apply(is_english)

In [13]:
df

Unnamed: 0,labels,descriptions,is_english
0,enhancement,Exception handling # Description The methods ...,True
1,enhancement,Subpasses in MSL and HLSL Sorry for all the is...,True
2,enhancement,Collect new discovered nodes Collect new added...,True
3,enhancement,Lock property : several properties at once,True
5,enhancement,Randomize: remove sliders,True
...,...,...,...
29940,question,小白问题：监控页面配置了密码访问，如何通过http获取View JSON。 如题,False
29942,question,Possible to support 1.0.0-* and 2.0.0-rc2-0792...,True
29950,question,Input/output error when getattr() called when ...,True
29983,question,No example ? I don't see any application of p...,True


In [14]:
# retrieve the english samples based on intermediate is_english column
df = df[df['is_english']][['labels', 'descriptions']]
df

Unnamed: 0,labels,descriptions
0,enhancement,Exception handling # Description The methods ...
1,enhancement,Subpasses in MSL and HLSL Sorry for all the is...
2,enhancement,Collect new discovered nodes Collect new added...
3,enhancement,Lock property : several properties at once
5,enhancement,Randomize: remove sliders
...,...,...
29918,question,Stream file failed. Hello. When I tried to im...
29942,question,Possible to support 1.0.0-* and 2.0.0-rc2-0792...
29950,question,Input/output error when getattr() called when ...
29983,question,No example ? I don't see any application of p...


In [15]:
# remove the stopwords, emojis from the text and convert it into lower case
def neatify_text(text):
    text = str(text).lower()
    text = nfx.remove_stopwords(text)
    text = nfx.remove_emojis(text)
#     text = nfx.remove_puncts(text)
    return text

df['descriptions'] = df['descriptions'].apply(neatify_text)

In [16]:
df

Unnamed: 0,labels,descriptions
0,enhancement,exception handling # description methods use t...
1,enhancement,subpasses msl hlsl sorry issues given followi...
2,enhancement,collect new discovered nodes collect new added...
3,enhancement,lock property : properties
5,enhancement,randomize: remove sliders
...,...,...
29918,question,stream file failed. hello. tried implement ogv...
29942,question,possible support 1.0.0-* 2.0.0-rc2-0792 sql db...
29950,question,input/output error getattr() called entered mo...
29983,question,example ? application prooph repo committed ?


In [17]:
RANDOM_SEED = 40

In [18]:
# split dataset to train and dev samples
x_train, x_test, y_train, y_test = train_test_split(df['descriptions'], df['labels'],
                                                 test_size=0.3,
                                                 random_state=RANDOM_SEED)

In [19]:
df_train = pd.concat([x_train, y_train], axis=1).reset_index(drop=True)
df_eval = pd.concat([x_test, y_test], axis=1).reset_index(drop=True)

In [20]:
df_train

Unnamed: 0,descriptions,labels
0,default choice appear allow null = false ### e...,bug
1,"set font-awesome/dashicons 'on' default? hey, ...",enhancement
2,practice numbers math functions. useful write ...,enhancement
3,getting loot buying items play animation buy s...,bug
4,timedevent causing massive lag https://i.imgur...,bug
...,...,...
20599,aosw model pulling entire configuration instan...,bug
20600,"find module (userid) hi, like thanks package, ...",question
20601,ccminer - gpu #0: illegal memory access encoun...,bug
20602,[tw-1] recurring task message task _profpatsch...,bug


In [21]:
df_eval

Unnamed: 0,descriptions,labels
0,rename seasson lifeperiod place dates story later,enhancement
1,"new label: ""pending closure"" type: @brainwane ...",question
2,stations departure? departure: ``` https://vbb...,question
3,[prestação de contas] quebra de página na comp...,bug
4,http parser/serializer error ```swift router.g...,bug
...,...,...
8826,[tw-344] `tasksh` segmentation fault _hector a...,bug
8827,adjustments allow increase inventories decreas...,enhancement
8828,request: implement/expose ttf font size `83 46...,enhancement
8829,"leaving editactivity looses data leaving ""edit...",enhancement


In [22]:
# save train and eval dataframes to csv
df_train.to_csv("./dataset/train.csv")
df_eval.to_csv("./dataset/eval.csv")
