# Data cleaning
We want the class with labels as numerical value and the body with clean text.

This will remove:
* duplicates
* NaN entires
* non english
* url, html

* make it lowercase
* combine title and body

In [1]:
import pandas as pd
import sys
import numpy as np
sys.path.append("../../../scripts_shared/")
from preprocess_text import preprocess_text


In [2]:
# Read CSV into a dataframe
high_priority = pd.read_csv("csv/high/high_priority_no_td.csv", index_col=0)
medium_priority = pd.read_csv("csv/medium/medium_priority_no_td.csv", index_col=0)
low_priority = pd.read_csv("csv/low/low_priority_no_td.csv", index_col=0)

In [3]:
# Number or different labels
high_priority.labels.value_counts().to_frame()[:50]

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
bug high priority,10435
high priority,10360
High Priority,7484
enhancement high priority,5486
Priority: High,4751
priority.High,3694
priority.High type.Task,3685
priority.high type.task,3209
priority.High type.Story,2979
priority.high,2749


In [4]:
#Give each priority a label by number.
# 'Label encoding'. Makes is easier for machine learning models to work with categorical data.
high_priority["labels"] = 0
high_priority["class"] = "high_priority"
medium_priority["labels"] = 1
medium_priority["class"] = "medium_priority"
low_priority["labels"] = 2
low_priority["class"] = "low_priority"
high_priority.head()

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,class
0,11417540000.0,IssuesEvent,2020-02-03 00:00:44,unitystation/unitystation,https://api.github.com/repos/unitystation/unit...,closed,Client breaking NRE when using edit field on C...,0,### Bug:\r\n\r\nIf you use the edit field of t...,high_priority
1,11417540000.0,IssuesEvent,2020-02-03 00:01:26,zowe/sample-spring-boot-api-service,https://api.github.com/repos/zowe/sample-sprin...,closed,The SDK provides a separate Java (no-Spring) l...,0,- The commons-spring library is split into:\r\...,high_priority
2,11417550000.0,IssuesEvent,2020-02-03 00:02:58,openmsupply/mobile,https://api.github.com/repos/openmsupply/mobile,closed,Auto-log out after some time frame,0,## Is your feature request related to a proble...,high_priority
3,11417550000.0,IssuesEvent,2020-02-03 00:04:18,UltimateCodeMonkeys/CodeMonkeysMVVM,https://api.github.com/repos/UltimateCodeMonke...,opened,Migrate: CodeMonkeys ViewModelNavigationServic...,0,Migrate the Xamarin.Forms navigation service i...,high_priority
4,11417560000.0,IssuesEvent,2020-02-03 00:08:03,wordpress-mobile/WordPress-Android,https://api.github.com/repos/wordpress-mobile/...,closed,IA Reader filter bottom sheet: manage untitled...,0,In the filter bottom sheet we introduced in th...,high_priority


In [5]:
priority = pd.concat([high_priority, medium_priority, low_priority] , ignore_index = True)

In [6]:
priority[priority["repo"] == "python/mypy"]

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,class


In [7]:
# Remove mypy from the dataset
priority = priority[priority["repo"] != "python/mypy"]
priority

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,class
0,1.141754e+10,IssuesEvent,2020-02-03 00:00:44,unitystation/unitystation,https://api.github.com/repos/unitystation/unit...,closed,Client breaking NRE when using edit field on C...,0,### Bug:\r\n\r\nIf you use the edit field of t...,high_priority
1,1.141754e+10,IssuesEvent,2020-02-03 00:01:26,zowe/sample-spring-boot-api-service,https://api.github.com/repos/zowe/sample-sprin...,closed,The SDK provides a separate Java (no-Spring) l...,0,- The commons-spring library is split into:\r\...,high_priority
2,1.141755e+10,IssuesEvent,2020-02-03 00:02:58,openmsupply/mobile,https://api.github.com/repos/openmsupply/mobile,closed,Auto-log out after some time frame,0,## Is your feature request related to a proble...,high_priority
3,1.141755e+10,IssuesEvent,2020-02-03 00:04:18,UltimateCodeMonkeys/CodeMonkeysMVVM,https://api.github.com/repos/UltimateCodeMonke...,opened,Migrate: CodeMonkeys ViewModelNavigationServic...,0,Migrate the Xamarin.Forms navigation service i...,high_priority
4,1.141756e+10,IssuesEvent,2020-02-03 00:08:03,wordpress-mobile/WordPress-Android,https://api.github.com/repos/wordpress-mobile/...,closed,IA Reader filter bottom sheet: manage untitled...,0,In the filter bottom sheet we introduced in th...,high_priority
...,...,...,...,...,...,...,...,...,...,...
821952,2.060586e+10,IssuesEvent,2022-03-06 23:45:23,bounswe/bounswe2022group1,https://api.github.com/repos/bounswe/bounswe20...,closed,Editing Navigator of Wiki,2,The navigator of the wiki should be edited and...,low_priority
821953,7.334740e+09,IssuesEvent,2018-03-06 00:10:40,hoodedice/notes,https://api.github.com/repos/hoodedice/notes,opened,Check if passwords match,2,JavaScript code (or see if possible without) t...,low_priority
821954,7.334876e+09,IssuesEvent,2018-03-06 00:53:49,zephyrproject-rtos/zephyr,https://api.github.com/repos/zephyrproject-rto...,closed,Add doc to samples/bluetooth/mesh & samples/bl...,2,We should document what exactly the sample is ...,low_priority
821955,7.335047e+09,IssuesEvent,2018-03-06 01:48:07,uwnrg/minotaur-cpp,https://api.github.com/repos/uwnrg/minotaur-cpp,closed,Fix the photo in about dialog xd,2,,low_priority


In [8]:
# Drop duplicates by the content of the body
priority = priority.drop_duplicates(subset=['title'], keep='last')
priority.dropna(inplace=True)
priority.reset_index(inplace=True)
priority.drop(columns=["index"] , inplace= True)
priority["class"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority.drop(columns=["index"] , inplace= True)


class
high_priority      236610
medium_priority    160163
low_priority       143121
Name: count, dtype: int64

In [9]:
# Split the dataframe 
high_priority = priority.loc[priority['class'] == 'high_priority'].copy()
medium_priority = priority.loc[priority['class'] == 'medium_priority'].copy()
low_priority = priority.loc[priority['class'] == 'low_priority'].copy()

In [10]:
high_label_counts = high_priority["class"].value_counts()
medium_label_counts = medium_priority["class"].value_counts()
low_label_counts = low_priority["class"].value_counts()
hp_count = high_label_counts["high_priority"]
print(f"hp count: {hp_count}")
mp_count = medium_label_counts["medium_priority"]
print(f"med count: {mp_count}")
lp_count = low_label_counts["low_priority"]
print(f"low count: {lp_count}")

hp count: 236610
med count: 160163
low count: 143121


In [11]:
high_priority = high_priority.sample(frac=lp_count/hp_count, random_state=42)
high_and_low_priority = pd.concat([high_priority, low_priority] , ignore_index = True)
high_and_low_priority["class"].value_counts()

class
high_priority    143121
low_priority     143121
Name: count, dtype: int64

In [12]:
high_and_low_priority["labels"] = 1
medium_priority["labels"] = 0
high_and_low_priority["class"] = "high_and_low_priority"
high_and_low_priority["class"].value_counts()

class
high_and_low_priority    286242
Name: count, dtype: int64

In [13]:
high_and_low_label_counts = high_and_low_priority["class"].value_counts()
hl_count = high_and_low_label_counts["high_and_low_priority"]
hl_count

286242

In [14]:

high_and_low_priority = high_and_low_priority.sample(frac=mp_count/hl_count, random_state=42)
all_priority = pd.concat([medium_priority, high_and_low_priority] , ignore_index = True)
all_priority["class"].value_counts()

class
medium_priority          160163
high_and_low_priority    160163
Name: count, dtype: int64

In [15]:
print(all_priority["title"][0])
print(all_priority["body"][0])

Increase Indent of ListItem
ListItems should have greater indent to differentiate them from ListTitle


In [16]:
# Copy content of body to a new col named text
all_priority["text"] = all_priority["title"] + all_priority["body"]
all_priority.tail()

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,class,text
320321,25070810000.0,IssuesEvent,2022-11-07 12:00:35,zeitgeistpm/zeitgeist,https://api.github.com/repos/zeitgeistpm/zeitg...,closed,Oracle bond is not slashed when market resolve...,1,This is a bug in `on_resolution` which can eas...,high_and_low_priority,Oracle bond is not slashed when market resolve...
320322,7177764000.0,IssuesEvent,2018-01-31 14:39:59,CCAFS/MARLO,https://api.github.com/repos/CCAFS/MARLO,closed,Develop a summary report that generates the PO...,1,- [x] Create template with Pentaho Report Desi...,high_and_low_priority,Develop a summary report that generates the PO...
320323,15952000000.0,IssuesEvent,2021-04-15 10:32:06,ContinualAI/avalanche,https://api.github.com/repos/ContinualAI/avala...,closed,Synaptic Intelligence implementation differs f...,1,I was looking at Synaptic Intelligence impleme...,high_and_low_priority,Synaptic Intelligence implementation differs f...
320324,18909550000.0,IssuesEvent,2021-11-16 12:46:46,epam/Indigo,https://api.github.com/repos/epam/Indigo,closed,Modernize Indigo Service,1,We need to:\r\n\r\n- [x] Update from Flask to ...,high_and_low_priority,Modernize Indigo ServiceWe need to:\r\n\r\n- [...
320325,13565350000.0,IssuesEvent,2020-09-18 11:33:12,itggot-TE4/Fabulous_Pirates,https://api.github.com/repos/itggot-TE4/Fabulo...,closed,"Teachers can play a ""game"" based on the select...",1,- [x] Loads images from backend\r\n\r\n- [x] T...,high_and_low_priority,"Teachers can play a ""game"" based on the select..."


In [17]:
all_priority["text"][0]

'Increase Indent of ListItemListItems should have greater indent to differentiate them from ListTitle'

In [18]:
# Make a new dataframe with only text, label and class cols.
all_priority_subset = all_priority[["text" , "labels" , "class"]]
all_priority_subset

Unnamed: 0,text,labels,class
0,Increase Indent of ListItemListItems should ha...,0,medium_priority
1,Add as_mcmc and as_mcmc_list wrappers on as.mc...,0,medium_priority
2,Create New Option For Dropdown Scriptable Obje...,0,medium_priority
3,Header takes too much vertical space on wide s...,0,medium_priority
4,PageBody extends past html containerWhen navig...,0,medium_priority
...,...,...,...
320321,Oracle bond is not slashed when market resolve...,1,high_and_low_priority
320322,Develop a summary report that generates the PO...,1,high_and_low_priority
320323,Synaptic Intelligence implementation differs f...,1,high_and_low_priority
320324,Modernize Indigo ServiceWe need to:\r\n\r\n- [...,1,high_and_low_priority


In [19]:
# Convert to string
all_priority_subset["text_str"] = all_priority_subset['text'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_priority_subset["text_str"] = all_priority_subset['text'].astype(str)


In [20]:
all_priority_subset

Unnamed: 0,text,labels,class,text_str
0,Increase Indent of ListItemListItems should ha...,0,medium_priority,Increase Indent of ListItemListItems should ha...
1,Add as_mcmc and as_mcmc_list wrappers on as.mc...,0,medium_priority,Add as_mcmc and as_mcmc_list wrappers on as.mc...
2,Create New Option For Dropdown Scriptable Obje...,0,medium_priority,Create New Option For Dropdown Scriptable Obje...
3,Header takes too much vertical space on wide s...,0,medium_priority,Header takes too much vertical space on wide s...
4,PageBody extends past html containerWhen navig...,0,medium_priority,PageBody extends past html containerWhen navig...
...,...,...,...,...
320321,Oracle bond is not slashed when market resolve...,1,high_and_low_priority,Oracle bond is not slashed when market resolve...
320322,Develop a summary report that generates the PO...,1,high_and_low_priority,Develop a summary report that generates the PO...
320323,Synaptic Intelligence implementation differs f...,1,high_and_low_priority,Synaptic Intelligence implementation differs f...
320324,Modernize Indigo ServiceWe need to:\r\n\r\n- [...,1,high_and_low_priority,Modernize Indigo ServiceWe need to:\r\n\r\n- [...


In [21]:
# Clean the data.
all_priority_subset["text_clean"] = all_priority_subset["text_str"].map(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_priority_subset["text_clean"] = all_priority_subset["text_str"].map(preprocess_text)


In [22]:
# Make a subset with text_clean and label
priority_label_text = all_priority_subset[["text_clean" , "labels", "class"]]
priority_label_text

Unnamed: 0,text_clean,labels,class
0,increase indent of listitemlistitems should ha...,0,medium_priority
1,add asmcmc and asmcmclist wrappers on asmcmc a...,0,medium_priority
2,create new option for dropdown scriptable obje...,0,medium_priority
3,header takes too much vertical space on wide s...,0,medium_priority
4,pagebody extends past html containerwhen navig...,0,medium_priority
...,...,...,...
320321,oracle bond is not slashed when market resolve...,1,high_and_low_priority
320322,develop a summary report that generates the po...,1,high_and_low_priority
320323,synaptic intelligence implementation differs f...,1,high_and_low_priority
320324,modernize indigo servicewe need to x update fr...,1,high_and_low_priority


In [23]:
# Need to dropna here since cleaning function returns NaN for not english text.
priority_label_text.dropna(inplace=True)
priority_label_text.reset_index(inplace=True)
priority_label_text.drop(columns=["index"] , inplace= True)

priority_label_text

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority_label_text.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority_label_text.drop(columns=["index"] , inplace= True)


Unnamed: 0,text_clean,labels,class
0,increase indent of listitemlistitems should ha...,0,medium_priority
1,add asmcmc and asmcmclist wrappers on asmcmc a...,0,medium_priority
2,create new option for dropdown scriptable obje...,0,medium_priority
3,header takes too much vertical space on wide s...,0,medium_priority
4,pagebody extends past html containerwhen navig...,0,medium_priority
...,...,...,...
298197,oracle bond is not slashed when market resolve...,1,high_and_low_priority
298198,develop a summary report that generates the po...,1,high_and_low_priority
298199,synaptic intelligence implementation differs f...,1,high_and_low_priority
298200,modernize indigo servicewe need to x update fr...,1,high_and_low_priority


In [24]:
# Splitting the DataFrame
test_df = priority_label_text.sample(frac=0.05, random_state=1)  # Select 5% of the data
test_file_name = f"csv/clean_test_med_vs_high_low_priority.csv"
test_df.to_csv(test_file_name, index=False)
priority_df = priority_label_text.drop(test_df.index)
priority_df.reset_index(drop=True, inplace=True)
priority_df

Unnamed: 0,text_clean,labels,class
0,increase indent of listitemlistitems should ha...,0,medium_priority
1,add asmcmc and asmcmclist wrappers on asmcmc a...,0,medium_priority
2,create new option for dropdown scriptable obje...,0,medium_priority
3,header takes too much vertical space on wide s...,0,medium_priority
4,pagebody extends past html containerwhen navig...,0,medium_priority
...,...,...,...
283287,oracle bond is not slashed when market resolve...,1,high_and_low_priority
283288,develop a summary report that generates the po...,1,high_and_low_priority
283289,synaptic intelligence implementation differs f...,1,high_and_low_priority
283290,modernize indigo servicewe need to x update fr...,1,high_and_low_priority


In [25]:
# Clean dataset with clean text and labels.
# 0 = high priority, 1 = not high priority
file_name = f"csv/clean_med_vs_high_and_low_priority.csv"
priority_df.to_csv(file_name, index=False)

In [26]:
pri = pd.read_csv(file_name)
pri

Unnamed: 0,text_clean,labels,class
0,increase indent of listitemlistitems should ha...,0,medium_priority
1,add asmcmc and asmcmclist wrappers on asmcmc a...,0,medium_priority
2,create new option for dropdown scriptable obje...,0,medium_priority
3,header takes too much vertical space on wide s...,0,medium_priority
4,pagebody extends past html containerwhen navig...,0,medium_priority
...,...,...,...
283287,oracle bond is not slashed when market resolve...,1,high_and_low_priority
283288,develop a summary report that generates the po...,1,high_and_low_priority
283289,synaptic intelligence implementation differs f...,1,high_and_low_priority
283290,modernize indigo servicewe need to x update fr...,1,high_and_low_priority


In [27]:
test_df = pd.read_csv(test_file_name)
test_df

Unnamed: 0,text_clean,labels,class
0,how can jooq sql without table name the jooq g...,0,medium_priority
1,as a user with income i can set recurring inco...,0,medium_priority
2,proposal backend lock session after survey is ...,0,medium_priority
3,no live d3200 what steps will reproduce the pr...,0,medium_priority
4,improve the cithis is a mini roadmap for the c...,0,medium_priority
...,...,...,...
14905,a user can be shown autocomplete information f...,0,medium_priority
14906,avoid line break within a breadcrumb itemsee l...,1,high_and_low_priority
14907,investigate issue with oppiabots commentsoppia...,0,medium_priority
14908,korean with comment keyword makes unicodeencod...,0,medium_priority
