# Data cleaning
We want the class with labels as numerical value and the body with clean text.

This will remove:
* duplicates
* NaN entires
* non english
* url, html

* make it lowercase
* combine title and body

In [1]:
import pandas as pd
import sys
import numpy as np
sys.path.append("../../../scripts_shared/")
from preprocess_text import preprocess_text


In [2]:
# Read CSV into a dataframe
#high_priority = pd.read_csv("csv/high_priority_no_td.csv", index_col=0)
medium_priority = pd.read_csv("csv/medium/medium_priority_no_td.csv", index_col=0)
#low_priority = pd.read_csv("csv/low_priority_no_td.csv", index_col=0)
not_priority = pd.read_csv("csv/random/random_no_td.csv", index_col=0)

In [3]:
# Number or different labels
medium_priority.labels.value_counts().to_frame()[:50]

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
auto-migrated Priority-Medium Type-Defect,22849
Medium Priority,3415
Priority: Medium,3042
bug imported Priority-Medium,2846
auto-migrated Priority-Medium Type-Enhancement,2167
medium priority,2051
bug priority: medium,2013
enhancement medium priority,1905
priority.Medium,1653
migrated Priority: Medium Type: Bug,1650


In [4]:
#Give each priority a label by number.
# 'Label encoding'. Makes is easier for machine learning models to work with categorical data.
medium_priority["labels"] = 0
medium_priority["class"] = "medium_priority"
not_priority["labels"] = 1
not_priority["class"] = "not_priority"

In [5]:
priority = pd.concat([medium_priority, not_priority] , ignore_index = True)

In [6]:
priority[priority["repo"] == "python/mypy"]

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,class
269799,1.142706e+10,IssuesEvent,2020-02-03 23:33:27,python/mypy,https://api.github.com/repos/python/mypy,closed,Type of conditional expression is object,1,The type of the following conditional expressi...,not_priority
269828,1.142707e+10,IssuesEvent,2020-02-03 23:35:24,python/mypy,https://api.github.com/repos/python/mypy,closed,Type inference of Tuples returns object instead,1,While interacting with `zip` I encountered a v...,not_priority
285297,1.362788e+10,IssuesEvent,2020-09-24 13:13:03,python/mypy,https://api.github.com/repos/python/mypy,opened,Daemon support for --follow-imports=silent,1,"After #5870 is done, it would be nice to also ...",not_priority
293814,1.363241e+10,IssuesEvent,2020-09-24 19:36:27,python/mypy,https://api.github.com/repos/python/mypy,opened,regression: assignment of 'builtins.type' now ...,1,\r\n**Bug Report**\r\n\r\n#7963 causes a serio...,not_priority
311346,2.407484e+10,IssuesEvent,2022-09-18 17:03:35,python/mypy,https://api.github.com/repos/python/mypy,opened,stubtest: more concise error for forgotten arg...,1,**Feature**\r\n\r\nWhen the implementation add...,not_priority
...,...,...,...,...,...,...,...,...,...,...
2192518,7.349652e+09,IssuesEvent,2018-03-08 11:27:27,python/mypy,https://api.github.com/repos/python/mypy,closed,Crash when serializing property with forward r...,1,This serialization test case fails with a cras...,not_priority
2193294,7.349972e+09,IssuesEvent,2018-03-08 12:44:13,python/mypy,https://api.github.com/repos/python/mypy,opened,Suggest type annotations for partial types spa...,1,We don't support inferring partial types from ...,not_priority
2193307,7.349979e+09,IssuesEvent,2018-03-08 12:45:41,python/mypy,https://api.github.com/repos/python/mypy,closed,What to do about inferring types from multiple...,1,Typically mypy infers the type of the variable...,not_priority
2198060,1.785710e+10,IssuesEvent,2021-09-05 08:41:21,python/mypy,https://api.github.com/repos/python/mypy,opened,Name already defined error with warlus operato...,1,"Invalid error for mypy: Name ""x"" already defin...",not_priority


In [7]:
# Remove mypy from the dataset
priority = priority[priority["repo"] != "python/mypy"]
priority

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,class
0,1.141757e+10,IssuesEvent,2020-02-03 00:11:29,poissonconsulting/nlist,https://api.github.com/repos/poissonconsulting...,closed,"tidy.nlists should return table with term, est...",0,,medium_priority
1,1.141781e+10,IssuesEvent,2020-02-03 01:38:13,ericauv/ericauv-portfolio,https://api.github.com/repos/ericauv/ericauv-p...,closed,Increase Indent of ListItem,0,ListItems should have greater indent to differ...,medium_priority
2,1.141805e+10,IssuesEvent,2020-02-03 02:48:17,poissonconsulting/nlist,https://api.github.com/repos/poissonconsulting...,closed,deprecate as.nlist and as.nlists for as_nlist ...,0,,medium_priority
3,1.141809e+10,IssuesEvent,2020-02-03 03:00:58,poissonconsulting/nlist,https://api.github.com/repos/poissonconsulting...,opened,Add as_mcmc and as_mcmc_list wrappers on as.mc...,0,"but\r\n```\r\n> as_mcmc(nlist(x = matrix(1:6, ...",medium_priority
4,1.141821e+10,IssuesEvent,2020-02-03 03:33:19,poissonconsulting/nlist,https://api.github.com/repos/poissonconsulting...,opened,Add as.nlist.mcmc and as.nlists.mcmc,0,,medium_priority
...,...,...,...,...,...,...,...,...,...,...
2211587,8.109961e+09,IssuesEvent,2018-08-14 09:21:15,urbit/arvo,https://api.github.com/repos/urbit/arvo,opened,%bad-text trips up hall JSON conversion,1,"Haven't tested this in detail yet, but pretty ...",not_priority
2211588,8.109961e+09,IssuesEvent,2018-08-14 09:21:16,highcharts/highcharts-react,https://api.github.com/repos/highcharts/highch...,closed,HighMaps mapBubble type,1,"Hello,\r\nIt is possible to create also HighMa...",not_priority
2211589,8.109982e+09,IssuesEvent,2018-08-14 09:25:07,Loriowar/comindivion,https://api.github.com/repos/Loriowar/comindivion,opened,Add a validation on a belonging of a predicate...,1,Now user can change html content of the intera...,not_priority
2211590,8.109983e+09,IssuesEvent,2018-08-14 09:25:24,TiiQu-Network/TQ-test-page,https://api.github.com/repos/TiiQu-Network/TQ-...,closed,Error on submit with no values,1,TypeError: Too few arguments in function sum (...,not_priority


In [8]:
# Drop duplicates by the content of the body
priority = priority.drop_duplicates(subset=['title'], keep='last')
priority.dropna(inplace=True)
priority.reset_index(inplace=True)
priority.drop(columns=["index"] , inplace= True)
priority["class"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority.drop(columns=["index"] , inplace= True)


class
not_priority       1348848
medium_priority     160379
Name: count, dtype: int64

In [9]:
# Split the dataframe 
medium_priority = priority.loc[priority['class'] == 'medium_priority'].copy()
not_priority = priority.loc[priority['class'] == 'not_priority'].copy()

In [10]:
medium_label_counts = medium_priority["class"].value_counts()
not_label_counts = not_priority["class"].value_counts()
mp_count = medium_label_counts["medium_priority"]
print(f"med count: {mp_count}")
not_count = not_label_counts["not_priority"]
print(f"not count: {not_count}")

med count: 160379
not count: 1348848


In [11]:
not_priority = not_priority.sample(frac=mp_count/not_count, random_state=42)
med_vs_not = pd.concat([medium_priority, not_priority] , ignore_index = True)
med_vs_not["class"].value_counts()

class
medium_priority    160379
not_priority       160379
Name: count, dtype: int64

In [12]:
print(med_vs_not["title"][0])
print(med_vs_not["body"][0])

Increase Indent of ListItem
ListItems should have greater indent to differentiate them from ListTitle


In [13]:
# Copy content of body to a new col named text
med_vs_not["text"] = med_vs_not["title"] + med_vs_not["body"]
med_vs_not.tail()

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,class,text
320753,27130610000.0,IssuesEvent,2023-02-16 09:31:00,parlemonde/1village,https://api.github.com/repos/parlemonde/1village,closed,FRONT - Page 2 : Identifiant,1,- [x] Créer page \n- [x] Ajouter élèves\n- [x]...,not_priority,FRONT - Page 2 : Identifiant- [x] Créer page ...
320754,26621130000.0,IssuesEvent,2023-01-24 11:18:15,LivelyKernel/lively.next,https://api.github.com/repos/LivelyKernel/live...,closed,New Connection on sidebar confusing and unintu...,1,I find the New Connection using the sidebar co...,not_priority,New Connection on sidebar confusing and unintu...
320755,27124480000.0,IssuesEvent,2023-02-16 03:16:25,cityofaustin/atd-data-tech,https://api.github.com/repos/cityofaustin/atd-...,closed,Update number of projects visible by default i...,1,<!-- Email -->\n<!-- meredith.quick@austintexa...,not_priority,Update number of projects visible by default i...
320756,13627650000.0,IssuesEvent,2020-09-24 12:54:26,homebridge/HAP-NodeJS,https://api.github.com/repos/homebridge/HAP-No...,closed,Homekit TELEVISION Accessory icon iPadOS 14,1,Hi Devs!\r\n\r\nJust installed iPadOS 14 GM Se...,not_priority,Homekit TELEVISION Accessory icon iPadOS 14Hi ...
320757,5578005000.0,IssuesEvent,2017-03-28 11:11:57,SemsProject/MOST,https://api.github.com/repos/SemsProject/MOST,closed,don't compute diffs twice,1,## Trac Ticket #8\n**component:** website\n**o...,not_priority,don't compute diffs twice## Trac Ticket #8\n**...


In [14]:
med_vs_not["text"][0]

'Increase Indent of ListItemListItems should have greater indent to differentiate them from ListTitle'

In [15]:
# Make a new dataframe with only text, label and class cols.
all_priority_subset = med_vs_not[["text" , "labels" , "class"]]
all_priority_subset

Unnamed: 0,text,labels,class
0,Increase Indent of ListItemListItems should ha...,0,medium_priority
1,Add as_mcmc and as_mcmc_list wrappers on as.mc...,0,medium_priority
2,Create New Option For Dropdown Scriptable Obje...,0,medium_priority
3,Header takes too much vertical space on wide s...,0,medium_priority
4,PageBody extends past html containerWhen navig...,0,medium_priority
...,...,...,...
320753,FRONT - Page 2 : Identifiant- [x] Créer page ...,1,not_priority
320754,New Connection on sidebar confusing and unintu...,1,not_priority
320755,Update number of projects visible by default i...,1,not_priority
320756,Homekit TELEVISION Accessory icon iPadOS 14Hi ...,1,not_priority


In [16]:
# Convert to string
all_priority_subset["text_str"] = all_priority_subset['text'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_priority_subset["text_str"] = all_priority_subset['text'].astype(str)


In [17]:
all_priority_subset

Unnamed: 0,text,labels,class,text_str
0,Increase Indent of ListItemListItems should ha...,0,medium_priority,Increase Indent of ListItemListItems should ha...
1,Add as_mcmc and as_mcmc_list wrappers on as.mc...,0,medium_priority,Add as_mcmc and as_mcmc_list wrappers on as.mc...
2,Create New Option For Dropdown Scriptable Obje...,0,medium_priority,Create New Option For Dropdown Scriptable Obje...
3,Header takes too much vertical space on wide s...,0,medium_priority,Header takes too much vertical space on wide s...
4,PageBody extends past html containerWhen navig...,0,medium_priority,PageBody extends past html containerWhen navig...
...,...,...,...,...
320753,FRONT - Page 2 : Identifiant- [x] Créer page ...,1,not_priority,FRONT - Page 2 : Identifiant- [x] Créer page ...
320754,New Connection on sidebar confusing and unintu...,1,not_priority,New Connection on sidebar confusing and unintu...
320755,Update number of projects visible by default i...,1,not_priority,Update number of projects visible by default i...
320756,Homekit TELEVISION Accessory icon iPadOS 14Hi ...,1,not_priority,Homekit TELEVISION Accessory icon iPadOS 14Hi ...


In [18]:
# Clean the data.
all_priority_subset["text_clean"] = all_priority_subset["text_str"].map(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_priority_subset["text_clean"] = all_priority_subset["text_str"].map(preprocess_text)


In [19]:
# Make a subset with text_clean and label
priority_label_text = all_priority_subset[["text_clean" , "labels", "class"]]
priority_label_text

Unnamed: 0,text_clean,labels,class
0,increase indent of listitemlistitems should ha...,0,medium_priority
1,add asmcmc and asmcmclist wrappers on asmcmc a...,0,medium_priority
2,create new option for dropdown scriptable obje...,0,medium_priority
3,header takes too much vertical space on wide s...,0,medium_priority
4,pagebody extends past html containerwhen navig...,0,medium_priority
...,...,...,...
320753,,1,not_priority
320754,new connection on sidebar confusing and unintu...,1,not_priority
320755,update number of projects visible by default i...,1,not_priority
320756,homekit television accessory icon ipados 14hi ...,1,not_priority


In [20]:
# Need to dropna here since cleaning function returns NaN for not english text.
priority_label_text.dropna(inplace=True)
priority_label_text.reset_index(inplace=True)
priority_label_text.drop(columns=["index"] , inplace= True)

priority_label_text

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority_label_text.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority_label_text.drop(columns=["index"] , inplace= True)


Unnamed: 0,text_clean,labels,class
0,increase indent of listitemlistitems should ha...,0,medium_priority
1,add asmcmc and asmcmclist wrappers on asmcmc a...,0,medium_priority
2,create new option for dropdown scriptable obje...,0,medium_priority
3,header takes too much vertical space on wide s...,0,medium_priority
4,pagebody extends past html containerwhen navig...,0,medium_priority
...,...,...,...
290388,kfrg republic airportnew yorkusathis one is re...,1,not_priority
290389,new connection on sidebar confusing and unintu...,1,not_priority
290390,update number of projects visible by default i...,1,not_priority
290391,homekit television accessory icon ipados 14hi ...,1,not_priority


In [21]:
# Splitting the DataFrame
test_df = priority_label_text.sample(frac=0.05, random_state=1)  # Select 5% of the data
test_file_name = f"csv/clean_test_med_vs_random.csv"
test_df.to_csv(test_file_name, index=False)
priority_df = priority_label_text.drop(test_df.index)
priority_df.reset_index(drop=True, inplace=True)
priority_df

Unnamed: 0,text_clean,labels,class
0,increase indent of listitemlistitems should ha...,0,medium_priority
1,add asmcmc and asmcmclist wrappers on asmcmc a...,0,medium_priority
2,create new option for dropdown scriptable obje...,0,medium_priority
3,header takes too much vertical space on wide s...,0,medium_priority
4,pagebody extends past html containerwhen navig...,0,medium_priority
...,...,...,...
275868,kfrg republic airportnew yorkusathis one is re...,1,not_priority
275869,new connection on sidebar confusing and unintu...,1,not_priority
275870,update number of projects visible by default i...,1,not_priority
275871,homekit television accessory icon ipados 14hi ...,1,not_priority


In [22]:
# Clean dataset with clean text and labels.
# 0 = high priority, 1 = not high priority
file_name = f"csv/clean_med_vs_random.csv"
priority_df.to_csv(file_name, index=False)

In [23]:
pri = pd.read_csv(file_name)
pri

Unnamed: 0,text_clean,labels,class
0,increase indent of listitemlistitems should ha...,0,medium_priority
1,add asmcmc and asmcmclist wrappers on asmcmc a...,0,medium_priority
2,create new option for dropdown scriptable obje...,0,medium_priority
3,header takes too much vertical space on wide s...,0,medium_priority
4,pagebody extends past html containerwhen navig...,0,medium_priority
...,...,...,...
275868,kfrg republic airportnew yorkusathis one is re...,1,not_priority
275869,new connection on sidebar confusing and unintu...,1,not_priority
275870,update number of projects visible by default i...,1,not_priority
275871,homekit television accessory icon ipados 14hi ...,1,not_priority


In [24]:
test_df = pd.read_csv(test_file_name)
test_df

Unnamed: 0,text_clean,labels,class
0,automatically add priority to the task as the ...,0,medium_priority
1,apiregister endpoint for iot devices to regist...,1,not_priority
2,bug rename webhookbackground in knativebuild i...,1,not_priority
3,pciebackend spurious crash during recoverydue ...,1,not_priority
4,make outtargetdist directory match artifactszi...,1,not_priority
...,...,...,...
14515,buyer interactions post release emails additio...,1,not_priority
14516,glfw3 build error please use the generated is...,1,not_priority
14517,fe implement a warning popup to prevent data l...,0,medium_priority
14518,add a comment field to the individual stations...,0,medium_priority
