# Data cleaning
We want the class with labels as numerical value and the body with clean text.

This will remove:
* duplicates
* NaN entires
* non english
* url, html

* make it lowercase
* combine title and body

In [1]:
import pandas as pd
import sys
import numpy as np
sys.path.append("../../../scripts_shared/")
from preprocess_text import preprocess_text


In [2]:
# Read CSV into a dataframe
#high_priority = pd.read_csv("csv/high_priority_no_td.csv", index_col=0)
#medium_priority = pd.read_csv("csv/medium_priority_no_td.csv", index_col=0)
low_priority = pd.read_csv("csv/low/low_priority_no_td.csv", index_col=0)
not_priority = pd.read_csv("csv/random/random_no_td.csv", index_col=0)

In [3]:
# Number or different labels
low_priority.labels.value_counts().to_frame()[:50]

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
enhancement low priority,5496
low priority,5485
bug low priority,2798
Low Priority,2732
pri:low pri:lowest,2692
Priority: Low,2504
priority: low,1191
Priority: Low Type: Enhancement,1155
bug priority: low,1154
auto-migrated Priority-Low Type-Enhancement,1150


In [4]:
not_priority.labels.value_counts().to_frame()[:50]

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
enhancement,244118
bug,240687
security vulnerability,53086
question,34833
feature,20745
documentation,17938
Bug,12282
help wanted,10875
wontfix,7279
greenkeeper,6842


In [5]:
contains_stale = not_priority['labels'].str.contains("stale", case=False, na=False)
not_priority= not_priority[~contains_stale].reset_index(drop=True)
not_priority.reset_index(drop=True, inplace=True)
not_priority.labels.value_counts().to_frame()[:50]

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
enhancement,244118
bug,240687
security vulnerability,53086
question,34833
feature,20745
documentation,17938
Bug,12282
help wanted,10875
wontfix,7279
greenkeeper,6842


In [6]:
#Give each priority a label by number.
# 'Label encoding'. Makes is easier for machine learning models to work with categorical data.
low_priority["labels"] = 0
low_priority["class"] = "low_priority"
not_priority["labels"] = 1
not_priority["class"] = "not_priority"

In [7]:
priority = pd.concat([low_priority, not_priority] , ignore_index = True)

In [8]:
priority[priority["repo"] == "python/mypy"]

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,class
220467,1.142706e+10,IssuesEvent,2020-02-03 23:33:27,python/mypy,https://api.github.com/repos/python/mypy,closed,Type of conditional expression is object,1,The type of the following conditional expressi...,not_priority
220496,1.142707e+10,IssuesEvent,2020-02-03 23:35:24,python/mypy,https://api.github.com/repos/python/mypy,closed,Type inference of Tuples returns object instead,1,While interacting with `zip` I encountered a v...,not_priority
235965,1.362788e+10,IssuesEvent,2020-09-24 13:13:03,python/mypy,https://api.github.com/repos/python/mypy,opened,Daemon support for --follow-imports=silent,1,"After #5870 is done, it would be nice to also ...",not_priority
244482,1.363241e+10,IssuesEvent,2020-09-24 19:36:27,python/mypy,https://api.github.com/repos/python/mypy,opened,regression: assignment of 'builtins.type' now ...,1,\r\n**Bug Report**\r\n\r\n#7963 causes a serio...,not_priority
262014,2.407484e+10,IssuesEvent,2022-09-18 17:03:35,python/mypy,https://api.github.com/repos/python/mypy,opened,stubtest: more concise error for forgotten arg...,1,**Feature**\r\n\r\nWhen the implementation add...,not_priority
...,...,...,...,...,...,...,...,...,...,...
2143186,7.349652e+09,IssuesEvent,2018-03-08 11:27:27,python/mypy,https://api.github.com/repos/python/mypy,closed,Crash when serializing property with forward r...,1,This serialization test case fails with a cras...,not_priority
2143962,7.349972e+09,IssuesEvent,2018-03-08 12:44:13,python/mypy,https://api.github.com/repos/python/mypy,opened,Suggest type annotations for partial types spa...,1,We don't support inferring partial types from ...,not_priority
2143975,7.349979e+09,IssuesEvent,2018-03-08 12:45:41,python/mypy,https://api.github.com/repos/python/mypy,closed,What to do about inferring types from multiple...,1,Typically mypy infers the type of the variable...,not_priority
2148728,1.785710e+10,IssuesEvent,2021-09-05 08:41:21,python/mypy,https://api.github.com/repos/python/mypy,opened,Name already defined error with warlus operato...,1,"Invalid error for mypy: Name ""x"" already defin...",not_priority


In [9]:
# Remove mypy from the dataset
priority = priority[priority["repo"] != "python/mypy"]
priority

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,class
0,1.141755e+10,IssuesEvent,2020-02-03 00:02:50,poissonconsulting/term,https://api.github.com/repos/poissonconsulting...,opened,Document levels of validation in vld_term(),0,As this is what other functions reference.,low_priority
1,1.141766e+10,IssuesEvent,2020-02-03 00:46:42,WordPress/gutenberg,https://api.github.com/repos/WordPress/gutenberg,closed,Z-index issue using Classic block,0,Selecting a link in the classic block and havi...,low_priority
2,1.141777e+10,IssuesEvent,2020-02-03 01:25:06,MarkBind/markbind,https://api.github.com/repos/MarkBind/markbind,reopened,Indicate the language of a code block,0,**Tell us about your environment**\r\n\r\n* **...,low_priority
3,1.141780e+10,IssuesEvent,2020-02-03 01:34:56,MarkBind/markbind,https://api.github.com/repos/MarkBind/markbind,closed,"Use one attribute name for header, title, name",0,May be better to choose one of the three and u...,low_priority
4,1.141786e+10,IssuesEvent,2020-02-03 01:52:04,poissonconsulting/term,https://api.github.com/repos/poissonconsulting...,closed,Document levels of validation in vld_term(),0,As this is what other functions reference.,low_priority
...,...,...,...,...,...,...,...,...,...,...
2162255,8.109961e+09,IssuesEvent,2018-08-14 09:21:15,urbit/arvo,https://api.github.com/repos/urbit/arvo,opened,%bad-text trips up hall JSON conversion,1,"Haven't tested this in detail yet, but pretty ...",not_priority
2162256,8.109961e+09,IssuesEvent,2018-08-14 09:21:16,highcharts/highcharts-react,https://api.github.com/repos/highcharts/highch...,closed,HighMaps mapBubble type,1,"Hello,\r\nIt is possible to create also HighMa...",not_priority
2162257,8.109982e+09,IssuesEvent,2018-08-14 09:25:07,Loriowar/comindivion,https://api.github.com/repos/Loriowar/comindivion,opened,Add a validation on a belonging of a predicate...,1,Now user can change html content of the intera...,not_priority
2162258,8.109983e+09,IssuesEvent,2018-08-14 09:25:24,TiiQu-Network/TQ-test-page,https://api.github.com/repos/TiiQu-Network/TQ-...,closed,Error on submit with no values,1,TypeError: Too few arguments in function sum (...,not_priority


In [10]:
# Drop duplicates by the content of the title
priority = priority.drop_duplicates(subset=['title'], keep='last')
priority.dropna(inplace=True)
priority.reset_index(inplace=True)
priority.drop(columns=["index"] , inplace= True)
priority["class"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority.drop(columns=["index"] , inplace= True)


class
not_priority    1348848
low_priority     140869
Name: count, dtype: int64

In [11]:
# Split the dataframe 
low_priority = priority.loc[priority['class'] == 'low_priority'].copy()
not_priority = priority.loc[priority['class'] == 'not_priority'].copy()

In [12]:
low_label_counts = low_priority["class"].value_counts()
not_label_counts = not_priority["class"].value_counts()
lp_count = low_label_counts["low_priority"]
print(f"low count: {lp_count}")
not_count = not_label_counts["not_priority"]
print(f"not count: {not_count}")

low count: 140869
not count: 1348848


In [13]:
not_priority = not_priority.sample(frac=lp_count/not_count, random_state=42)
low_vs_not = pd.concat([low_priority, not_priority] , ignore_index = True)
low_vs_not["class"].value_counts()

class
low_priority    140869
not_priority    140869
Name: count, dtype: int64

In [14]:
print(low_vs_not["title"][0])
print(low_vs_not["body"][0])

Z-index issue using Classic block
Selecting a link in the classic block and having it active then selecting another link in the left sidebar it looked like this:

![screen shot 2018-05-31 at 18 37 33](https://user-images.githubusercontent.com/5323259/40795551-b05f6796-6502-11e8-8510-65198a4cf7df.png)



In [15]:
# Copy content of body to a new col named text
low_vs_not["text"] = low_vs_not["title"] + low_vs_not["body"]
low_vs_not.tail()

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,class,text
281733,9413568000.0,IssuesEvent,2019-04-10 08:08:18,IPVS-AS/MBP,https://api.github.com/repos/IPVS-AS/MBP,closed,Monitoring: Allow the user to define abnormal ...,1,The user should be able to define situations i...,not_priority,Monitoring: Allow the user to define abnormal ...
281734,8815382000.0,IssuesEvent,2018-12-29 17:51:35,BITPlan/docker-semanticmediawiki,https://api.github.com/repos/BITPlan/docker-se...,closed,RUN /bin/bash /docker-entrypoint.sh -smw step ...,1,I get the following error at Step 12:\n\nStep ...,not_priority,RUN /bin/bash /docker-entrypoint.sh -smw step ...
281735,13097060000.0,IssuesEvent,2020-08-03 16:45:20,gbif/portal-feedback,https://api.github.com/repos/gbif/portal-feedback,opened,Null links on GrSciColl pagination in other la...,1,"The first-page on GrSciColl institutions, coll...",not_priority,Null links on GrSciColl pagination in other la...
281736,19981560000.0,IssuesEvent,2022-01-30 00:55:08,jordanlambrecht/pb-oct-2022,https://api.github.com/repos/jordanlambrecht/p...,closed,TODO: Migrate to Strapi (or wordpress idk),1,I'm on the fence about whether we should use S...,not_priority,TODO: Migrate to Strapi (or wordpress idk)I'm ...
281737,9782555000.0,IssuesEvent,2019-06-08 00:29:21,xiaospider/test,https://api.github.com/repos/xiaospider/test,closed,Found a bug2019-06-08T00:29:20.032Z,1,I'm having a problem with this.,not_priority,Found a bug2019-06-08T00:29:20.032ZI'm having ...


In [16]:
low_vs_not["text"][0]

'Z-index issue using Classic blockSelecting a link in the classic block and having it active then selecting another link in the left sidebar it looked like this:\r\n\r\n![screen shot 2018-05-31 at 18 37 33](https://user-images.githubusercontent.com/5323259/40795551-b05f6796-6502-11e8-8510-65198a4cf7df.png)\r\n'

In [17]:
# Make a new dataframe with only text, label and class cols.
all_priority_subset = low_vs_not[["text" , "labels" , "class"]]
all_priority_subset

Unnamed: 0,text,labels,class
0,Z-index issue using Classic blockSelecting a l...,0,low_priority
1,Indicate the language of a code block**Tell us...,0,low_priority
2,"Use one attribute name for header, title, name...",0,low_priority
3,Document levels of validation in vld_term()As ...,0,low_priority
4,Support puml files in boilerplatessee [this co...,0,low_priority
...,...,...,...
281733,Monitoring: Allow the user to define abnormal ...,1,not_priority
281734,RUN /bin/bash /docker-entrypoint.sh -smw step ...,1,not_priority
281735,Null links on GrSciColl pagination in other la...,1,not_priority
281736,TODO: Migrate to Strapi (or wordpress idk)I'm ...,1,not_priority


In [18]:
# Convert to string
all_priority_subset["text_str"] = all_priority_subset['text'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_priority_subset["text_str"] = all_priority_subset['text'].astype(str)


In [19]:
all_priority_subset

Unnamed: 0,text,labels,class,text_str
0,Z-index issue using Classic blockSelecting a l...,0,low_priority,Z-index issue using Classic blockSelecting a l...
1,Indicate the language of a code block**Tell us...,0,low_priority,Indicate the language of a code block**Tell us...
2,"Use one attribute name for header, title, name...",0,low_priority,"Use one attribute name for header, title, name..."
3,Document levels of validation in vld_term()As ...,0,low_priority,Document levels of validation in vld_term()As ...
4,Support puml files in boilerplatessee [this co...,0,low_priority,Support puml files in boilerplatessee [this co...
...,...,...,...,...
281733,Monitoring: Allow the user to define abnormal ...,1,not_priority,Monitoring: Allow the user to define abnormal ...
281734,RUN /bin/bash /docker-entrypoint.sh -smw step ...,1,not_priority,RUN /bin/bash /docker-entrypoint.sh -smw step ...
281735,Null links on GrSciColl pagination in other la...,1,not_priority,Null links on GrSciColl pagination in other la...
281736,TODO: Migrate to Strapi (or wordpress idk)I'm ...,1,not_priority,TODO: Migrate to Strapi (or wordpress idk)I'm ...


In [20]:
# Clean the data.
all_priority_subset["text_clean"] = all_priority_subset["text_str"].map(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_priority_subset["text_clean"] = all_priority_subset["text_str"].map(preprocess_text)


In [21]:
# Make a subset with text_clean and label
priority_label_text = all_priority_subset[["text_clean" , "labels", "class"]]
priority_label_text

Unnamed: 0,text_clean,labels,class
0,zindex issue using classic blockselecting a li...,0,low_priority
1,indicate the language of a code blocktell us a...,0,low_priority
2,use one attribute name for header title namema...,0,low_priority
3,document levels of validation in vldtermas thi...,0,low_priority
4,support puml files in boilerplatessee this com...,0,low_priority
...,...,...,...
281733,monitoring allow the user to define abnormal s...,1,not_priority
281734,run binbash dockerentrypointsh smw step is fai...,1,not_priority
281735,null links on grscicoll pagination in other la...,1,not_priority
281736,todo migrate to strapi or wordpress idkim on t...,1,not_priority


In [22]:
# Need to dropna here since cleaning function returns NaN for not english text.
priority_label_text.dropna(inplace=True)
priority_label_text.reset_index(inplace=True)
priority_label_text.drop(columns=["index"] , inplace= True)

priority_label_text

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority_label_text.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority_label_text.drop(columns=["index"] , inplace= True)


Unnamed: 0,text_clean,labels,class
0,zindex issue using classic blockselecting a li...,0,low_priority
1,indicate the language of a code blocktell us a...,0,low_priority
2,use one attribute name for header title namema...,0,low_priority
3,document levels of validation in vldtermas thi...,0,low_priority
4,support puml files in boilerplatessee this com...,0,low_priority
...,...,...,...
258009,monitoring allow the user to define abnormal s...,1,not_priority
258010,run binbash dockerentrypointsh smw step is fai...,1,not_priority
258011,null links on grscicoll pagination in other la...,1,not_priority
258012,todo migrate to strapi or wordpress idkim on t...,1,not_priority


In [23]:
# Splitting the DataFrame
test_df = priority_label_text.sample(frac=0.05, random_state=1)  # Select 5% of the data
priority_df = priority_label_text.drop(test_df.index) 
priority_df.reset_index(drop=True, inplace=True)
priority_df

Unnamed: 0,text_clean,labels,class
0,zindex issue using classic blockselecting a li...,0,low_priority
1,indicate the language of a code blocktell us a...,0,low_priority
2,use one attribute name for header title namema...,0,low_priority
3,document levels of validation in vldtermas thi...,0,low_priority
4,support puml files in boilerplatessee this com...,0,low_priority
...,...,...,...
245108,monitoring allow the user to define abnormal s...,1,not_priority
245109,run binbash dockerentrypointsh smw step is fai...,1,not_priority
245110,null links on grscicoll pagination in other la...,1,not_priority
245111,todo migrate to strapi or wordpress idkim on t...,1,not_priority


In [24]:
test_file_name = f"csv/clean_test_low_vs_random.csv"
test_df.to_csv(test_file_name, index=False)

In [25]:
# Clean dataset with clean text and labels.
# 0 = high priority, 1 = not high priority
file_name = f"csv/clean_low_vs_random.csv"
priority_df.to_csv(file_name, index=False)

In [26]:
pri = pd.read_csv(file_name)
pri

Unnamed: 0,text_clean,labels,class
0,zindex issue using classic blockselecting a li...,0,low_priority
1,indicate the language of a code blocktell us a...,0,low_priority
2,use one attribute name for header title namema...,0,low_priority
3,document levels of validation in vldtermas thi...,0,low_priority
4,support puml files in boilerplatessee this com...,0,low_priority
...,...,...,...
245108,monitoring allow the user to define abnormal s...,1,not_priority
245109,run binbash dockerentrypointsh smw step is fai...,1,not_priority
245110,null links on grscicoll pagination in other la...,1,not_priority
245111,todo migrate to strapi or wordpress idkim on t...,1,not_priority


In [27]:
test_df = pd.read_csv(test_file_name)
test_df

Unnamed: 0,text_clean,labels,class
0,login with microsoft user story as an editor i...,1,not_priority
1,06iileeのblog is downin 7a68938httpsgithubcomi...,1,not_priority
2,opening supportfrom lobaishttpscodegooglecomul...,0,low_priority
3,create implementation plan for enhanced datagr...,0,low_priority
4,right click on list view able to right click o...,1,not_priority
...,...,...,...
12896,add browser methods to runtimeit would be grea...,1,not_priority
12897,login page form icons not shownthe code makes ...,0,low_priority
12898,regressing timestampsbug report a transactions...,1,not_priority
12899,fluvio cluster releases list should hide devel...,0,low_priority
