# Data cleaning
We want the class with labels as numerical value and the body with clean text.

This will remove:
* duplicates
* NaN entires
* non english
* url, html

* make it lowercase
* combine title and body

In [1]:
import pandas as pd
import sys
import numpy as np
sys.path.append("../../scripts_shared/")
from preprocess_text import preprocess_text


In [2]:
# Read CSV into a dataframe
high_priority = pd.read_csv("csv/high_priority_no_td.csv", index_col=0)
medium_priority = pd.read_csv("csv/medium_priority_no_td.csv", index_col=0)
low_priority = pd.read_csv("csv/low_priority_no_td.csv", index_col=0)

In [3]:
# Number or different labels
high_priority.labels.value_counts().to_frame()[:50]

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
bug high priority,9893
high priority,9835
High Priority,7104
enhancement high priority,5226
Priority: High,4480
priority.High type.Task,3513
priority.High,3509
priority.high type.task,3026
priority.High type.Story,2838
priority.high,2609


In [4]:
#Give each priority a label by number.
# 'Label encoding'. Makes is easier for machine learning models to work with categorical data.
high_priority["labels"] = 0
high_priority["class"] = "high_priority"
medium_priority["labels"] = 1
medium_priority["class"] = "medium_priority"
low_priority["labels"] = 2
low_priority["class"] = "low_priority"
high_priority.head()

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,class
0,11417540000.0,IssuesEvent,2020-02-03 00:00:44,unitystation/unitystation,https://api.github.com/repos/unitystation/unit...,closed,Client breaking NRE when using edit field on C...,0,### Bug:\r\n\r\nIf you use the edit field of t...,high_priority
1,11417540000.0,IssuesEvent,2020-02-03 00:01:26,zowe/sample-spring-boot-api-service,https://api.github.com/repos/zowe/sample-sprin...,closed,The SDK provides a separate Java (no-Spring) l...,0,- The commons-spring library is split into:\r\...,high_priority
2,11417550000.0,IssuesEvent,2020-02-03 00:02:58,openmsupply/mobile,https://api.github.com/repos/openmsupply/mobile,closed,Auto-log out after some time frame,0,## Is your feature request related to a proble...,high_priority
3,11417550000.0,IssuesEvent,2020-02-03 00:04:18,UltimateCodeMonkeys/CodeMonkeysMVVM,https://api.github.com/repos/UltimateCodeMonke...,opened,Migrate: CodeMonkeys ViewModelNavigationServic...,0,Migrate the Xamarin.Forms navigation service i...,high_priority
4,11417560000.0,IssuesEvent,2020-02-03 00:08:03,wordpress-mobile/WordPress-Android,https://api.github.com/repos/wordpress-mobile/...,closed,IA Reader filter bottom sheet: manage untitled...,0,In the filter bottom sheet we introduced in th...,high_priority


In [5]:
priority = pd.concat([high_priority, medium_priority, low_priority] , ignore_index = True)

In [6]:
priority[priority["repo"] == "python/mypy"]

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,class


In [7]:
# Remove mypy from the dataset
priority = priority[priority["repo"] != "python/mypy"]
priority

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,class
0,1.141754e+10,IssuesEvent,2020-02-03 00:00:44,unitystation/unitystation,https://api.github.com/repos/unitystation/unit...,closed,Client breaking NRE when using edit field on C...,0,### Bug:\r\n\r\nIf you use the edit field of t...,high_priority
1,1.141754e+10,IssuesEvent,2020-02-03 00:01:26,zowe/sample-spring-boot-api-service,https://api.github.com/repos/zowe/sample-sprin...,closed,The SDK provides a separate Java (no-Spring) l...,0,- The commons-spring library is split into:\r\...,high_priority
2,1.141755e+10,IssuesEvent,2020-02-03 00:02:58,openmsupply/mobile,https://api.github.com/repos/openmsupply/mobile,closed,Auto-log out after some time frame,0,## Is your feature request related to a proble...,high_priority
3,1.141755e+10,IssuesEvent,2020-02-03 00:04:18,UltimateCodeMonkeys/CodeMonkeysMVVM,https://api.github.com/repos/UltimateCodeMonke...,opened,Migrate: CodeMonkeys ViewModelNavigationServic...,0,Migrate the Xamarin.Forms navigation service i...,high_priority
4,1.141756e+10,IssuesEvent,2020-02-03 00:08:03,wordpress-mobile/WordPress-Android,https://api.github.com/repos/wordpress-mobile/...,closed,IA Reader filter bottom sheet: manage untitled...,0,In the filter bottom sheet we introduced in th...,high_priority
...,...,...,...,...,...,...,...,...,...,...
780855,2.060584e+10,IssuesEvent,2022-03-06 23:40:48,CafeteriaGuild/DeepMobLearning-Refabricated,https://api.github.com/repos/CafeteriaGuild/De...,closed,Thunderstrike during Trials destroying the loot.,2,"When you run the trial, it will set areas on f...",low_priority
780856,2.060586e+10,IssuesEvent,2022-03-06 23:45:23,bounswe/bounswe2022group1,https://api.github.com/repos/bounswe/bounswe20...,closed,Editing Navigator of Wiki,2,The navigator of the wiki should be edited and...,low_priority
780857,7.334740e+09,IssuesEvent,2018-03-06 00:10:40,hoodedice/notes,https://api.github.com/repos/hoodedice/notes,opened,Check if passwords match,2,JavaScript code (or see if possible without) t...,low_priority
780858,7.334876e+09,IssuesEvent,2018-03-06 00:53:49,zephyrproject-rtos/zephyr,https://api.github.com/repos/zephyrproject-rto...,closed,Add doc to samples/bluetooth/mesh & samples/bl...,2,We should document what exactly the sample is ...,low_priority


In [8]:
# Drop duplicates by the content of the body
priority = priority.drop_duplicates(subset=['title'], keep='last')
priority.dropna(inplace=True)
priority.reset_index(inplace=True)
priority.drop(columns=["index"] , inplace= True)
priority["class"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority.drop(columns=["index"] , inplace= True)


class
high_priority      227491
medium_priority    154133
low_priority       137143
Name: count, dtype: int64

In [9]:
# Split the dataframe 
high_priority = priority.loc[priority['class'] == 'high_priority'].copy()
medium_priority = priority.loc[priority['class'] == 'medium_priority'].copy()
low_priority = priority.loc[priority['class'] == 'low_priority'].copy()

In [10]:
high_label_counts = high_priority["class"].value_counts()
medium_label_counts = medium_priority["class"].value_counts()
low_label_counts = low_priority["class"].value_counts()
hp_count = high_label_counts["high_priority"]
print(f"hp count: {hp_count}")
mp_count = medium_label_counts["medium_priority"]
print(f"med count: {mp_count}")
lp_count = low_label_counts["low_priority"]
print(f"low count: {lp_count}")

hp count: 227491
med count: 154133
low count: 137143


In [11]:
high_priority = high_priority.sample(frac=mp_count/hp_count, random_state=42)
high_and_med_priority = pd.concat([medium_priority, high_priority] , ignore_index = True)
high_and_med_priority["class"].value_counts()

class
medium_priority    154133
high_priority      154133
Name: count, dtype: int64

In [12]:
high_and_med_priority["labels"] = 1
low_priority["labels"] = 0
high_and_med_priority["class"] = "high_and_low_priority"
high_and_med_priority["class"].value_counts()

class
high_and_low_priority    308266
Name: count, dtype: int64

In [13]:
high_and_low_label_counts = high_and_med_priority["class"].value_counts()
hm_count = high_and_low_label_counts["high_and_low_priority"]
hm_count

308266

In [14]:

high_and_med_priority = high_and_med_priority.sample(frac=lp_count/hm_count, random_state=42)
all_priority = pd.concat([low_priority, high_and_med_priority] , ignore_index = True)
all_priority["class"].value_counts()

class
low_priority             137143
high_and_low_priority    137143
Name: count, dtype: int64

In [15]:
print(all_priority["title"][0])
print(all_priority["body"][0])

Z-index issue using Classic block
Selecting a link in the classic block and having it active then selecting another link in the left sidebar it looked like this:

![screen shot 2018-05-31 at 18 37 33](https://user-images.githubusercontent.com/5323259/40795551-b05f6796-6502-11e8-8510-65198a4cf7df.png)



In [16]:
# Copy content of body to a new col named text
all_priority["text"] = all_priority["title"] + all_priority["body"]
all_priority.tail()

Unnamed: 0,id,type,created_at,repo,repo_url,action,title,labels,body,class,text
274281,9975307000.0,IssuesEvent,2019-07-09 12:48:13,alan-turing-institute/distr6,https://api.github.com/repos/alan-turing-insti...,closed,Mixture distribution support,1,What is the correct way to represent the suppo...,high_and_low_priority,Mixture distribution supportWhat is the correc...
274282,9821946000.0,IssuesEvent,2019-06-14 08:36:53,TimJentzsch/valveGamesAnnouncerBot,https://api.github.com/repos/TimJentzsch/valve...,opened,[FEATURE] Create Telegram Instant View templates,1,**Is your feature request related to a client?...,high_and_low_priority,[FEATURE] Create Telegram Instant View templat...
274283,4927007000.0,IssuesEvent,2016-11-26 13:51:28,smirkspace/smirkspace,https://api.github.com/repos/smirkspace/smirks...,reopened,"""New User"" Guide",1,This will be some description or walkthrough o...,high_and_low_priority,"""New User"" GuideThis will be some description ..."
274284,11258200000.0,IssuesEvent,2020-01-13 03:28:25,cop4934-fall19-group32/Project-32,https://api.github.com/repos/cop4934-fall19-gr...,closed,Puzzle Grading,1,"When the player solution is being run, the con...",high_and_low_priority,Puzzle GradingWhen the player solution is bein...
274285,6965059000.0,IssuesEvent,2017-12-09 01:22:14,tootsuite/mastodon,https://api.github.com/repos/tootsuite/mastodon,closed,Favorites column hides statuses on screen,1,After scrolling down past two pages of favorit...,high_and_low_priority,Favorites column hides statuses on screenAfter...


In [17]:
all_priority["text"][0]

'Z-index issue using Classic blockSelecting a link in the classic block and having it active then selecting another link in the left sidebar it looked like this:\r\n\r\n![screen shot 2018-05-31 at 18 37 33](https://user-images.githubusercontent.com/5323259/40795551-b05f6796-6502-11e8-8510-65198a4cf7df.png)\r\n'

In [18]:
# Make a new dataframe with only text, label and class cols.
all_priority_subset = all_priority[["text" , "labels" , "class"]]
all_priority_subset

Unnamed: 0,text,labels,class
0,Z-index issue using Classic blockSelecting a l...,0,low_priority
1,Indicate the language of a code block**Tell us...,0,low_priority
2,"Use one attribute name for header, title, name...",0,low_priority
3,Document levels of validation in vld_term()As ...,0,low_priority
4,Support puml files in boilerplatessee [this co...,0,low_priority
...,...,...,...
274281,Mixture distribution supportWhat is the correc...,1,high_and_low_priority
274282,[FEATURE] Create Telegram Instant View templat...,1,high_and_low_priority
274283,"""New User"" GuideThis will be some description ...",1,high_and_low_priority
274284,Puzzle GradingWhen the player solution is bein...,1,high_and_low_priority


In [19]:
# Convert to string
all_priority_subset["text_str"] = all_priority_subset['text'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_priority_subset["text_str"] = all_priority_subset['text'].astype(str)


In [20]:
all_priority_subset

Unnamed: 0,text,labels,class,text_str
0,Z-index issue using Classic blockSelecting a l...,0,low_priority,Z-index issue using Classic blockSelecting a l...
1,Indicate the language of a code block**Tell us...,0,low_priority,Indicate the language of a code block**Tell us...
2,"Use one attribute name for header, title, name...",0,low_priority,"Use one attribute name for header, title, name..."
3,Document levels of validation in vld_term()As ...,0,low_priority,Document levels of validation in vld_term()As ...
4,Support puml files in boilerplatessee [this co...,0,low_priority,Support puml files in boilerplatessee [this co...
...,...,...,...,...
274281,Mixture distribution supportWhat is the correc...,1,high_and_low_priority,Mixture distribution supportWhat is the correc...
274282,[FEATURE] Create Telegram Instant View templat...,1,high_and_low_priority,[FEATURE] Create Telegram Instant View templat...
274283,"""New User"" GuideThis will be some description ...",1,high_and_low_priority,"""New User"" GuideThis will be some description ..."
274284,Puzzle GradingWhen the player solution is bein...,1,high_and_low_priority,Puzzle GradingWhen the player solution is bein...


In [21]:
# Clean the data.
all_priority_subset["text_clean"] = all_priority_subset["text_str"].map(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_priority_subset["text_clean"] = all_priority_subset["text_str"].map(preprocess_text)


In [22]:
# Make a subset with text_clean and label
priority_label_text = all_priority_subset[["text_clean" , "labels", "class"]]
priority_label_text

Unnamed: 0,text_clean,labels,class
0,zindex issue using classic blockselecting a li...,0,low_priority
1,indicate the language of a code blocktell us a...,0,low_priority
2,use one attribute name for header title namema...,0,low_priority
3,document levels of validation in vldtermas thi...,0,low_priority
4,support puml files in boilerplatessee this com...,0,low_priority
...,...,...,...
274281,mixture distribution supportwhat is the correc...,1,high_and_low_priority
274282,feature create telegram instant view templates...,1,high_and_low_priority
274283,new user guidethis will be some description or...,1,high_and_low_priority
274284,puzzle gradingwhen the player solution is bein...,1,high_and_low_priority


In [23]:
# Need to dropna here since cleaning function returns NaN for not english text.
priority_label_text.dropna(inplace=True)
priority_label_text.reset_index(inplace=True)
priority_label_text.drop(columns=["index"] , inplace= True)

priority_label_text

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority_label_text.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority_label_text.drop(columns=["index"] , inplace= True)


Unnamed: 0,text_clean,labels,class
0,zindex issue using classic blockselecting a li...,0,low_priority
1,indicate the language of a code blocktell us a...,0,low_priority
2,use one attribute name for header title namema...,0,low_priority
3,document levels of validation in vldtermas thi...,0,low_priority
4,support puml files in boilerplatessee this com...,0,low_priority
...,...,...,...
256815,mixture distribution supportwhat is the correc...,1,high_and_low_priority
256816,feature create telegram instant view templates...,1,high_and_low_priority
256817,new user guidethis will be some description or...,1,high_and_low_priority
256818,puzzle gradingwhen the player solution is bein...,1,high_and_low_priority


In [24]:
# Clean dataset with clean text and labels.
# 0 = high priority, 1 = not high priority
file_name = f"csv/clean_low_vs_high_and_med_priority.csv"
priority_label_text.to_csv(file_name, index=False)

In [25]:
pri = pd.read_csv(file_name)
pri

Unnamed: 0,text_clean,labels,class
0,zindex issue using classic blockselecting a li...,0,low_priority
1,indicate the language of a code blocktell us a...,0,low_priority
2,use one attribute name for header title namema...,0,low_priority
3,document levels of validation in vldtermas thi...,0,low_priority
4,support puml files in boilerplatessee this com...,0,low_priority
...,...,...,...
256815,mixture distribution supportwhat is the correc...,1,high_and_low_priority
256816,feature create telegram instant view templates...,1,high_and_low_priority
256817,new user guidethis will be some description or...,1,high_and_low_priority
256818,puzzle gradingwhen the player solution is bein...,1,high_and_low_priority
