# Data cleaning
We want the class with labels as numerical value and the body with clean text.

This will remove:
* duplicates
* NaN entires
* non english
* url, html

* make it lowercase
* combine title and body

In [None]:
import pandas as pd
import sys
import numpy as np
sys.path.append("../../scripts_shared/")
from preprocess_text import preprocess_text


In [None]:
# Read CSV into a dataframe
high_priority = pd.read_csv("csv/high_priority_with_td.csv", index_col=0)
not_high_priority = pd.read_csv("csv/not_high_no_td1.csv", index_col=0)

In [None]:
contains_stale = not_high_priority['labels'].str.contains("stale", case=False, na=False)
not_high_priority= not_high_priority[~contains_stale].reset_index(drop=True)
not_high_priority.reset_index(drop=True, inplace=True)
not_high_priority

In [None]:
# Number or different labels
high_priority.labels.value_counts().to_frame()[:50]

In [None]:
high_priority[high_priority["repo"] == "python/mypy"]

In [None]:
# Remove mypy from the dataset
high_priority = high_priority[high_priority["repo"] != "python/mypy"]
high_priority

In [None]:
not_high_priority[not_high_priority["repo"] == "python/mypy"]

In [None]:
# Remove mypy from the dataset
not_high_priority = not_high_priority[not_high_priority["repo"] != "python/mypy"]
not_high_priority

In [None]:
# Number or different labels
not_high_priority.labels.value_counts().to_frame()[:50]

In [None]:
#Give each priority a label by number.
# 'Label encoding'. Makes is easier for machine learning models to work with categorical data.
high_priority["label"] = 0
high_priority["class"] = "high_priority"
high_priority.head()

In [None]:
not_high_priority["label"] = 1
not_high_priority["class"] = "not_high_priority" 
not_high_priority.head()

In [None]:
# Drop duplicates by the content of the body
high_priority = high_priority.drop_duplicates(subset=['title'], keep='last')
high_priority.dropna(inplace=True)
high_priority.reset_index(inplace=True)
high_priority.drop(columns=["index"] , inplace= True)
high_priority["class"].value_counts()

In [None]:
# Drop duplicates by the content of the body
not_high_priority = not_high_priority.drop_duplicates(subset=['title'], keep='last')
not_high_priority.dropna(inplace=True)
not_high_priority.reset_index(inplace=True)
not_high_priority.drop(columns=["index"] , inplace= True)
not_high_priority["class"].value_counts()

In [None]:
high_priority.info()

In [None]:
label_counts = high_priority["class"].value_counts()
label_counts_nhp = not_high_priority["class"].value_counts()
print(label_counts)
not_high_priority_count = label_counts_nhp["not_high_priority"]
print(not_high_priority_count)
hp_count = label_counts["high_priority"]
hp_count

In [None]:
not_high_priority = not_high_priority.sample(frac=hp_count/not_high_priority_count, random_state=42)
not_high_priority

In [None]:
all_priority = pd.concat([high_priority,not_high_priority] , ignore_index = True)
all_priority.tail()

In [None]:
# Does this need to be more balanced?
all_priority["label"].value_counts()

In [None]:
print(all_priority["title"][0])
print(all_priority["body"][0])

In [None]:
# Copy content of body to a new col named text
all_priority["text"] = all_priority["title"] + all_priority["body"]
all_priority.tail()

In [None]:
all_priority["text"][0]

In [None]:
# Make a new dataframe with only text, label and class cols.
all_priority_subset = all_priority[["text" , "label" , "class"]]
all_priority_subset

In [None]:
# Convert to string
all_priority_subset["text_str"] = all_priority_subset['text'].astype(str)

In [None]:
all_priority_subset

In [None]:
# Clean the data.
all_priority_subset["text_clean"] = all_priority_subset["text_str"].map(preprocess_text)

In [None]:
# Make a subset with text_clean and label
priority_label_text = all_priority_subset[["text_clean" , "label"]]
priority_label_text

In [None]:
# Need to dropna here since cleaning function returns NaN for not english text.
priority_label_text.dropna(inplace=True)
priority_label_text.reset_index(inplace=True)
priority_label_text.drop(columns=["index"] , inplace= True)

priority_label_text

In [None]:
# Clean dataset with clean text and labels.
# 0 = high priority, 1 = not high priority
priority_label_text.to_csv("csv/clean_priority_debt_high_or_not_high.csv")

In [73]:
pri = pd.read_csv("csv/clean_priority_debt_high_or_not_high.csv", index_col=0)
pri

Unnamed: 0,text_clean,label
0,remove step from platform orientation template...,0
1,upgrade to latest appenginepluginscorethe late...,0
2,implement file system based throttlerwill nip ...,0
3,finish search v2 querystring custom extension ...,0
4,update versions fix all audit warningslast ver...,0
...,...,...
955,unify standard names structuresee httpsgithubc...,1
956,mouse move may have wrong coordinateswhen no m...,1
957,using colons in title field of frontmatterusin...,1
958,config file watcher on network mount wont reco...,1


In [74]:
# Split dataframe into three parts: training, validation and testing.
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=42):
    np.random.seed(seed)
    # Shuffle index of dataframe
    perm = np.random.permutation(df.index)
    
    df_length = len(df.index)
    
    # Number of row in training set
    train_end = int(train_percent * df_length)
    # Number of rows in validate set
    validate_end = int(validate_percent * df_length) + train_end
    
    # From start to train end
    train = df.iloc[perm[:train_end]]
    # From train_end to validate_end
    validate = df.iloc[perm[train_end:validate_end]]
    # From validate to the last row in dataframe.
    test = df.iloc[perm[validate_end:]]
    return train, validate, test


In [75]:
train , validate , test = train_validate_test_split(priority_label_text)

In [77]:
from datasets import Dataset, DatasetDict
# Convert from Pandas DataFrame to Hugging Face datasets
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(validate)
test_ds = Dataset.from_pandas(test)

ds = DatasetDict()

ds["test"] = test_ds
ds["train"] = tds
ds["validate"] = vds

ds

DatasetDict({
    test: Dataset({
        features: ['text_clean', 'label', '__index_level_0__'],
        num_rows: 96
    })
    train: Dataset({
        features: ['text_clean', 'label', '__index_level_0__'],
        num_rows: 768
    })
    validate: Dataset({
        features: ['text_clean', 'label', '__index_level_0__'],
        num_rows: 96
    })
})

In [78]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import Dataset

# Convert the DataFrame
hf_dataset = Dataset.from_pandas(priority_label_text)
hf_dataset

In [80]:
ds.push_to_hub("high_priority_or_not_high_1")


Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00, 33554.43it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00, 32263.88it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00, 32263.88it/s]
Downloading metadata: 100%|██████████| 737/737 [00:00<00:00, 8.76MB/s]
