# Data cleaning
We want each issue's class encoded as a numerical label and its body as cleaned text.

This will remove:
* duplicates
* NaN entries
* non-English text
* URLs and HTML

It will also:
* make the text lowercase
* combine title and body

In [None]:
import pandas as pd
import sys
sys.path.append("../../../scripts_shared/")
from preprocess_text import preprocess_text


In [None]:
# Load the raw CSV export into `df`; the bare name at the end makes the
# notebook render a preview of the frame.
filename = "csv/flutter_testset.csv"
df = pd.read_csv(filepath_or_buffer=filename)
df

In [None]:
# Number of rows per distinct `labels` value (first 50 rows of the count table)
df["labels"].value_counts().to_frame().head(50)

In [None]:
# Split data based on labels: rows whose `labels` text contains P0 or P1 are
# collected in `hp`; the remaining rows are selected in a separate cell.

# A character class matches exactly what the capture group 'P(1|0)' matched,
# but without the match-groups UserWarning that str.contains emits for groups.
pattern = r'P[01]'
# na=False counts a missing `labels` value as "no match", which keeps the mask
# strictly boolean (a NaN in the mask makes the row selection raise).
hp = df[df['labels'].str.contains(pattern, na=False)]
# Reset index so the selected rows are numbered 0..n-1
hp = hp.reset_index(drop=True)
hp


In [None]:
# Keep every row whose `labels` text does NOT match `pattern` (set in an
# earlier cell). na=False treats a missing `labels` value as "no match", so
# the mask stays boolean and can be inverted with `~` without raising.
random = df[~df['labels'].str.contains(pattern, na=False)]
random = random.reset_index(drop=True)
random

In [None]:
# Removing in triage since these issues are not assigned a priority.
# NOTE: this rebinds the notebook-wide `pattern`; re-running the P0/P1 cells
# afterwards without resetting it would filter on the wrong text.
pattern = 'in triage'
# The label is plain text, so match it literally (regex=False); na=False keeps
# the mask boolean when a row has no labels.
random = random[~random['labels'].str.contains(pattern, na=False, regex=False)]
random = random.reset_index(drop=True)
random

In [None]:
# Number of rows per distinct `labels` value among the remaining rows
random["labels"].value_counts().to_frame().head(50)

In [None]:
# Number of rows per distinct `labels` value among the P0/P1 rows
hp["labels"].value_counts().to_frame().head(50)

In [None]:
# Label encoding: give the priority group a numeric label (1) next to its
# readable class name. Numeric labels make it easier for machine learning
# models to work with categorical data.
hp["label"], hp["class"] = 1, "high_priority"
hp.head()

In [None]:
# Label encoding for the remaining rows: numeric label 0 and its class name.
random["label"], random["class"] = 0, "not_high_priority"
random.head()

In [None]:
# Drop duplicates by the content of the title (keeping the last occurrence),
# then drop rows with any missing value and renumber the rows from 0.
# Chaining builds a fresh frame instead of mutating a derived one in place,
# and reset_index(drop=True) discards the old index directly rather than
# materialising it as an "index" column that then has to be dropped.
high_priority = (
    hp.drop_duplicates(subset=['title'], keep='last')
    .dropna()
    .reset_index(drop=True)
)
high_priority["class"].value_counts()

In [None]:
# Drop duplicates by the content of the title (keeping the last occurrence),
# then drop rows with any missing value and renumber the rows from 0.
# Chaining builds a fresh frame instead of mutating a derived one in place,
# and reset_index(drop=True) discards the old index directly rather than
# materialising it as an "index" column that then has to be dropped.
not_high_priority = (
    random.drop_duplicates(subset=['title'], keep='last')
    .dropna()
    .reset_index(drop=True)
)
not_high_priority["class"].value_counts()

In [None]:
# Print the column dtypes and non-null counts of the de-duplicated frame
high_priority.info()

In [None]:
# Row counts per class for both groups; the two scalar counts are used by
# the sampling cell to size the not-high-priority sample.
label_counts = high_priority["class"].value_counts()
print(label_counts)

label_counts_nhp = not_high_priority["class"].value_counts()
not_high_priority_count = label_counts_nhp.loc["not_high_priority"]
print(not_high_priority_count)

hp_count = label_counts.loc["high_priority"]
hp_count

In [None]:
# Downsample the not-high-priority rows to the number of high-priority rows
# (`hp_count`, computed in the previous cell).
# Sampling by row count, capped at the rows actually available, replaces the
# former frac=hp_count/not_high_priority_count: a fraction above 1 raises a
# ValueError without replacement, and re-running the cell with a stale
# fraction shrank the frame a second time. With the same seed the selected
# rows are unchanged whenever the old fraction was valid.
not_high_priority = not_high_priority.sample(
    n=min(hp_count, len(not_high_priority)),
    random_state=42,
)
not_high_priority

In [None]:
# Stack both groups into one frame with a fresh 0..n-1 index
# (high-priority rows first, then the sampled not-high-priority rows).
all_priority = pd.concat(
    objs=(high_priority, not_high_priority),
    ignore_index=True,
)
all_priority.tail()

In [None]:
# Number of rows per numeric label in the combined frame
all_priority["label"].value_counts()

In [None]:
# Show the title and the body of the first row, one per line
print(all_priority["title"][0], all_priority["body"][0], sep="\n")

In [None]:
# Build a new column `text` holding the title and the body joined by a
# single space
all_priority["text"] = all_priority["title"] + " " + all_priority["body"]
all_priority.tail()

In [None]:
# Preview the combined text of the first row
all_priority["text"][0]

In [None]:
# Make a new dataframe with only text, label and class cols.
# .copy() makes it an independent frame: later cells add columns to it, and
# assigning into a plain column selection triggers pandas'
# SettingWithCopyWarning.
all_priority_subset = all_priority[["text", "label", "class"]].copy()
all_priority_subset

In [None]:
# Convert the combined text to string and store it in a new column `text_str`,
# which is the input of the cleaning step
all_priority_subset["text_str"] = all_priority_subset['text'].astype(str)

In [None]:
# Display the frame, now including the `text_str` column
all_priority_subset

In [None]:
# Clean the data: run the shared `preprocess_text` helper on every `text_str`
# value and store the result in `text_clean`.
# NOTE(review): per the markdown header and a later comment, the helper is
# expected to strip URLs/HTML, lowercase, and return NaN for non-English
# text — confirm against preprocess_text itself.
all_priority_subset["text_clean"] = all_priority_subset["text_str"].map(preprocess_text)

In [None]:
# Make a subset with text_clean and label.
# .copy() makes it an independent frame so that the clean-up performed on it
# in the next cell does not operate on a selection of `all_priority_subset`
# (which triggers pandas' SettingWithCopyWarning).
priority_label_text = all_priority_subset[["text_clean", "label"]].copy()
priority_label_text

In [None]:
# Need to dropna here since cleaning function returns NaN for not english text.
# Reassigning the chained result builds a fresh frame instead of mutating a
# column selection in place, and reset_index(drop=True) renumbers the rows
# without first creating an "index" column that has to be dropped again.
priority_label_text = priority_label_text.dropna().reset_index(drop=True)

priority_label_text

In [None]:
from sklearn.utils import resample

# Split the dataset into two based on the label and order the halves by size.
# Dropping the rows the cleaning step rejected can leave either class larger,
# so the majority is decided from the data instead of assuming it is label 1
# (undersampling the smaller class without replacement raises a ValueError).
# sorted() is stable, so on a tie label 1 is still treated as the majority.
df_majority, df_minority = sorted(
    (
        priority_label_text[priority_label_text['label'] == 1],
        priority_label_text[priority_label_text['label'] == 0],
    ),
    key=len,
    reverse=True,
)

# Undersample the majority class
df_majority_undersampled = resample(
    df_majority,
    replace=False,               # sample without replacement
    n_samples=len(df_minority),  # to match minority class
    random_state=123,            # reproducible results
)

# Combine minority class with downsampled majority class
df_balanced = pd.concat([df_majority_undersampled, df_minority])

# Shuffle the dataset to avoid any ordering bias; seeded so the written CSV is
# reproducible like the sampling steps before it
df_balanced = df_balanced.sample(frac=1, random_state=123).reset_index(drop=True)
priority_label_text = df_balanced
priority_label_text["label"].value_counts()

In [None]:
# Write the clean dataset (clean text and labels) to CSV without the index.
# 1 = high priority, 0 = not high priority
# Plain string literal: the path has no placeholders, so no f-string is needed.
file_name = "csv/clean_flutter_testset.csv"
priority_label_text.to_csv(file_name, index=False)

In [None]:
# Read the file that was just written back in and display it as a check
pri = pd.read_csv(filepath_or_buffer=file_name)
pri