# Create dataset: Highest vs rest (high, medium, low, lowest)
* Undersample the majority class.
* The result is balanced 50/50 between the two classes (`highest` vs `rest`).

In [None]:
import pandas as pd
# Load the input CSV; `file_name` is reused further down to name the output file
file_name = "clean_Jira"
df = pd.read_csv(filepath_or_buffer=f'../csv/{file_name}.csv')
# Display the loaded frame
df

In [None]:
# Number of rows per priority value, shown as a one-column table
priority_counts = df['priority'].value_counts()
priority_counts.to_frame()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Remove every row that has at least one missing value
df = df.dropna(axis=0, how='any')
# Re-count missing values per column to confirm none remain
df.isna().sum()

In [None]:
# Number of rows per priority value after dropping incomplete rows
priority_counts = df['priority'].value_counts()
priority_counts.to_frame()

In [None]:
# Distinct values of the 'collection' column, in order of first appearance
pd.unique(df['collection'])

In [None]:
# Distinct values of the 'project' column as a plain Python list
project_names = df['project'].unique()
project_names.tolist()

In [None]:
# How many distinct projects there are (dropna=False counts NaN as a value,
# matching len(unique()))
df['project'].nunique(dropna=False)


In [None]:
# Priorities kept for this dataset; each one maps to a class of the same name.
# NOTE(review): the notebook title lists 'lowest' among the rest classes, but
# no 'Lowest' entry is mapped here — confirm whether the data contains it.
priority_mapping = {
    name: name for name in ('Highest', 'High', 'Medium', 'Low')
}
# Priorities missing from the mapping become NaN in 'class'
df['class'] = df['priority'].map(priority_mapping)

In [None]:
# Collect the priority values that received no class (NaN in 'class')
unmapped_priorities = df.loc[df['class'].isna(), 'priority'].unique()

# Report them one per line, or confirm that everything was mapped
if len(unmapped_priorities) > 0:
    print("Priorities not mapped to a new class:")
    for priority in unmapped_priorities:
        print(f"- {priority}")
else:
    print("All priorities are mapped to a new class")

In [None]:
# Number of rows per mapped class
class_counts = df['class'].value_counts()
class_counts.to_frame()

In [None]:
# Positive class: rows whose mapped class is 'Highest'.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `df` (which triggers SettingWithCopyWarning).
highest = df[df['class'] == 'Highest'].copy()
highest["label"] = 1
highest["class_original"] = "Highest"
highest["class"] = "highest"

# Remaining classes, one frame each; they are only read (concatenated) in a
# later cell, so plain selections are enough here
high = df[df['class'] == 'High']
medium = df[df['class'] == 'Medium']
low = df[df['class'] == 'Low']

In [None]:
# Negative class: every mapped class other than 'Highest'
rest = (
    pd.concat([high, medium, low])
    # keep the original class name under 'class_original'
    .rename(columns={'class': 'class_original'})
    # then tag every row as the binary "rest" class with label 0
    .assign(**{'class': 'rest'}, label=0)
)
rest

In [None]:
# Number of rows in the combined "rest" class
rest_counts = rest['class'].value_counts()
rest_counts.to_frame()

In [None]:
from sklearn.utils import shuffle

# Undersample whichever class is larger so both end up with the same row count.
# sample() with n larger than the frame (and no replacement) raises ValueError,
# so the target size is the smaller of the two class sizes.
n_highest = highest.shape[0]
n_per_class = min(n_highest, rest.shape[0])

# Fixed seed keeps the draw reproducible
rest_sampled = rest.sample(n=n_per_class, random_state=42)
# `highest` is left untouched when it is already the smaller (or equal) class
highest_sampled = (
    highest if n_highest == n_per_class
    else highest.sample(n=n_per_class, random_state=42)
)

balanced_data = pd.concat([highest_sampled, rest_sampled])

# Shuffle the combined dataset so the two classes are interleaved
balanced_data = shuffle(balanced_data, random_state=42)
balanced_data


In [None]:
# Number of rows per binary class in the balanced dataset
balanced_class_counts = balanced_data['class'].value_counts()
balanced_class_counts.to_frame()

In [None]:
# Display the balanced dataset
balanced_data

In [None]:
# Number of rows per numeric label (1 = highest, 0 = rest)
label_counts = balanced_data['label'].value_counts()
label_counts.to_frame()

In [None]:
# Number of rows per original priority class in the balanced dataset
original_class_counts = balanced_data['class_original'].value_counts()
original_class_counts.to_frame()

In [None]:
# Keep only the cleaned text and its label
balanced_data = balanced_data.loc[:, ['text_clean', 'label']]
balanced_data

In [None]:
# Write the balanced dataset next to the input file, without the row index
full_file_name = f'../csv/{file_name}_balanced.csv'
balanced_data.to_csv(path_or_buf=full_file_name, index=False)

In [None]:
# Read the saved file back to check what was written (this replaces `df`)
df = pd.read_csv(filepath_or_buffer=full_file_name)
df