# Data cleaning
We want the class with labels as numerical value and the body with clean text.

This will remove:
* duplicates
* NaN entires
* non english
* url, html

* make it lowercase
* combine title and body

In [1]:
import pandas as pd
import sys
sys.path.append("../../../scripts_shared/")
from preprocess_text import preprocess_text


In [2]:
file_name = "test_sets_projects.csv"
df = pd.read_csv(file_name)
df

Unnamed: 0,priority,description,project,labels,issuetype,collection
0,Low,some errors show up as shown in the screenshot...,Sourcetree for Windows,[],Bug,Jira
1,Low,I have been using Sourcetree 3.4.4. We use cu...,Sourcetree for Windows,[],Bug,Jira
2,Low,After installing SourceTree for Windows 10 64b...,Sourcetree for Windows,[],Bug,Jira
3,Low,"On windows, Sourcetree.exe will start ""git.exe...",Sourcetree for Windows,[],Bug,Jira
4,Low,"Hello,\r\n\r\nSourceTree 3.4.7.\r\n\r\nOS: Win...",Sourcetree for Windows,[],Bug,Jira
...,...,...,...,...,...,...
386200,1 - Blocker,I am attempting to to follow the guide found h...,Artifactory Binary Repository,[],Bug,JFrog
386201,4 - Normal,"In binarystore.xml, maxCacheSize is in bytes b...",Artifactory Binary Repository,[],Bug,JFrog
386202,4 - Normal,{color:#000000}We are using an artifact(folder...,Artifactory Binary Repository,[],New Feature,JFrog
386203,4 - Normal,Remote repositories created with the repo name...,Artifactory Binary Repository,[],Bug,JFrog


In [3]:
# Count per priority
df['class'].value_counts()

KeyError: 'class'

In [None]:
df['issuetype'].value_counts().to_frame()[:50]

In [None]:
# Unique projects
df['project'].nunique()


In [None]:
# Unique collections
df['collection'].nunique()

In [None]:
# Count per collection
df['collection'].value_counts().to_frame()

In [None]:
# Drop duplicates by the content of the description
df = df.drop_duplicates(subset=['description'], keep='last')
df.dropna(inplace=True)
df.reset_index(inplace=True)
df.drop(columns=["index"] , inplace= True)
df["class"].value_counts()

In [None]:
print(df["description"][0])

In [None]:
# Convert to string
df["text_str"] = df['description'].astype(str)

In [None]:
# Clean the data.
df["text_clean"] = df["text_str"].map(preprocess_text)

In [None]:
# save df to csv
df.to_csv("jira_clean_with_all_cols.csv", index=False)

In [None]:
# Extract only cols need
df = df[["class", "text_clean"]]

In [None]:
# Row with NaN
df[df.isna().any(axis=1)]

In [None]:
# Need to dropna here since cleaning function returns NaN for not english text.
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)

df

In [None]:
# Rows with NaN
df[df.isna().any(axis=1)]

In [None]:
null_rows = df[df['text_clean'].isnull()]
null_rows

In [None]:
# Clean dataset with clean text and class.
name = f"jira_clean.csv"
df.to_csv(name, index=False)

In [None]:
import os
priority_levels = ['Highest', 'High', 'Medium', 'Low', 'Lowest']

for level in priority_levels:
    try:
        # Make dir with level
        os.makedirs(f'{level}', exist_ok=True)
        # df with level class
        df_level = df[df['class'] == level]
        # Save to csv
        df_level.to_csv(f'{level}/clean_{level}.csv', index=False)
        print(f"Saved {level}.csv")
    except Exception as e:
        print(f"An error occurred for level {level}: {str(e)}")

In [None]:
# Read csv to check if file is saved correctly
for level in priority_levels:
    try:
        df = pd.read_csv(f'{level}/clean_{level}.csv')
        print(f"Read {level}.csv")
    except Exception as e:
        print(f"An error occurred while reading {level}.csv: {str(e)}")

In [None]:
pri = pd.read_csv(name)
pri

In [None]:
null_rows = pri[pri['text_clean'].isnull()]
null_rows

In [None]:
# Remove rows with NaN
pri = pri.dropna()
# Reset index
pri.reset_index(drop=True, inplace=True)
pri