**Sources**:

[Software estimation machine learning article](https://uruit.com/blog/software-estimation-machine-learning/)

[Text preprocessing in NLP beginner guide](https://swatimeena989.medium.com/beginners-guide-for-preprocessing-text-data-f3156bec85ca#47f5)

In [75]:
import pandas as pd
import numpy as np

# Path of the CSV that is read below, and path the processed frame is
# written to at the end of the notebook.
pretrain_csv = "pretrained-data/talendforge_pretrain.csv"
preprocessed_pretrained_data = "preprocessed-pretrained-data-with-stop-words/talendforge_pretrain.csv"

# Load only the three columns that the rest of the notebook works with.
df = pd.read_csv(pretrain_csv, usecols=['issuekey', 'title', 'description'])

In [76]:
# Count the missing values in each column.
df.isnull().sum()

issuekey       0
title          0
description    0
dtype: int64

In [77]:
# Number of non-missing entries per column before dropping rows.
df.count()

issuekey       50000
title          50000
description    50000
dtype: int64

In [78]:
# Discard every row that has a missing value in any of the loaded columns.
df = df.dropna(how='any')

In [79]:
# Number of non-missing entries per column after dropping rows.
df.count()

issuekey       50000
title          50000
description    50000
dtype: int64

**Data cleanup phase**:

In [80]:
import nltk
from nltk.corpus import stopwords
import re
import unicodedata
import string

def remove_patterns_from_ends(word):
    """Strip punctuation from the start and the end of ``word``.

    A leading run and a trailing run of characters that are neither word
    characters (letters, digits, underscore) nor whitespace are removed.
    Such characters in the middle of the word are left untouched.

    Args:
        word: The string to trim.

    Returns:
        ``word`` without its leading/trailing punctuation runs.
    """
    return re.sub(r'^[^\w\s]+|[^\w\s]+$', '', word)

def remove_word_if_pattern_met_twice(word):
    """Filter predicate telling whether ``word`` should be kept.

    Counts the characters of ``word`` that are neither word characters
    (letters, digits, underscore) nor whitespace. The word itself is never
    modified here.

    Args:
        word: The string to inspect.

    Returns:
        True when fewer than two such characters occur (keep the word),
        False when two or more occur (drop the word).
    """
    punctuation_count = len(re.findall(r'[^\w\s]', word))
    return punctuation_count < 2

def replace_punctuation_with_space_and_remove_one_letter_word(word):
    """Split ``word`` on punctuation and drop the one-character pieces.

    Every character that is neither a word character (letter, digit,
    underscore) nor whitespace is replaced by a space. The result is split
    on whitespace, and only pieces longer than one character are kept.

    Args:
        word: The string to process.

    Returns:
        The surviving pieces joined by single spaces; an empty string when
        nothing survives.
    """
    spaced = re.sub(r'[^\w\s]',' ', word)
    return " ".join(piece for piece in spaced.split() if len(piece) > 1)

def is_word_in_stopwords(word, stopwords):
    """Report whether ``word`` is absent from the English stop-word list.

    Despite the function's name, the result is True when ``word`` is NOT
    contained in ``stopwords.words('english')`` and False when it is.

    Args:
        word: The token to look up.
        stopwords: Object providing ``words('english')``. This parameter
            shadows the module-level ``stopwords`` import inside the
            function.

    Returns:
        False if ``word`` is in the English stop-word list, True otherwise.
    """
    english_stopwords = stopwords.words('english')
    if word in english_stopwords:
        return False
    return True

def cleanData1(text):
    """Normalise a piece of text into lower-case, space-separated words.

    The text is lower-cased and split on whitespace. Each token then goes
    through these steps, in order:

    1. punctuation is stripped from both ends;
    2. the token is dropped if it still contains two or more punctuation
       characters;
    3. remaining punctuation is replaced by spaces and one-character
       pieces are removed;
    4. the token is dropped if it contains any digit.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces, with no leading or
        trailing whitespace. May be an empty string.
    """
    kept_tokens = []
    for raw_token in text.lower().split():
        token = remove_patterns_from_ends(raw_token)
        if not remove_word_if_pattern_met_twice(token):
            continue
        token = replace_punctuation_with_space_and_remove_one_letter_word(token)
        if any(char.isdigit() for char in token):
            continue
        kept_tokens.append(token)

    # Tokens reduced to '' leave consecutive spaces after the join, so
    # collapse runs of spaces and trim the ends.
    joined = " ".join(kept_tokens)
    return re.sub(' +', ' ', joined).strip()
 
def formatFastTextClassifier(label):
    """Build a ``__label__`` prefix string for ``label``.

    Args:
        label: Any value; it is converted with ``str()``.

    Returns:
        ``"__label__"`` followed by the string form of ``label`` and one
        trailing space.
    """
    return f"__label__{label!s} "

In [81]:
# Replace both text columns with their cleaned versions (see cleanData1).
df['title'] = df['title'].apply(cleanData1)
df['description'] = df['description'].apply(cleanData1)

In [82]:
# Drop rows with a missing value in any column.
# NOTE(review): cleanData1 returns a str (possibly ''), and an empty string
# is not a missing value for dropna, so this looks like a no-op for rows
# whose text was cleaned down to nothing — confirm whether such rows should
# be removed before the CSV is written.
df = df.dropna(how='any')

In [83]:
# Renumber the rows with a fresh default index; the old index is discarded.
df = df.reset_index(drop=True)

In [84]:
# Write the processed frame to the output path, without the index column.
df.to_csv(preprocessed_pretrained_data, index=False)