In [None]:
import os
from pathlib import Path
import pandas as pd
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from keras.preprocessing.text import Tokenizer
from imblearn.under_sampling import EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, TomekLinks
from imblearn.combine import SMOTEENN, SMOTETomek



In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; every CSV path
# used further down in this notebook lives under /content/drive/MyDrive.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime - confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
class Resampling:
    """Thin wrapper that selects an imbalanced-learn resampler by short name.

    Supported names:
        "enn", "allknn", "renn", "tomek"      -> under-sampling (cleaning)
        "smote", "bdsmote", "adasyn"          -> over-sampling
        "smoteenn", "smotetomek"              -> combined over/under-sampling
        "origin"                              -> no resampling (pass-through)

    Attributes:
        name: the short name given at construction.
        strategie: the configured imbalanced-learn sampler, or None when
            ``name`` does not map to a sampler (including "origin").
    """

    # Name meaning "keep the data as it is"; fit_resample returns its input.
    PASSTHROUGH = "origin"

    # Names that map to an actual imbalanced-learn sampler.
    SUPPORTED = ("enn", "allknn", "renn", "tomek", "smote", "bdsmote",
                 "adasyn", "smoteenn", "smotetomek")

    def __init__(self, name, random_state=None):
        """Configure the sampler identified by ``name``.

        Args:
            name: one of the supported short names (see class docstring).
                Any other value leaves ``strategie`` as None; the error is
                reported when ``fit_resample`` is called.
            random_state: optional seed forwarded to the samplers that
                generate synthetic points (SMOTE family and the combined
                methods). The default ``None`` keeps the unseeded behaviour.
        """
        self.strategie = None
        self.name = name

        # Factories are lazy so only the requested sampler is instantiated.
        # NOTE(review): recent imbalanced-learn releases deprecate ``n_jobs``
        # on SMOTE / BorderlineSMOTE / ADASYN - confirm against the installed
        # version before upgrading.
        builders = {
            "enn": lambda: EditedNearestNeighbours(sampling_strategy='auto',
                                                   n_neighbors=3,
                                                   kind_sel='all',
                                                   n_jobs=-1),
            "allknn": lambda: AllKNN(sampling_strategy='auto',
                                     n_neighbors=3,
                                     kind_sel='all',
                                     allow_minority=False,
                                     n_jobs=-1),
            "renn": lambda: RepeatedEditedNearestNeighbours(sampling_strategy='auto',
                                                            n_neighbors=3,
                                                            max_iter=100,
                                                            kind_sel='all',
                                                            n_jobs=-1),
            "tomek": lambda: TomekLinks(sampling_strategy='auto',
                                        n_jobs=-1),
            "smote": lambda: SMOTE(sampling_strategy='auto',
                                   k_neighbors=5,
                                   n_jobs=-1,
                                   random_state=random_state),
            "bdsmote": lambda: BorderlineSMOTE(n_jobs=-1,
                                               random_state=random_state),
            "adasyn": lambda: ADASYN(sampling_strategy='auto',
                                     n_neighbors=5,
                                     n_jobs=-1,
                                     random_state=random_state),
            "smoteenn": lambda: SMOTEENN(sampling_strategy='auto',
                                         smote=None,
                                         enn=None,
                                         n_jobs=-1,
                                         random_state=random_state),
            "smotetomek": lambda: SMOTETomek(sampling_strategy='auto',
                                             smote=None,
                                             tomek=None,
                                             random_state=random_state),
        }

        # Non-string names never matched the original comparisons either.
        builder = builders.get(name) if isinstance(name, str) else None
        if builder is not None:
            self.strategie = builder()

    def fit_resample(self, x, y):
        """Resample ``x``/``y`` with the configured sampler.

        Args:
            x: feature matrix accepted by the underlying sampler.
            y: target labels aligned with ``x``.

        Returns:
            Tuple ``(x_res, y_res)``. For the "origin" name the inputs are
            returned unchanged.

        Raises:
            ValueError: if no sampler is configured and the name is not
                "origin" (i.e. the name given at construction is unknown).
        """
        if self.strategie is None:
            if self.name == self.PASSTHROUGH:
                return x, y
            raise ValueError(
                "Unknown resampling strategy %r; expected one of %s or %r"
                % (self.name, ", ".join(self.SUPPORTED), self.PASSTHROUGH))

        x_res, y_res = self.strategie.fit_resample(x, y)
        return x_res, y_res

In [None]:
class DataSet:
    """Builds stratified TF-IDF train/test folds and exports them as CSV.

    For every fold the class writes, below ``base_dir``/data_<num_data>/:
        test/class_test(<i>).csv, test/tfidf_test(<i>).csv
        train/<resampling>_class_train(<i>).csv, train/<resampling>_tfidf_train(<i>).csv
        detail/dataset_detail(<i>).csv   (per-class sample counts per column)
    """

    # Root folder that receives the data_<n>/{train,test,detail} exports.
    base_dir = '/content/drive/MyDrive/PIBIC/datasets/'

    # Row labels of the detail table; position i names encoded class i.
    # NOTE(review): this presumes the 'Class' column holds exactly these 11
    # labels so that LabelEncoder's codes line up with the list positions -
    # confirm against the dataset (any extra label would shift the rows).
    class_labels = ['A', 'FT', 'L', 'LF', 'MN', 'O', 'PE', 'PO', 'SC', 'SE', 'US']

    def __init__(self, base_dir=None):
        """Optionally redirect the exports to another root folder.

        Args:
            base_dir: root output folder; ``None`` keeps the class default.
        """
        if base_dir is not None:
            self.base_dir = base_dir

    def _fold_dir(self, num_data, section):
        """Return (and create if missing) <base_dir>/data_<num_data>/<section>."""
        folder = Path(self.base_dir) / ('data_' + str(num_data)) / section
        folder.mkdir(parents=True, exist_ok=True)
        return folder

    def define_datasets(self, data, num_data, random_state=None):
        """Split ``data`` into 10 stratified folds and export each of them.

        Args:
            data: mapping/DataFrame with 'RequirementText' and 'Class' columns.
            num_data: identifier used in the output folder name (data_<num_data>).
            random_state: optional seed for the shuffle split; ``None`` keeps
                the unseeded behaviour.
        """
        resamplings = ['origin', 'tomek', 'smote', 'bdsmote', 'adasyn', 'smotetomek']
        k_folds = 10

        # Wrap in Series so the fold indices can be applied positionally
        # (.iloc) regardless of the index the input carries.
        X = pd.Series(data['RequirementText'])
        y = pd.Series(data['Class'])

        # Fit the label encoding once on every label: the codes are then
        # identical in all folds and a test label can never be "unseen".
        encoder = LabelEncoder().fit(y)

        cv = StratifiedShuffleSplit(n_splits=k_folds, test_size=0.2,
                                    random_state=random_state)

        for fold, (train, test) in enumerate(cv.split(X, y), start=1):
            y_train = encoder.transform(y.iloc[train])
            y_test = encoder.transform(y.iloc[test])

            # The vocabulary and IDF weights come from the training texts only.
            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(X.iloc[train])

            x_train = tokenizer.texts_to_matrix(X.iloc[train], mode='tfidf')
            x_test = tokenizer.texts_to_matrix(X.iloc[test], mode='tfidf')

            self.export_data_test(num_data, fold, x_test, y_test)

            for resampling in resamplings:
                self.export_data_train(resampling, num_data, fold, x_train, y_train)

    def export_data_train(self, resampling, num_data, index, x_train, y_train):
        """Resample one training fold (unless 'origin') and write it to CSV.

        Args:
            resampling: strategy name passed to ``Resampling``; 'origin'
                exports the fold untouched.
            num_data: identifier used in the output folder name.
            index: fold number used in the file names.
            x_train: TF-IDF feature matrix of the fold.
            y_train: encoded class labels of the fold.
        """
        train_dir = self._fold_dir(num_data, 'train')
        class_path = train_dir / (resampling + '_class_train(' + str(index) + ').csv')
        tfidf_path = train_dir / (resampling + '_tfidf_train(' + str(index) + ').csv')

        if resampling != 'origin':
            x_train, y_train = Resampling(resampling).fit_resample(x_train, y_train)

        pd.DataFrame({'Class': y_train}).to_csv(class_path)
        pd.DataFrame(x_train).to_csv(tfidf_path)

        self.update_dataset_detail(resampling, y_train, num_data, index)

    def export_data_test(self, num_data, index, x_test, y_test):
        """Write the test part of one fold to CSV and record its class counts.

        Args:
            num_data: identifier used in the output folder name.
            index: fold number used in the file names.
            x_test: TF-IDF feature matrix of the fold.
            y_test: encoded class labels of the fold.
        """
        test_dir = self._fold_dir(num_data, 'test')

        pd.DataFrame({'Class': y_test}).to_csv(test_dir / ('class_test(' + str(index) + ').csv'))
        pd.DataFrame(x_test).to_csv(test_dir / ('tfidf_test(' + str(index) + ').csv'))

        self.update_dataset_detail(None, y_test, num_data, index, test=True)

    def update_dataset_detail(self, resampling, y, num_data, index, test=False):
        """Add (or refresh) one column of class counts in the fold's detail CSV.

        Args:
            resampling: column name for a training export; ignored when
                ``test`` is true.
            y: encoded class labels to count.
            num_data: identifier used in the output folder name.
            index: fold number used in the file name.
            test: when true the counts are stored in the 'test' column.
        """
        path = self._fold_dir(num_data, 'detail') / ('dataset_detail(' + str(index) + ').csv')

        # The test split always goes to the 'test' column, even when the file
        # already exists (previously that case produced a column named None).
        column = 'test' if test else resampling
        counts = self.count_classes(y)

        if path.is_file():
            df = pd.read_csv(path)
            df[column] = counts
        else:
            df = pd.DataFrame({'classes': self.class_labels + ['total'],
                               column: counts})

        # to_csv overwrites in place, so no prior delete is needed.
        df.to_csv(path, index=False)

    def count_classes(self, y):
        """Count the samples of each encoded class.

        Args:
            y: encoded class labels (integers 0..len(class_labels)-1).

        Returns:
            List with one count per class, in code order, followed by their
            sum. Classes absent from ``y`` are reported as 0.
        """
        frame = pd.DataFrame(y)
        values = frame.value_counts().to_dict() if len(frame) else {}

        count = []
        for code in range(len(self.class_labels)):
            # Single-column value_counts keys are 1-tuples; the scalar lookup
            # is a fallback in case a pandas version yields plain keys.
            count.append(int(values.get((code,), values.get(code, 0))))
        count.append(sum(count))

        return count

In [None]:
if __name__ == '__main__':
    # Entry point: read the preprocessed PROMISE_exp CSV from the mounted
    # drive and export all folds under data_3. Every warning is silenced for
    # the whole session before the export starts.
    warnings.filterwarnings('ignore')
    dataset = DataSet()
    dataset.define_datasets(
        data=pd.read_csv(
            '/content/drive/MyDrive/PIBIC/datasets/PROMISE_exp_preprocessed.csv',
            encoding='utf-8',
        ),
        num_data=3,
    )