In [None]:
# Mount Google Drive at /content/drive so that files stored under
# /content/drive/MyDrive (the dataset spreadsheet is read from there) are
# reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the two packages imported by this notebook that are installed
# explicitly here: fasttext (sentence vectors) and lazypredict (LazyClassifier).
# NOTE(review): versions are not pinned, so a re-run may pick up newer releases.
!pip install fasttext
!pip install lazypredict

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296181 sha256=36e8e8cebc1cf3e69bc620f4f2f43788e42b9d1821c4258ed1b8d96e479cdb73
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, KFold
import fasttext
from lazypredict.Supervised import LazyClassifier
from transformers import BertTokenizer
import gensim
import gensim.downloader
from transformers import AlbertTokenizer

# Load the user-story spreadsheet from the mounted Drive folder. The rest of
# the notebook reads the "Domain" and "User Story" columns of this frame.
dataset = pd.read_excel("/content/drive/MyDrive/domain/Synthetic User Stories.xlsx")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Encode every domain name as an integer class id and store it in the
# "Target" column. Ids follow the order in which each domain first appears in
# the "Domain" column, i.e. the same numbering as the position of the value in
# dataset["Domain"].unique().
#
# pd.factorize does this in a single vectorised pass instead of recomputing
# unique() and scanning it once per row.
if dataset["Domain"].isna().any():
    # factorize would silently label missing domains as -1; fail loudly instead.
    raise ValueError("The 'Domain' column contains missing values; cannot build 'Target'.")
dataset["Target"] = pd.factorize(dataset["Domain"])[0]
dataset["Target"]

Unnamed: 0,Target
0,0
1,1
2,0
3,1
4,0
...,...
12396,37
12397,36
12398,37
12399,36


In [None]:
def getTrainSetFastText():
    """Build a feature table from fastText sentence vectors.

    Loads the model file ``fasttext_model.bin`` and asks it for the sentence
    vector of every entry in ``dataset['User Story']``.

    Returns:
        pandas.DataFrame: one row per user story and one column per vector
        component, with the column labels converted to strings.
    """
    model = fasttext.load_model("fasttext_model.bin")
    sentence_vectors = [
        model.get_sentence_vector(story) for story in dataset['User Story']
    ]
    features = pd.DataFrame(sentence_vectors)
    # String column labels, as in the other getTrainSet* helpers.
    features.columns = features.columns.astype(str)
    return features

def getTrainSetTFIDF():
    """Build a TF-IDF feature table from the user stories.

    A ``CountVectorizer`` limited to 100 features is fitted on
    ``dataset['User Story']``; the resulting count matrix is then re-weighted
    with a ``TfidfTransformer``.

    Returns:
        pandas.DataFrame: one row per user story, one column per retained
        term, with the column labels converted to strings.
    """
    stories = dataset['User Story']
    term_counts = CountVectorizer(max_features=100).fit_transform(stories).toarray()
    tfidf_matrix = TfidfTransformer().fit_transform(term_counts).toarray()

    features = pd.DataFrame(tfidf_matrix)
    features.columns = features.columns.astype(str)
    return features

def getTrainSetBERT():
    """Build a feature table from BERT token ids.

    Tokenizes ``dataset['User Story']`` with the ``bert-base-uncased``
    tokenizer (``padding=True``, ``truncation=True``, ``max_length=100``) and
    uses the resulting ``input_ids`` directly as features.

    Returns:
        pandas.DataFrame: one row per user story, one column per token
        position, with the column labels converted to strings.
    """
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    stories = dataset['User Story'].tolist()
    encoded = bert_tokenizer(stories, padding=True, truncation=True, max_length=100)

    # Only the token ids are kept; other tokenizer outputs are not used.
    features = pd.DataFrame([token_ids for token_ids in encoded['input_ids']])
    features.columns = features.columns.astype(str)
    return features

def getTrainSetRoBERTa():
    """Build a feature table from RoBERTa token ids.

    Tokenizes ``dataset['User Story']`` with the ``roberta-base`` tokenizer
    (``padding=True``, ``truncation=True``, ``max_length=100``) and uses the
    resulting ``input_ids`` directly as features.

    Returns:
        pandas.DataFrame: one row per user story, one column per token
        position, with the column labels converted to strings.
    """
    # The notebook's import cell only brings in BertTokenizer and
    # AlbertTokenizer, so import the RoBERTa tokenizer here; without this the
    # function fails with NameError on a fresh kernel.
    from transformers import RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    tokenized_data = tokenizer(dataset['User Story'].tolist(), padding=True, truncation=True, max_length=100)

    # Only the token ids are kept; other tokenizer outputs are not used.
    traindata = pd.DataFrame(list(tokenized_data['input_ids']))
    traindata.columns = traindata.columns.astype(str)
    return traindata

def getTrainSetALBERT():
    """Build a feature table from ALBERT token ids.

    Tokenizes ``dataset['User Story']`` with the ``albert-base-v2`` tokenizer
    (``padding=True``, ``truncation=True``, ``max_length=100``) and uses the
    resulting ``input_ids`` directly as features.

    Returns:
        pandas.DataFrame: one row per user story, one column per token
        position, with the column labels converted to strings.
    """
    albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    encoded = albert_tokenizer(
        dataset['User Story'].tolist(),
        padding=True,
        truncation=True,
        max_length=100,
    )

    id_rows = []
    id_rows.extend(encoded['input_ids'])

    features = pd.DataFrame(id_rows)
    features.columns = features.columns.astype(str)
    return features

def getTrainSetDistilBERT():
    """Build a feature table from DistilBERT token ids.

    Tokenizes ``dataset['User Story']`` with the ``distilbert-base-uncased``
    tokenizer (``padding=True``, ``truncation=True``, ``max_length=100``) and
    uses the resulting ``input_ids`` directly as features.

    Returns:
        pandas.DataFrame: one row per user story, one column per token
        position, with the column labels converted to strings.
    """
    # The notebook's import cell only brings in BertTokenizer and
    # AlbertTokenizer, so import the DistilBERT tokenizer here; without this
    # the function fails with NameError on a fresh kernel.
    from transformers import DistilBertTokenizer

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    tokenized_data = tokenizer(dataset['User Story'].tolist(), padding=True, truncation=True, max_length=100)

    # Only the token ids are kept; other tokenizer outputs are not used.
    traindata = pd.DataFrame(list(tokenized_data['input_ids']))
    traindata.columns = traindata.columns.astype(str)
    return traindata

def getTrainSetWord2Vec():
    """Build a feature table from averaged word2vec vectors.

    Loads ``word2vec-google-news-300.bin`` (binary word2vec format). For each
    entry of ``dataset['User Story']`` the text is split on whitespace, every
    word present in the model contributes the first 100 components of its
    vector, and those slices are averaged. A story with no known word is
    represented by 100 zeros.

    Returns:
        pandas.DataFrame: one row per user story and 100 columns, with the
        column labels converted to strings.
    """
    keyed_vectors = gensim.models.KeyedVectors.load_word2vec_format(
        'word2vec-google-news-300.bin', binary=True
    )

    rows = []
    for story in dataset['User Story']:
        # Keep only the leading 100 components of each known word's vector.
        known = [keyed_vectors[token][:100] for token in story.split() if token in keyed_vectors]
        rows.append(sum(known) / len(known) if known else [0] * 100)

    features = pd.DataFrame(rows)
    features.columns = features.columns.astype(str)
    return features

def getTrainSetGlove():
    """Build a feature table from averaged GloVe vectors.

    Obtains the ``glove-wiki-gigaword-100`` vectors through
    ``gensim.downloader``. For each entry of ``dataset['User Story']`` the
    text is split on whitespace and the vectors of the words present in the
    model are averaged. A story with no known word is represented by 100
    zeros.

    Returns:
        pandas.DataFrame: one row per user story, one column per vector
        component, with the column labels converted to strings.
    """
    embeddings = gensim.downloader.load('glove-wiki-gigaword-100')

    def _story_vector(story):
        # Average the vectors of the in-vocabulary words of a single story.
        found = [embeddings[token] for token in story.split() if token in embeddings]
        if not found:
            return [0] * 100
        return sum(found) / len(found)

    features = pd.DataFrame([_story_vector(story) for story in dataset['User Story']])
    features.columns = features.columns.astype(str)
    return features

In [None]:
# 10-fold cross-validation of every LazyClassifier model on one feature set.
#
# For each fold, LazyClassifier fits its model collection on the training
# split and scores it on the held-out split. One record per (fold, model) is
# collected and the result table is built once at the end, so it has exactly
# one row per evaluated model: no fixed row budget and no empty filler rows.
fold = KFold(n_splits=10, random_state=6666, shuffle=True)
X = getTrainSetALBERT()  # Swap in another getTrainSet* helper to evaluate a different embedding.
y = dataset['Target']

records = []
for foldcounter, (train_index, test_index) in enumerate(fold.split(X, y), start=1):
    print("Processing Fold "+ str(foldcounter) + " ...")
    # KFold yields positional indices, so slice by position.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    for model_name, scores in models.iterrows():
        records.append({
            "Fold": foldcounter,
            "Model": model_name,
            # Explicit positional access: the 1st and 4th score columns.
            # NOTE(review): presumably "Accuracy" and "F1 Score" in lazypredict's
            # score table; confirm against the installed version.
            "Accuracy": round(scores.iloc[0], 3),
            "F1-Score": round(scores.iloc[3], 3),
        })

result2 = pd.DataFrame(records, columns=["Fold","Model","Accuracy","F1-Score"])
result2

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Processing Fold 1 ...


 97%|█████████▋| 30/31 [29:54<00:07,  7.98s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005437 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12398
[LightGBM] [Info] Number of data points in the train set: 11160, number of used features: 83
[LightGBM] [Info] Start training from score -3.514956
[LightGBM] [Info] Start training from score -3.514956
[LightGBM] [Info] Start training from score -3.517973
[LightGBM] [Info] Start training from score -3.500008
[LightGBM] [Info] Start training from score -3.536266
[LightGBM] [Info] Start training from score -3.511949
[LightGBM] [Info] Start training from score -3.530131
[LightGBM] [Info] Start training from score -3.502980
[LightGBM] [Info] Start training from score -3.524033
[LightGBM] [Info] Start training from score -3.517973
[LightGBM] [Info] Start training from score -3.517973
[LightGBM] [Info] Start training from score -3.50596

100%|██████████| 31/31 [30:08<00:00, 58.34s/it]


Processing Fold 2 ...


 97%|█████████▋| 30/31 [30:38<00:07,  7.76s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12364
[LightGBM] [Info] Number of data points in the train set: 11161, number of used features: 83
[LightGBM] [Info] Start training from score -3.551860
[LightGBM] [Info] Start training from score -3.551860
[LightGBM] [Info] Start training from score -3.494181
[LightGBM] [Info] Start training from score -3.539437
[LightGBM] [Info] Start training from score -3.551860
[LightGBM] [Info] Start training from score -3.530221
[LightGBM] [Info] Start training from score -3.518062
[LightGBM] [Info] Start training from score -3.527167
[LightGBM] [Info] Start training from score -3.527167
[LightGBM] [Info] Start training from score -3.518062
[LightGBM] [Info] Start training from score -3.512038
[LightGBM] [Info] Start training from score -3.55812

100%|██████████| 31/31 [30:52<00:00, 59.76s/it]


Processing Fold 3 ...


 97%|█████████▋| 30/31 [32:20<00:07,  7.76s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004853 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12391
[LightGBM] [Info] Number of data points in the train set: 11161, number of used features: 80
[LightGBM] [Info] Start training from score -3.515046
[LightGBM] [Info] Start training from score -3.533283
[LightGBM] [Info] Start training from score -3.491235
[LightGBM] [Info] Start training from score -3.518062
[LightGBM] [Info] Start training from score -3.515046
[LightGBM] [Info] Start training from score -3.536356
[LightGBM] [Info] Start training from score -3.548740
[LightGBM] [Info] Start training from score -3.512038
[LightGBM] [Info] Start training from score -3.524123
[LightGBM] [Info] Start training from score -3.551860
[LightGBM] [Info] Start training from score -3.536356
[LightGBM] [Info] Start training from score -3.51806

100%|██████████| 31/31 [32:34<00:00, 63.04s/it]


Processing Fold 4 ...


 97%|█████████▋| 30/31 [33:19<00:07,  7.99s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004877 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12359
[LightGBM] [Info] Number of data points in the train set: 11161, number of used features: 82
[LightGBM] [Info] Start training from score -3.500098
[LightGBM] [Info] Start training from score -3.518062
[LightGBM] [Info] Start training from score -3.564439
[LightGBM] [Info] Start training from score -3.515046
[LightGBM] [Info] Start training from score -3.533283
[LightGBM] [Info] Start training from score -3.530221
[LightGBM] [Info] Start training from score -3.518062
[LightGBM] [Info] Start training from score -3.521088
[LightGBM] [Info] Start training from score -3.530221
[LightGBM] [Info] Start training from score -3.509040
[LightGBM] [Info] Start training from score -3.542529
[LightGBM] [Info] Start training from score -3.52716

100%|██████████| 31/31 [33:33<00:00, 64.95s/it]


Processing Fold 5 ...


 97%|█████████▋| 30/31 [33:04<00:07,  7.47s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004707 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12387
[LightGBM] [Info] Number of data points in the train set: 11161, number of used features: 83
[LightGBM] [Info] Start training from score -3.521088
[LightGBM] [Info] Start training from score -3.527167
[LightGBM] [Info] Start training from score -3.515046
[LightGBM] [Info] Start training from score -3.542529
[LightGBM] [Info] Start training from score -3.533283
[LightGBM] [Info] Start training from score -3.515046
[LightGBM] [Info] Start training from score -3.506050
[LightGBM] [Info] Start training from score -3.530221
[LightGBM] [Info] Start training from score -3.583609
[LightGBM] [Info] Start training from score -3.524123
[LightGBM] [Info] Start training from score -3.512038
[LightGBM] [Info] Start training from score -3.50605

100%|██████████| 31/31 [33:18<00:00, 64.48s/it]


Processing Fold 6 ...


 97%|█████████▋| 30/31 [32:49<00:07,  7.74s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004939 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12418
[LightGBM] [Info] Number of data points in the train set: 11161, number of used features: 83
[LightGBM] [Info] Start training from score -3.533283
[LightGBM] [Info] Start training from score -3.506050
[LightGBM] [Info] Start training from score -3.530221
[LightGBM] [Info] Start training from score -3.548740
[LightGBM] [Info] Start training from score -3.551860
[LightGBM] [Info] Start training from score -3.545629
[LightGBM] [Info] Start training from score -3.509040
[LightGBM] [Info] Start training from score -3.509040
[LightGBM] [Info] Start training from score -3.506050
[LightGBM] [Info] Start training from score -3.536356
[LightGBM] [Info] Start training from score -3.512038
[LightGBM] [Info] Start training from score -3.53635

100%|██████████| 31/31 [33:03<00:00, 64.00s/it]


Processing Fold 7 ...


 97%|█████████▋| 30/31 [32:26<00:07,  7.62s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004983 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12408
[LightGBM] [Info] Number of data points in the train set: 11161, number of used features: 83
[LightGBM] [Info] Start training from score -3.545629
[LightGBM] [Info] Start training from score -3.518062
[LightGBM] [Info] Start training from score -3.524123
[LightGBM] [Info] Start training from score -3.542529
[LightGBM] [Info] Start training from score -3.512038
[LightGBM] [Info] Start training from score -3.542529
[LightGBM] [Info] Start training from score -3.503070
[LightGBM] [Info] Start training from score -3.536356
[LightGBM] [Info] Start training from score -3.527167
[LightGBM] [Info] Start training from score -3.536356
[LightGBM] [Info] Start training from score -3.521088
[LightGBM] [Info] Start training from score -3.50009

100%|██████████| 31/31 [32:40<00:00, 63.25s/it]


Processing Fold 8 ...


 97%|█████████▋| 30/31 [32:18<00:08,  8.29s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006118 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12403
[LightGBM] [Info] Number of data points in the train set: 11161, number of used features: 83
[LightGBM] [Info] Start training from score -3.524123
[LightGBM] [Info] Start training from score -3.539437
[LightGBM] [Info] Start training from score -3.567608
[LightGBM] [Info] Start training from score -3.491235
[LightGBM] [Info] Start training from score -3.503070
[LightGBM] [Info] Start training from score -3.539437
[LightGBM] [Info] Start training from score -3.551860
[LightGBM] [Info] Start training from score -3.533283
[LightGBM] [Info] Start training from score -3.500098
[LightGBM] [Info] Start training from score -3.527167
[LightGBM] [Info] Start training from score -3.539437
[LightGBM] [Info] Start training from score -3.56127

100%|██████████| 31/31 [32:31<00:00, 62.95s/it]


Processing Fold 9 ...


 97%|█████████▋| 30/31 [31:47<00:07,  7.16s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005293 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12391
[LightGBM] [Info] Number of data points in the train set: 11161, number of used features: 83
[LightGBM] [Info] Start training from score -3.545629
[LightGBM] [Info] Start training from score -3.542529
[LightGBM] [Info] Start training from score -3.506050
[LightGBM] [Info] Start training from score -3.524123
[LightGBM] [Info] Start training from score -3.521088
[LightGBM] [Info] Start training from score -3.524123
[LightGBM] [Info] Start training from score -3.518062
[LightGBM] [Info] Start training from score -3.518062
[LightGBM] [Info] Start training from score -3.545629
[LightGBM] [Info] Start training from score -3.533283
[LightGBM] [Info] Start training from score -3.551860
[LightGBM] [Info] Start training from score -3.52108

100%|██████████| 31/31 [32:02<00:00, 62.01s/it]


Processing Fold 10 ...


 97%|█████████▋| 30/31 [32:30<00:07,  7.59s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12448
[LightGBM] [Info] Number of data points in the train set: 11161, number of used features: 83
[LightGBM] [Info] Start training from score -3.506050
[LightGBM] [Info] Start training from score -3.506050
[LightGBM] [Info] Start training from score -3.521088
[LightGBM] [Info] Start training from score -3.536356
[LightGBM] [Info] Start training from score -3.527167
[LightGBM] [Info] Start training from score -3.536356
[LightGBM] [Info] Start training from score -3.554990
[LightGBM] [Info] Start training from score -3.567608
[LightGBM] [Info] Start training from score -3.491235
[LightGBM] [Info] Start training from score -3.503070
[LightGBM] [Info] Start training from score -3.539437
[LightGBM] [Info] Start training from score -3.52412

100%|██████████| 31/31 [32:45<00:00, 63.41s/it]


Unnamed: 0,Fold,Model,Accuracy,F1-Score
0,1,BaggingClassifier,0.99,0.99
1,1,XGBClassifier,0.98,0.98
2,1,DecisionTreeClassifier,0.98,0.98
3,1,ExtraTreesClassifier,0.92,0.92
4,1,RandomForestClassifier,0.86,0.85
...,...,...,...,...
295,,,,
296,,,,
297,,,,
298,,,,


In [None]:
# Save the collected per-fold model scores to an Excel workbook in the
# current working directory.
result2.to_excel("albert_domain.xlsx")