In [1]:
import pandas as pd
import sys

sys.path.append("..")

from sklearn.model_selection import train_test_split
from src.utils import project_root



In [7]:
def split_dataset(
    train_path: str, test_size: float, random_state: int = 2
) -> "tuple[pd.Series, pd.Series, pd.Series, pd.Series]":
    """Split the labelled training CSV into local train and test parts.

    The CSV is read without a header row; its two columns are named
    ``label`` and ``node_id``, in that order.

    Args:
        train_path: Path of the training CSV, appended to ``project_root()``.
        test_size: Size of the held-out part, forwarded unchanged to
            ``train_test_split`` (e.g. ``0.1`` for a 10% hold-out).
        random_state: Seed forwarded to ``train_test_split``. Defaults to 2,
            the value that used to be hard-coded here.

    Returns:
        ``(x_train, x_test, y_train, y_test)``: the ``x_*`` entries are
        ``node_id`` series, the ``y_*`` entries the matching ``label`` series.
    """
    labelled = pd.read_csv(project_root() + train_path, header=None, names=["label", "node_id"])
    # Columns are picked by name below, so no column reordering is needed.
    x_train, x_test, y_train, y_test = train_test_split(
        labelled["node_id"], labelled["label"], test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test

In [2]:
# Hold out a test_size=0.1 share of the labelled training rows for local evaluation.
x_train, x_test, y_train, y_test = split_dataset("/data/train.csv", test_size=0.1)

NameError: name 'split_dataset' is not defined

In [3]:
def make_sets(
    train_path: str, text_path: str, nodeid2paper_path: str, test_path: str, test_size: float
) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """Build the training set, the local test set and the final test set.

    All paths are appended to ``project_root()``. The train, text and test
    CSVs are read without a header row; the node-to-paper mapping is read with
    its header and its ``node idx`` / ``paper id`` columns are renamed to
    ``node_id`` / ``paper_id``.

    The local split is positional: the last ``test_size`` share of the rows
    of the training file (in file order, no shuffling) becomes the local test
    part. Each part is then inner-joined with the text table on ``paper_id``,
    so rows without a matching text entry are dropped.

    Args:
        train_path: CSV with ``label`` and ``node_id`` columns.
        text_path: CSV with ``paper_id``, ``title`` and ``abstract`` columns.
        nodeid2paper_path: CSV mapping ``node idx`` to ``paper id``.
        test_path: CSV with a single ``node_id`` column.
        test_size: Share of the training rows held out locally; must be < 1.

    Returns:
        ``(training_set, local_test_set, test_set)``. The first two have the
        columns ``node_id, title, abstract, label``; ``test_set`` has
        ``node_id, title, abstract``.

    Raises:
        AssertionError: If ``test_size`` is not smaller than 1.
    """
    assert test_size < 1, "Test size must be smaller than 1"
    root = project_root()

    # reading files
    train_df = pd.read_csv(root + train_path, header=None, names=["label", "node_id"])
    nodeid2paperid = pd.read_csv(root + nodeid2paper_path).rename(
        columns={"node idx": "node_id", "paper id": "paper_id"}
    )
    text_df = pd.read_csv(root + text_path, header=None, names=["paper_id", "title", "abstract"])
    test_df = pd.read_csv(root + test_path, header=None, names=["node_id"])

    # attach the paper id to every node
    train_df = train_df[["node_id", "label"]].merge(nodeid2paperid, on="node_id", how="left")
    test_df = test_df.merge(nodeid2paperid, on="node_id", how="left")

    # positional split for local use; the cut point is computed once so both
    # halves are guaranteed to meet at the same row
    split_at = int(len(train_df) - (len(train_df) * test_size))
    training_df = train_df.iloc[:split_at, :]
    testing_df = train_df.iloc[split_at:, :]

    # join the texts and keep only the model-facing columns
    labelled_columns = ["node_id", "title", "abstract", "label"]
    training_set = text_df.merge(training_df, how="inner", on="paper_id")[labelled_columns]
    local_test_set = text_df.merge(testing_df, how="inner", on="paper_id")[labelled_columns]
    test_set = text_df.merge(test_df, how="inner", on="paper_id")[["node_id", "title", "abstract"]]
    return training_set, local_test_set, test_set

In [4]:
# Build the three frames from the project CSVs, holding out a 0.1 share of the training rows locally.
training_set, local_test_set, test_set = make_sets("/data/train.csv", "/data/text.csv", "/data/nodeid2paperid.csv", "/data/test.csv", 0.1)


In [9]:
# Number of distinct values in the `label` column of the training set.
training_set.label.nunique()

20

In [5]:
from scipy import sparse
import pandas as pd
import numpy as np

# Random sparse stand-in with one row per training example, so the row count
# follows the training frame instead of being hard-coded. The seed keeps the
# matrix identical across kernel restarts.
sp = sparse.rand(len(training_set), 10000, random_state=0)
print(training_set.shape)
print(sp.shape)
# np.hstack((training_set, sp))

(54000, 4)
(54000, 10000)


In [15]:
# Put the sparse matrix columns side by side with the training frame.
# axis=1 aligns rows on the index, so both operands need matching row labels.
pd.concat([pd.DataFrame.sparse.from_spmatrix(sp), training_set], axis=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9994,9995,9996,9997,9998,9999,node_id,title,abstract,label
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,104447,spreadsheets on the move an evaluation of mobi...,The power of mobile devices has increased dram...,6
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,15858,multi view metric learning for multi view vide...,Traditional methods on video summarization are...,16
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,107156,big data analytics in future internet of things,Current research on Internet of Things (IoT) m...,5
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,82077,cryptographic hardening of d sequences,This paper shows how a one-way mapping using m...,4
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,42436,gesture based continuous authentication for we...,We study the feasibility of touch gesture beha...,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53995,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.377827,0.0,0.0,0.0,0.0,82076,co optimizing performance and memory footprint...,"Cutting-edge embedded system applications, suc...",5
53996,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,2630,fusionlane multi sensor fusion for lane markin...,It is a crucial step to achieve effective sema...,16
53997,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,29101,heatnet bridging the day night domain gap in s...,The majority of learning-based semantic segmen...,16
53998,0.0,0.0,0.682094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,47784,vulnerabilities of connectionist ai applicatio...,This article deals with the IT security of con...,4


In [6]:
# Display the training frame.
training_set

Unnamed: 0,node_id,title,abstract,label
0,104447,spreadsheets on the move an evaluation of mobi...,The power of mobile devices has increased dram...,6
1,15858,multi view metric learning for multi view vide...,Traditional methods on video summarization are...,16
2,107156,big data analytics in future internet of things,Current research on Internet of Things (IoT) m...,5
3,82077,cryptographic hardening of d sequences,This paper shows how a one-way mapping using m...,4
4,42436,gesture based continuous authentication for we...,We study the feasibility of touch gesture beha...,4
...,...,...,...,...
53995,82076,co optimizing performance and memory footprint...,"Cutting-edge embedded system applications, suc...",5
53996,2630,fusionlane multi sensor fusion for lane markin...,It is a crucial step to achieve effective sema...,16
53997,29101,heatnet bridging the day night domain gap in s...,The majority of learning-based semantic segmen...,16
53998,47784,vulnerabilities of connectionist ai applicatio...,This article deals with the IT security of con...,4


In [18]:
# One flat list of texts: every title first, followed by every abstract.
title = training_set["title"].tolist()
abstract = training_set["abstract"].tolist()
concatenated = title + abstract


In [32]:
# Peek at the last two entries of the combined text list.
concatenated[-2:]

['This article deals with the IT security of connectionist artificial intelligence (AI) applications, focusing on threats to integrity, one of the three IT security goals. Such threats are for instance most relevant in prominent AI computer vision applications. In order to present a holistic view on the IT security goal integrity, many additional aspects such as interpretability, robustness and documentation are taken into account. A comprehensive list of threats and possible mitigations is presented by reviewing the state-of-the-art literature. AI-specific vulnerabilities such as adversarial attacks and poisoning attacks as well as their AI-specific root causes are discussed in detail. Additionally and in contrast to former reviews, the whole AI supply chain is analysed with respect to vulnerabilities, including the planning, data acquisition, training, evaluation and operation phases. The discussion of mitigations is likewise not restricted to the level of the AI system itself but ra

In [25]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
# Fetch the stop-word corpus before it is read below.
nltk.download('stopwords')

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Tokens are runs of word characters (\w+); everything else acts as a separator.
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")

[nltk_data] Downloading package stopwords to /home/marc/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [37]:
# Scratch run of the tokenise-and-filter step on the last two texts only.
# NOTE(review): `voc` is reassigned on every pass, so after the loop it holds
# the filtered tokens of the final text alone, not a combined vocabulary.
for sentence in concatenated[-2:]:
    # Lower-case the text and split it into tokens (rebinds the loop variable).
    sentence = tokenizer.tokenize(sentence.lower())
    # Keep only the tokens that are not stop words.
    voc = [word for word in sentence if word not in stop_words]
    
    



In [47]:
import nltk
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm


def create_vocabulary(training_set: pd.DataFrame) -> np.ndarray:
    """Collect the unique non-stop-word tokens of all titles and abstracts.

    Every text is lower-cased and split into ``\\w+`` tokens; English stop
    words are removed from the result.

    Args:
        training_set: Frame with ``title`` and ``abstract`` columns. Cells
            that are not strings (e.g. missing values) are skipped.

    Returns:
        Sorted array of the distinct remaining tokens, as produced by
        ``np.unique``.
    """
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r"\w+")
    # titles first, then abstracts, as one list so tqdm knows the total
    texts = [*training_set.title.tolist(), *training_set.abstract.tolist()]
    # A set keeps memory proportional to the vocabulary rather than to the
    # total number of tokens in the corpus.
    vocabulary = set()
    for text in tqdm(texts):
        if not isinstance(text, str):
            # missing cell: nothing to tokenise
            continue
        vocabulary.update(tokenizer.tokenize(text.lower()))
    # Removing stop words once at the end gives the same set as filtering per token.
    vocabulary -= stop_words
    return np.unique(list(vocabulary))

In [48]:
# Build the vocabulary from the titles and abstracts of the training set.
voc = create_vocabulary(training_set)

100%|██████████| 108000/108000 [00:10<00:00, 9872.58it/s]


In [46]:
# Number of entries in the vocabulary.
len(voc)

array(['according', 'approach', 'appropriately', 'art', 'caricature',
       'caricatures', 'cavi', 'characters', 'conduct', 'conventional',
       'cross', 'datasets', 'decade', 'demonstrate', 'distortions', 'due',
       'dynamic', 'easy', 'effectiveness', 'enables', 'exaggerating',
       'experimental', 'extreme', 'face', 'facial', 'far', 'features',
       'fixed', 'hard', 'heterogeneous', 'images', 'importance',
       'instead', 'introduced', 'largely', 'learn', 'learning', 'made',
       'method', 'methods', 'modal', 'modalities', 'multi', 'non',
       'paper', 'performance', 'performances', 'problem', 'progress',
       'propose', 'proposed', 'proposes', 'rather', 'realistic', 'recent',
       'recognition', 'result', 'results', 'rigid', 'show', 'significant',
       'state', 'strengthen', 'stuck', 'studied', 'superiority', 'task',
       'tasks', 'train', 'training', 'unlike', 'via', 'visual',
       'webcaricature', 'weights', 'well', 'work'], dtype='<U13')

In [1]:
import gensim.downloader as api
from gensim.models import KeyedVectors

# return_path=True makes api.load hand back the location of the downloaded
# file instead of a loaded model; the vectors are loaded from it separately.
path_word2vec = api.load("word2vec-google-news-300", return_path=True)



In [3]:
# Load the vectors directly from the file at `path_word2vec`;
# binary=True selects the binary word2vec file format.
model = KeyedVectors.load_word2vec_format(path_word2vec, binary=True)

In [19]:
import pandas as pd

# Select the `title` column of the training frame.
# NOTE(review): the saved output under this cell lists both title and
# abstract, which this expression alone would not produce; the output looks
# stale, re-run to confirm.
training_set.title

Unnamed: 0,title,abstract
0,spreadsheets on the move an evaluation of mobi...,The power of mobile devices has increased dram...
1,multi view metric learning for multi view vide...,Traditional methods on video summarization are...
2,big data analytics in future internet of things,Current research on Internet of Things (IoT) m...
3,cryptographic hardening of d sequences,This paper shows how a one-way mapping using m...
4,gesture based continuous authentication for we...,We study the feasibility of touch gesture beha...
...,...,...
53995,co optimizing performance and memory footprint...,"Cutting-edge embedded system applications, suc..."
53996,fusionlane multi sensor fusion for lane markin...,It is a crucial step to achieve effective sema...
53997,heatnet bridging the day night domain gap in s...,The majority of learning-based semantic segmen...
53998,vulnerabilities of connectionist ai applicatio...,This article deals with the IT security of con...


In [11]:
# Pull the first item that `dataloader` yields.
# NOTE(review): `dataloader` is not defined in any visible cell, so the cause
# of the saved KeyError cannot be determined from here; check how the
# underlying dataset indexes its rows.
next(iter(dataloader))

KeyError: 0