# Baseline Recommender

# Importing libraries

In [156]:
import pandas as pd
import surprise
from surprise import Reader, Dataset, SVD
import numpy as np
from sklearn.model_selection import GroupShuffleSplit, ShuffleSplit
import torch
from torch_geometric.data import Data

## Data Importing

In [2]:
# Load the full assay table. Besides aid, cid, smiles and activity, the CSV also
# carries a leftover "Unnamed: 0" index column (visible in the preview below).
df = pd.read_csv("df_assay_entries.csv")
df.head()

Unnamed: 0,aid,cid,smiles,activity
0,891,3232584,CCNC1=NC=C2C(=N1)N(C(=O)C(=N2)C3=CC=CC(=C3)C#N...,active
1,891,3232585,COC1=CC=C(C=C1)OC2=NC=C3C(=N2)N(C(=O)C(=N3)C4=...,inactive
2,891,3232587,COC1=CC=CC=C1C2=NC3=CC=CC=C3C(=N2)NCC4=CC=CC=C4,active
3,891,3232589,C1CN(CCC12CCN(CC2)C(=O)OC3=CC=CC=C3)C4=CC=CC=C4,inactive
4,891,3232590,COCCN1C2=NC(=NC=C2N=C(C1=O)C3=CC=CC(=C3)C#N)N4...,inactive


In [3]:
# check size of imported data
len(df)

41620091

In [40]:
# Scratch: pull the cid column out as a NumPy array, the same kind of array that
# data_transform_split hands to the sklearn splitters. The name `test` is
# rebound by a later cell.
test = df['cid'].to_numpy()

## Data Converting

In [166]:
def _split_indices(df, split_mode):
    # Return (train_ind, test_ind): positional row indices of one 80/20 split of df.
    if split_mode == 0:
        # plain random split over the entries; only the number of rows matters here
        splitter = ShuffleSplit(n_splits=1, random_state=0, test_size=0.2, train_size=None)
        split = splitter.split(df['cid'].to_numpy(), None)
    else:
        # mode 1 keeps all rows of one cid together, mode 2 all rows of one aid
        groups = df['cid' if split_mode == 1 else 'aid'].to_numpy()
        splitter = GroupShuffleSplit(n_splits=1, random_state=0, test_size=0.2, train_size=None)
        split = splitter.split(groups, None, groups=groups)
    # n_splits=1, so the generator yields exactly one (train, test) pair
    return next(split)


def _build_edge_index(df):
    # Return (edge_index, num_nodes) of the undirected assay--compound graph of active rows.
    # ID TRANSLATION PART
    # np.unique returns the sorted unique ids; the inverse array is the rank of every
    # row's id among them, i.e. the translated id for each row without a lookup table
    aid_values, aid_nodes = np.unique(df['aid'].to_numpy(), return_inverse=True)
    cid_values, cid_nodes = np.unique(df['cid'].to_numpy(), return_inverse=True)
    aid_count = aid_values.shape[0]
    cid_count = cid_values.shape[0]
    # assays occupy 0 .. aid_count-1, so the compounds have to occupy
    # aid_count .. aid_count+cid_count-1 for the node ids to be unique
    cid_nodes = cid_nodes + aid_count
    # PROCESSING PART
    # the edges are the connections between aid and cid which are ACTIVE,
    # inactive edges do not exist in this model
    active = (df['activity'] == 'active').to_numpy()
    maid = aid_nodes[active]
    mcid = cid_nodes[active]
    # two directed edges per active row (one per direction) signal an undirected edge;
    # the width is the number of active ROWS times two (DataFrame.size would be rows*columns)
    edges = np.empty(shape=(2, 2 * maid.shape[0]), dtype=np.int64)
    # even columns: cid -> aid, odd columns: aid -> cid
    edges[0, 0::2] = mcid
    edges[1, 0::2] = maid
    edges[0, 1::2] = maid
    edges[1, 1::2] = mcid
    return torch.tensor(edges, dtype=torch.long), aid_count + cid_count


def data_transform_split(data_mode:int, split_mode:int=0, path:str="df_assay_entries.csv"):
    '''
    A function that reads the chemistry dataset and turns it into the data package selected by data_mode.
    For data_mode=0 the data is split into train and test set according to split_mode.

    Parameters
    ----------
    data_mode : int
        defines if the desired output is a surprise data package (0) or the torch_geometric data (1)
    split_mode : int (optional)
        determines which split mode is used: 0=random split entries, 1=moleculewise (grouped by cid),
        2=assaywise (grouped by aid). The test share is 20% with a fixed random_state of 0.
    path : str (optional)
        path and filename of the csv containing the chemistry dataset;
        the columns aid, cid and activity are read

    Returns in case of data_mode=0
    ------------------------------
    trainset : surprise Trainset class
        Contains the data to train on
    testset : list of tuples with format (aid, cid, rating)
        Contains the data to test the Recommender algorithm on

    Returns in case of data_mode=1
    ------------------------------
    data : torch_geometric Data class
        Graph with one node per aid (ids 0..n_aid-1) and one node per cid (ids n_aid..n_aid+n_cid-1)
        and two directed edges per active entry. The train/test split is NOT applied to the graph yet.

    Raises
    ------
    ValueError
        if data_mode or split_mode is outside the accepted range
    '''
    # validate with explicit exceptions; assert statements vanish under `python -O`
    if split_mode not in (0, 1, 2):
        raise ValueError(f"split_mode must be 0, 1 or 2, got {split_mode!r}")
    if data_mode not in (0, 1):
        raise ValueError(f"data_mode must be 0 or 1, got {data_mode!r}")
    # import data
    df = pd.read_csv(path)
    # indexes of the split data for the requested split type
    train_ind, test_ind = _split_indices(df, split_mode)
    # left to do is use them and create the data package of choice
    if data_mode==0:
        # data mode of surprise package
        # the column activity is remodeled to 0 and 1 entries; only the three columns
        # surprise needs are kept so the SMILES strings are not copied by iloc
        ratings = df[['aid', 'cid']].assign(rating=(df['activity'] == 'active').astype(int))
        # define reader to convert pandas dataframe to surprise package
        reader = Reader(rating_scale=(0,1))
        # convert dataset importing only the entries from the respective index list
        trainset = Dataset.load_from_df(ratings.iloc[train_ind], reader).build_full_trainset()
        testset = Dataset.load_from_df(ratings.iloc[test_ind], reader).build_full_trainset().build_testset()
        return trainset, testset
    # build GNN edge set
    edges, node_count = _build_edge_index(df)
    # TODO
    # do here something with SMILES and rdkit
    # create pytorch geometric Data object; num_nodes is given explicitly so that
    # ids which never appear in an active entry still count as nodes
    data = Data(edge_index=edges, num_nodes=node_count)
    # TODO
    # test and training mask/pos edge, neg edge... (train_ind / test_ind are not used yet)
    # create Dataset
    return data

In [105]:
# NOTE(review): data_transform_split returns (trainset, testset) in that order,
# so after this unpacking `test` holds the surprise Trainset and `train` holds
# the list of test tuples -- the two names are swapped.
test, train = data_transform_split(0)

In [108]:
# `test` was unpacked from the first return value of data_transform_split, i.e.
# the Trainset, so this is the number of ratings in the training split.
test.n_ratings

33296072

In [115]:
# Report whether any entry of `train` has a first field equal to 1.
# NOTE(review): `train` was unpacked from the second return value of
# data_transform_split, whose docstring gives the tuple format as
# (aid, cid, rating). Index 0 would then be the assay id; if the intent was to
# look for active ratings, the field to inspect is index 2 -- confirm.
count = any(int(entry[0]) == 1 for entry in train)
count

False

In [125]:
n = df['aid'].to_numpy()

In [128]:
# np.unique already returns the unique values in sorted order
n1 = np.unique(n)

In [129]:
n1.shape

(2481,)

In [131]:
c

array([        4,         6,        11, ..., 146170511, 146170512,
       146170513], dtype=int64)

In [132]:
c.shape

(455079,)

In [134]:
# Number of distinct assay ids in the table. data_transform_split uses the same
# count as the starting point of the compound node ids.
aid_count = np.unique(df['aid'].to_numpy()).shape[0]
aid_count

2481

In [None]:
cid_count

In [135]:
df[df.activity=='active']

Unnamed: 0,aid,cid,smiles,activity,rating
0,891,3232584,CCNC1=NC=C2C(=N1)N(C(=O)C(=N2)C3=CC=CC(=C3)C#N...,active,1
2,891,3232587,COC1=CC=CC=C1C2=NC3=CC=CC=C3C(=N2)NCC4=CC=CC=C4,active,1
7,891,3232595,C1OC2=C(O1)C=C(C=C2)C3=CN=CN=C3NCC4=CC=CC=C4,active,1
9,891,3232600,COC1=CC2=C(C=C1)NC=C2CCNC3=NC(=NC=C3)C4=COC=C4,active,1
11,891,3232604,CC1=C(C(=NO1)C)C2=CC3=C(C=C2)N=CN=C3NCCC4=CNC5...,active,1
...,...,...,...,...,...
41620085,1479148,636397,C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC...,active,1
41620087,1479148,4724,CC(C)(C)NCC(COC1=CC=CC=C1C2CCCC2)O,active,1
41620088,1479148,6708778,COC1=CC(=CC(=C1O)OC)[C@H]2[C@@H]3C(COC3=O)C(C4...,active,1
41620089,1479148,54728271,CC1=NN=C(O1)C(=O)NC(C)(C)C2=NC(=C(C(=O)N2C)[O-...,active,1


In [140]:
# Map every distinct cid to its 0-based rank among the sorted unique cids.
# NOTE(review): these ids start at 0. If the aid ids also start at 0 (as they do
# inside data_transform_split), the two id ranges overlap when both are used as
# node ids of one graph -- confirm whether an offset by the number of aids is wanted.
c = np.unique(df['cid'].to_numpy())
c_n = np.arange(len(c))
cid_translation_dictionary = dict(zip(c, c_n))

In [143]:
cid_translation_dictionary[7529]

2967

In [152]:
# Build the undirected assay--compound edge list from the active entries only.
active_rows = df[df.activity=='active']
# space needed for the edges in array. *2 because we need to have 2 edges per entry signaling an undirected edge
# (count the ROWS: DataFrame.size is rows*columns and would leave zero-filled
# columns behind, which end up as spurious 0 -> 0 edges)
active_count = len(active_rows)*2
# translate the raw ids of all active rows in one pass; an id missing from a
# dictionary still raises KeyError
mcid = np.fromiter((cid_translation_dictionary[raw] for raw in active_rows['cid']), dtype=np.int64, count=len(active_rows))
maid = np.fromiter((aid_translation_dictionary[raw] for raw in active_rows['aid']), dtype=np.int64, count=len(active_rows))
# initialize the edges array as integers right away (node ids are not floats)
edges = np.empty(shape=(2,active_count), dtype=np.int64)
# even columns hold the cid -> aid edge of an entry
edges[0, 0::2]=mcid
edges[1, 0::2]=maid
# odd columns hold the reverse aid -> cid edge
edges[0, 1::2]=maid
edges[1, 1::2]=mcid
# transform edges to torch object
edges = torch.tensor(edges, dtype=torch.long)
# TODO 
# do here something with SMILES and rdkit
# create pytorch geometric Data object
data = Data(edge_index=edges)
# TODO
# test and training mask/pos edge, neg edge...

In [154]:
cid_translation_dictionary[p.cid]

185036

In [155]:
p.cid

3232584

In [157]:
# Small hand-written 2x4 integer tensor used as a toy edge list
# (presumably the directed pairs 0->1, 1->0, 1->2, 2->1 read column by column).
edge_index = torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]], dtype=torch.long)

In [158]:
edge_index

tensor([[0, 1, 1, 2],
        [1, 0, 2, 1]])

In [159]:
df[df.activity=='active'].size

3061285

In [163]:
# Scratch check of np.zeros: without a dtype argument the array holds floats
# (see the "0." entries in the output further down). This rebinds `edges`,
# replacing the tensor built in the earlier cell.
edges = np.zeros(shape=(2,5))

In [164]:
edges[0,3]=4

In [165]:
edges

array([[0., 0., 0., 4., 0.],
       [0., 0., 0., 0., 0.]])