# Baseline Recommender

# Importing libraries

In [77]:
import pandas as pd
import surprise
from surprise import Reader, Dataset, SVD
import numpy as np
from sklearn.model_selection import GroupShuffleSplit, ShuffleSplit

## Data Importing

In [2]:
# Load the assay entries table from the CSV file and preview its first rows
df = pd.read_csv("df_assay_entries.csv")
df.head()

Unnamed: 0,aid,cid,smiles,activity
0,891,3232584,CCNC1=NC=C2C(=N1)N(C(=O)C(=N2)C3=CC=CC(=C3)C#N...,active
1,891,3232585,COC1=CC=C(C=C1)OC2=NC=C3C(=N2)N(C(=O)C(=N3)C4=...,inactive
2,891,3232587,COC1=CC=CC=C1C2=NC3=CC=CC=C3C(=N2)NCC4=CC=CC=C4,active
3,891,3232589,C1CN(CCC12CCN(CC2)C(=O)OC3=CC=CC=C3)C4=CC=CC=C4,inactive
4,891,3232590,COCCN1C2=NC(=NC=C2N=C(C1=O)C3=CC=CC(=C3)C#N)N4...,inactive


In [3]:
# check the size of the imported data (number of rows in df)
len(df)

41620091

In [40]:
# Pull the cid column out of the dataframe as a NumPy array.
# NOTE(review): `test` is rebound later by the data_transform_split(0) call,
# so this looks like a scratch check — confirm it is not needed elsewhere.
test = df['cid'].to_numpy()

## Data Converting

In [104]:
def data_transform_split(data_mode:int, split_mode:int=0, path:str="df_assay_entries.csv"):
    '''
    Load the assay entries csv and split it into a train and a test set.

    The split is a single 80/20 train/test split with a fixed random seed (0),
    so repeated calls on the same file give the same partition.

    Parameters
    ----------
    data_mode : int
        defines if the desired output is a surprise data package (0) or the
        torch_geometric data (1). Mode 1 is not implemented yet and returns None.
    split_mode : int (optional)
        determines which split mode is used: 0=random split over entries,
        1=moleculewise (grouped by cid), 2=assaywise (grouped by aid)
    path : str (optional)
        path and filename of the csv containing the chemistry dataset; the
        columns 'aid', 'cid' and 'activity' are read from it

    Returns in case of data_mode=0
    ------------------------------
    trainset : surprise Trainset class
        Contains the data to train on
    testset : list of tuples with format (aid, cid, rating)
        Contains the data to test the Recommender algorithm on

    Returns in case of data_mode=1
    ------------------------------
    None

    Raises
    ------
    ValueError
        If split_mode is outside 0..2 or data_mode is outside 0..1. Raised
        before the csv is read.
    '''
    # Validate with explicit exceptions instead of assert statements, which
    # are removed when Python runs with -O. Done before any file access.
    if not 0 <= split_mode <= 2:
        raise ValueError(f"split_mode must be 0, 1 or 2, got {split_mode!r}")
    if not 0 <= data_mode <= 1:
        raise ValueError(f"data_mode must be 0 or 1, got {data_mode!r}")

    # import data
    df = pd.read_csv(path)

    if split_mode == 0:
        # random split over individual entries; the cid array is only used to
        # tell the splitter how many rows there are
        splitter = ShuffleSplit(n_splits=1, random_state=0, test_size=0.2, train_size=None)
        split = splitter.split(df['cid'].to_numpy(), None)
    else:
        # groupwise split: mode 1 keeps all rows of a molecule (cid) together,
        # mode 2 keeps all rows of an assay (aid) together
        group_column = 'cid' if split_mode == 1 else 'aid'
        groups = df[group_column].to_numpy()
        splitter = GroupShuffleSplit(n_splits=1, random_state=0, test_size=0.2, train_size=None)
        split = splitter.split(groups, None, groups=groups)

    # n_splits=1, so the generator yields exactly one pair of positional
    # index arrays
    train_ind, test_ind = next(split)

    if data_mode == 0:
        # surprise needs a numeric rating: 1 for 'active', 0 for anything else
        # (vectorized; a per-row lambda is very slow on tens of millions of rows)
        df['rating'] = (df['activity'] == 'active').astype(int)
        # define reader to convert the pandas dataframe to the surprise package
        reader = Reader(rating_scale=(0, 1))
        columns = ['aid', 'cid', 'rating']
        # the index arrays are positional, hence iloc
        trainset = Dataset.load_from_df(df.iloc[train_ind][columns], reader).build_full_trainset()
        # surprise builds a testset from a Trainset, so take that detour for
        # the held-out rows
        testset = Dataset.load_from_df(df.iloc[test_ind][columns], reader).build_full_trainset().build_testset()
        return trainset, testset

    # data_mode == 1: building the GNN edge set is not implemented yet
    return None

In [105]:
# NOTE(review): data_transform_split returns (trainset, testset) in that order,
# so `test` receives the surprise Trainset and `train` the testset list here.
# The names look swapped — confirm before relying on them.
test, train = data_transform_split(0)

In [108]:
# Number of ratings held by the object bound to `test`, i.e. the first value
# returned by data_transform_split (the surprise Trainset)
test.n_ratings

33296072

In [112]:
# Tally the entries of `train` whose third field (the rating) equals 1
count = 0
for i in train:
    if int(i[2]) != 1:
        # rating is not 1: does not contribute to the tally
        continue
    count += 1
# bare expression so the notebook displays the result
count

122449