# Importing the Schmidt et al. abundance database into PostgreSQL

This program imports data from the Schmidt et al. 2015 database into a PostgreSQL database and merges it with the IntAct database.

Rijksuniversiteit Groningen, 2018

C.M. Punter (c.m.punter@rug.nl)

## Imports, connecting to PostgreSQL and assigning variables

In [1]:
import psycopg2
import urllib.request
import os

In [2]:
# specify the username, password and the name of the database to connect to
# NOTE(review): the credentials are hard-coded in the notebook; consider
# reading them from the environment before sharing this file.

username = 'postgres'
password = 'password'
database = 'intact_20201208'

# specify path to the database file ('sabu.txt' in the current working directory)

path = os.path.join(os.getcwd(), 'sabu.txt')

# connect to the database
# Keyword arguments are used instead of a hand-formatted DSN string so that
# values containing spaces, quotes or backslashes are passed through intact.

conn = psycopg2.connect(dbname=database, user=username, password=password)

# specify column names and data types
# Each entry pairs a column of the abundance table with its SQL type; the two
# parallel lists used by the rest of the notebook are derived from these pairs
# so that a name and its type can never drift out of alignment.

_COLUMN_SPECS = [
    ('sabu_Uniprot', 'text unique'),
    ('sabu_Description', 'text'),
    ('sabu_Gene', 'text'),
    ('sabu_Peptides', 'text'),
    ('sabu_Confidence_score', 'real'),
    ('weight', 'real'),
    ('sabu_Dataset', 'integer'),
    ('sabu_Glycerol_number_of_proteins_per_cell', 'integer'),
    ('sabu_Glycerol_fg_protein_per_cell', 'real'),
    ('sabu_Glycerol_coeffcient_of_variance', 'real'),
    ('sabu_Bnumber', 'text'),
    ('sabu_Annotated_functional_COG_groups', 'text'),
    ('sabu_Annotated_functional_COG_group', 'text'),
    ('sabu_Annotated_functional_COG_class', 'text'),
]

column_names = [spec[0] for spec in _COLUMN_SPECS]

column_types = [spec[1] for spec in _COLUMN_SPECS]

## Functions

In [3]:
# create SQL tables out of the imported file

def create_tables():
    """Create the ``abundance`` table if it does not exist yet.

    The table gets a serial primary key ``id``, an ``identifier_id`` column
    referencing the ``identifiers`` table (with ``on delete cascade``), and
    one column per entry of ``column_names`` typed by the matching entry of
    ``column_types``. The statement is committed on the module-level
    connection ``conn``.
    """
    # The names and types are module-level constants, not user input, so they
    # are interpolated directly into the DDL statement.
    column_defs = ['%s %s' % (name, sql_type)
                   for name, sql_type in zip(column_names, column_types)]
    ddl = ('create table if not exists abundance ('
           'id serial primary key, '
           'identifier_id integer references identifiers on delete cascade, '
           + ', '.join(column_defs) + ')')

    # The context manager closes the cursor even if the statement fails.
    with conn.cursor() as cur:
        cur.execute(ddl)
    conn.commit()


def get_identifier_id(identifier):
    """Look up ``identifier`` in the ``identifiers`` table.

    Args:
        identifier: value compared against the ``identifier`` column.

    Returns:
        Whatever ``cursor.fetchone()`` yields for the query selecting ``id``:
        the first matching row (a one-element sequence holding the id), or
        ``None`` when no row matches. Note that this is the row itself, not
        the bare id.
    """
    # The context manager closes the cursor even if the query raises.
    with conn.cursor() as cur:
        cur.execute("select id from identifiers where identifier = %s", (identifier, ))
        return cur.fetchone()


def import_line(line):
    """Insert one tab-separated line of the abundance file into ``abundance``.

    The line is split on tabs and padded with empty strings up to the number
    of entries in ``column_names``; fields equal to ``'NA'`` are stored as
    SQL NULL. The first field is looked up through ``get_identifier_id`` to
    fill ``identifier_id`` (NULL when there is no match). The insert uses
    ``on conflict do nothing`` and is committed immediately.

    Args:
        line: one data line of the file, without its trailing newline.

    Raises:
        Any exception raised by the database driver is re-raised after the
        pending transaction has been rolled back.
    """
    n_columns = len(column_names)

    columns = line.split('\t')
    # make sure every table column receives a value
    columns += [''] * (n_columns - len(columns))
    # 'NA' marks a missing value in the file
    columns = [None if value == 'NA' else value for value in columns]

    # get_identifier_id returns the fetched row (or None), so unwrap the id
    # instead of handing the whole row to the driver as a parameter.
    row = get_identifier_id(columns[0])
    identifier_id = row[0] if row else None

    # one placeholder per table column plus one for identifier_id
    placeholders = ', '.join(['%s'] * (n_columns + 1))
    sql = 'insert into abundance (identifier_id, %s) values ' % ', '.join(column_names)
    sql += '(%s) on conflict do nothing' % placeholders

    try:
        # The context manager closes the cursor even if the insert fails.
        with conn.cursor() as cur:
            cur.execute(sql, (identifier_id, *columns))
        conn.commit()
    except Exception:
        # Leave the shared connection usable for the next statement.
        conn.rollback()
        raise


def import_from_file(path):
    """Import every data line of the file at ``path`` via ``import_line``.

    The first line is treated as a header and skipped. Blank lines are
    ignored. An empty file is accepted and imports nothing.

    Args:
        path: location of the text file to import.
    """
    with open(path) as f:
        # skip header line; the default keeps an empty file from raising
        # StopIteration
        next(f, None)
        for line in f:
            # remove \n (and any other trailing whitespace) at the end of
            # the line
            line = line.rstrip()
            if not line:
                # a blank line carries no record; forwarding it would try to
                # insert a row made only of empty strings
                continue
            import_line(line)

## Execution

In [4]:
# Create the abundance table first (nothing is created if it already exists),
# then load the file at `path` into it; the import relies on the table existing.
create_tables()
import_from_file(path)