diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index dfe0770..0000000
--- a/.gitattributes
+++ /dev/null
@@ -1,2 +0,0 @@
-# Auto detect text files and perform LF normalization
-* text=auto
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/Dataset Descriptions.docx b/Dataset Descriptions.docx
new file mode 100644
index 0000000..2d8a8dc
Binary files /dev/null and b/Dataset Descriptions.docx differ
diff --git a/config.cf b/config.cf
new file mode 100644
index 0000000..8a9a989
--- /dev/null
+++ b/config.cf
@@ -0,0 +1,25 @@
+[database]
+# dbuser: Username
+# dbpass: Password
+# dbhost: Host name or IP
+# dbport: Port
+# dbname: Database name
+
+dbuser = postgres
+dbpass = root
+dbhost = localhost
+dbport = 5432
+dbname = prueba_work
+
+
+
+[data]
+# dir_data: Directory with the Semantic Scholar data files
+# ncpu: Number of cores for parallel processing.
+#       Set to 0 to deactivate parallel processing
+# chunksize: Size of chunks for paper processing and database ingestion
+
+dir_data = ../SemanticScholar/
+ncpu = 4
+chunksize = 100000
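For reference, a minimal sketch of how these settings are consumed (the section and option names are the ones defined in config.cf above; everything else is illustrative):

    from configparser import ConfigParser

    cf = ConfigParser()
    cf.read("config.cf")

    # Assemble the SQLAlchemy connection URL used by the import scripts
    url = (
        f"postgresql://{cf.get('database', 'dbuser')}:{cf.get('database', 'dbpass')}"
        f"@{cf.get('database', 'dbhost')}:{cf.get('database', 'dbport')}"
        f"/{cf.get('database', 'dbname')}"
    )
    print(url)  # postgresql://postgres:root@localhost:5432/prueba_work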
diff --git a/dbManager/S2manager.py b/dbManager/S2manager.py
new file mode 100644
index 0000000..b4e135f
--- /dev/null
+++ b/dbManager/S2manager.py
@@ -0,0 +1,765 @@
+"""
+Data manager for importing Semantic Scholar papers
+into a Postgres database
+
+Jul 2021
+
+@authors: Jerónimo Arenas García (jeronimo.arenas@uc3m.es)
+          José Antonio Espinosa Melchor (joespino@pa.uc3m.es)
+
+"""
+
+import gzip
+import json
+import re
+from functools import partial
+from multiprocessing import Pool
+
+import numpy as np
+import pandas as pd
+import requests
+from sqlalchemy import create_engine, sql
+from tqdm import tqdm
+
+try:
+    # Wide (UCS-4) build: match astral-plane characters directly
+    regex = re.compile("[\U00010000-\U0010ffff]")
+except re.error:
+    # Narrow (UCS-2) build: match them as surrogate pairs
+    regex = re.compile("[\uD800-\uDBFF][\uDC00-\uDFFF]")
+
+"""
+Some functions need to be defined outside the class to allow parallel
+processing of the Semantic Scholar files: module-level functions can be
+pickled and shipped to worker processes, whereas instance methods cannot.
+"""
+
+
+def ElementInList(source_list, search_string):
+    """ Return True if search_string is contained in source_list """
+    return search_string in source_list
+
+
+def normalize(data):
+    """ Strip surrounding whitespace; return None if nothing is left """
+    data = data.strip()
+    if len(data) > 0:
+        return data
+    return None
+
+
+def get_gzfiles(dir_data):
+    """ Get the s2-corpus-* data files in the given directory, sorted by name """
+    return sorted([el for el in dir_data.iterdir() if el.name.startswith("s2-corpus")])
+
+
+def read_papers_infile(gz_file):
+    """ Load the paper records contained in one gzipped corpus file """
+    try:
+        # The file holds one JSON object per line; join them into a JSON array
+        with gzip.open(gz_file, "rt", encoding="utf8") as f:
+            papers_infile = f.read().replace("}\n{", "},{")
+    except (OSError, EOFError):
+        print(f"Error with file {gz_file}")
+        return []
+    papers_infile = json.loads("[" + papers_infile + "]")
+    return papers_infile
+
+
+def process_paper(paperEntry):
+    """
+    This function takes a dictionary with paper information as input
+    and returns a list to insert in S2papers
+    """
+    try:
+        year = int(paperEntry["year"])
+    except (KeyError, TypeError, ValueError):
+        year = 9999
+
+    try:
+        magid = int(paperEntry["magid"])
+    except (KeyError, TypeError, ValueError):
+        magid = np.nan
+    try:
+        pmid = int(paperEntry["pmid"])
+    except (KeyError, TypeError, ValueError):
+        pmid = np.nan
+
+    paper_list = [
+        paperEntry["id"],
+        regex.sub(" ", paperEntry["title"]),
+        regex.sub(" ", paperEntry["title"].lower()),
+        regex.sub(" ", paperEntry["paperAbstract"]),
+        regex.sub(" ", paperEntry["s2Url"]),
+        "\t".join(paperEntry["pdfUrls"]),
+        year,
+        ElementInList(paperEntry["sources"], "DBLP"),
+        ElementInList(paperEntry["sources"], "Medline"),
+        paperEntry["doi"],
+        paperEntry["doiUrl"],
+        pmid,
+        magid,
+    ]
+
+    return paper_list
+
+
+def process_paperFile(gzf):
+    """
+    Process a Semantic Scholar gzip file and extract a list of
+    journals, a list of venues, a list of fields of study, and a
+    list with paper information to save in the S2papers table
+
+    Parameters
+    ----------
+    gzf: String
+        Name of the file to process
+
+    Returns
+    -------
+    A list containing 4 lists: papers in file, unique venues in file,
+    unique journals in file, unique fields in file
+    """
+
+    # Get papers in file
+    papers_infile = read_papers_infile(gzf)
+
+    # Extract venues getting rid of repetitions
+    thisfile_venues = set([normalize(paper["venue"]) for paper in papers_infile])
+    thisfile_venues.discard(None)
+
+    # Extract journals getting rid of repetitions
+    thisfile_journals = set(
+        [normalize(paper["journalName"]) for paper in papers_infile]
+    )
+    thisfile_journals.discard(None)
+
+    # Extract all fields, and flatten before getting rid of repetitions
+    # Flattening is necessary because each paper has a list of fields
+    thisfile_fields = set(
+        [normalize(item) for paper in papers_infile for item in paper["fieldsOfStudy"]]
+    )
+    thisfile_fields.discard(None)
+
+    # Extract the row values for the S2papers table
+    thisfile_papers = [process_paper(el) for el in papers_infile]
+
+    return [thisfile_papers, thisfile_venues, thisfile_journals, thisfile_fields]
+
+
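+# Illustrative sketch (not part of the pipeline): the record below is a
+# hypothetical, minimal corpus entry showing the row layout that
+# process_paper() produces for the S2papers table:
+#
+#     entry = {"id": "f" * 40, "title": "A Title", "paperAbstract": "...",
+#              "s2Url": "https://www.semanticscholar.org/paper/...",
+#              "pdfUrls": [], "year": "2020", "sources": ["DBLP"],
+#              "doi": None, "doiUrl": None, "pmid": "", "magid": ""}
+#     row = process_paper(entry)
+#     # -> [id, title, lowertitle, abstract, s2Url, "", 2020, True, False,
+#     #     None, None, nan, nan]
+
+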
+def process_Citations(gzf):
+    """
+    This function takes a gzipped corpus file as input and returns a
+    list of (citing, cited) pairs ready to insert in the citations table
+    """
+
+    # Get papers in file
+    papers_infile = read_papers_infile(gzf)
+
+    # Process each paper
+    cite_list = []
+    for paperEntry in papers_infile:
+        if len(paperEntry["outCitations"]):
+            for el in paperEntry["outCitations"]:
+                cite_list.append([paperEntry["id"], el])
+    return cite_list
+
+
+def process_Authors(gzf):
+    """
+    This function takes a gzipped corpus file as input and returns a
+    list of all authors in the file ready to insert in the S2authors table
+    """
+
+    # Get papers in file
+    papers_infile = read_papers_infile(gzf)
+
+    # Process each paper
+    thisfile_authors = []
+    for paperEntry in papers_infile:
+        if len(paperEntry["authors"]):
+            for author in paperEntry["authors"]:
+                auth_id = author["ids"]
+                auth_nm = normalize(author["name"])
+                if len(auth_id) and auth_nm is not None:
+                    thisfile_authors.append((int(auth_id[0]), auth_nm))
+    return thisfile_authors
+
+
+def process_Authorship(gzf):
+    """
+    This function takes a gzipped corpus file as input and returns a
+    list ready to insert in paperAuthor (paper-author information)
+    """
+
+    # Get papers in file
+    papers_infile = read_papers_infile(gzf)
+
+    # Get list of authors for each paper in file
+    lista_paper_author = []
+    for paper in papers_infile:
+        author_list = [
+            (paper["id"], int(el["ids"][0]))
+            for el in paper["authors"]
+            if len(el["ids"])
+        ]
+        lista_paper_author += author_list
+
+    return lista_paper_author
+
+
+def process_Fields(gzf, venues_dict, journals_dict, fields_dict, papers_set):
+    """
+    This function takes a gzipped corpus file as input and returns
+    lists ready to insert in paperField, paperJournal and paperVenue
+    """
+
+    # Get papers in file
+    papers_infile = read_papers_infile(gzf)
+
+    papers_fields = []
+    papers_journals = []
+    papers_venues = []
+
+    for paper in papers_infile:
+        paper_id = paper["id"]
+        # Skip papers that are not in the database
+        if paper_id in papers_set:
+            # Fields
+            fields_list = []
+            for el in paper["fieldsOfStudy"]:
+                el = normalize(el)
+                try:
+                    fields_list.append(
+                        {"S2paperID": paper_id, "fieldID": fields_dict[el]}
+                    )
+                except KeyError:
+                    pass
+            papers_fields.extend(fields_list)
+
+            # Journal
+            journal = normalize(paper["journalName"])
+            journal_vol = normalize(paper["journalVolume"])
+            journal_pag = normalize(paper["journalPages"])
+            try:
+                papers_journals.append(
+                    {
+                        "S2paperID": paper_id,
+                        "journalID": journals_dict[journal],
+                        "journalVolume": journal_vol,
+                        "journalPages": journal_pag,
+                    }
+                )
+            except KeyError:
+                pass
+
+            # Venue
+            venue = normalize(paper["venue"])
+            try:
+                papers_venues.append(
+                    {"S2paperID": paper_id, "venueID": venues_dict[venue]}
+                )
+            except KeyError:
+                pass
+
+    return papers_fields, papers_journals, papers_venues
+
+
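+# Illustrative sketch (hypothetical toy values): the lookup arguments map
+# names to the surrogate IDs created in the database, and papers_set holds
+# the S2paperIDs already ingested, e.g.:
+#
+#     venues_dict   = {"NeurIPS": 1, "ACL": 2}
+#     journals_dict = {"Nature": 1}
+#     fields_dict   = {"Computer Science": 1}
+#     papers_set    = {"f" * 40}
+#     fields, journals, venues = process_Fields(
+#         gzf, venues_dict, journals_dict, fields_dict, papers_set
+#     )
+
+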
+def get_sources(paper, stype="references"):
+    """ Use the Semantic Scholar API to obtain the requested information.
+    Reference/Citation fields:
+        -intents
+        -isInfluential
+        -paperId
+
+    Parameters
+    ----------
+    paper: String
+        Semantic Scholar unique identifier.
+    stype: String
+        {"references", "citations"}. Default: "references"
+
+    """
+
+    # Initialize return
+    df_reference = pd.DataFrame(
+        columns=[
+            "S2paperID1",
+            "S2paperID2",
+            "isInfluential",
+            "BackgrIntent",
+            "MethodIntent",
+            "ResultIntent",
+        ]
+    )
+
+    # Query configuration
+    offset = 0
+    limit = 1000
+    more_pages = True
+
+    ref_list = []
+
+    while more_pages:
+        # Request one page of references/citations
+        query = f"https://api.semanticscholar.org/graph/v1/paper/{paper}/{stype}?offset={offset}&limit={limit}&fields=intents,isInfluential,paperId"
+        resp = requests.get(url=query)
+        data = resp.json()
+
+        # Keep paging while the response contains a `next` offset
+        try:
+            offset = data["next"]
+        except KeyError:
+            more_pages = False
+
+        # Get references
+        try:
+            aux = pd.DataFrame.from_dict(data["data"])
+
+            # Get reference paper ID
+            if stype == "references":
+                aux["S2paperID1"] = paper
+                aux["S2paperID2"] = aux["citedPaper"].apply(
+                    lambda x: x.get("paperId", np.nan)
+                )
+            else:
+                aux["S2paperID1"] = aux["citingPaper"].apply(
+                    lambda x: x.get("paperId", np.nan)
+                )
+                aux["S2paperID2"] = paper
+
+            # Expand the list of intents into three boolean columns
+            def split_intents(row):
+                intents = {
+                    "background": [False],
+                    "methodology": [False],
+                    "result": [False],
+                }
+                for el in row:
+                    intents[el] = [True]
+                return pd.DataFrame.from_dict(intents)
+
+            aux[["BackgrIntent", "MethodIntent", "ResultIntent"]] = pd.concat(
+                aux["intents"].apply(split_intents).values.tolist()
+            ).reset_index(drop=True)
+
+            aux = aux[
+                [
+                    "S2paperID1",
+                    "S2paperID2",
+                    "isInfluential",
+                    "BackgrIntent",
+                    "MethodIntent",
+                    "ResultIntent",
+                ]
+            ]
+
+            ref_list.append(aux)
+
+        except Exception as e:
+            print(e)
+
+    if ref_list:
+        return pd.concat([df_reference] + ref_list, ignore_index=True)
+    return df_reference
+
+
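+# Illustrative usage sketch (the paper ID below is hypothetical; a real call
+# hits the public Semantic Scholar Graph API and may be rate limited):
+#
+#     df = get_sources("0" * 40, stype="references")
+#     df.head()
+#     #   S2paperID1  S2paperID2  isInfluential  BackgrIntent  MethodIntent  ...
+
+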
+class S2manager:
+    def __init__(self, dbuser, dbpass, dbhost, dbport, dbname):
+
+        # Database configuration
+        self.dbuser = dbuser
+        self.dbpass = dbpass
+        self.dbhost = dbhost
+        self.dbport = dbport
+        self.dbname = dbname
+        self.engine = create_engine(
+            f"postgresql://{dbuser}:{dbpass}@{dbhost}:{dbport}/{dbname}"
+        )
+
+    def create_database(self, file):
+        """ Create the database tables from an SQL script """
+
+        with self.engine.connect() as con:
+            with open(file) as f:
+                query = sql.text(f.read()).execution_options(autocommit=True)
+
+            con.execute(query)
+
+    def drop_database(self):
+        """ Remove all tables """
+
+        with self.engine.connect() as con:
+            query = sql.text(
+                "DROP SCHEMA public CASCADE; CREATE SCHEMA public;"
+            ).execution_options(autocommit=True)
+
+            con.execute(query)
+
+    def read_table_set(self, table, col):
+        """ Read a table column and obtain all its unique values """
+
+        df = pd.read_sql_table(table, self.engine, columns=[col])
+        values = set(df[col].tolist())
+        return values
+
+    def importPapers(self, dir_data, ncpu, chunksize=100000):
+        """
+        Import data from the Semantic Scholar compressed data files
+        available at the indicated location.
+        Paper data, venues, journals and fields will be imported.
+        """
+
+        print("Filling in table S2papers")
+
+        # Get data files
+        gz_files = get_gzfiles(dir_data)
+
+        # Read tables to avoid repeated values
+        papers_set = self.read_table_set("S2papers", "S2paperID")
+        venues_set = self.read_table_set("S2venues", "venueName")
+        journs_set = self.read_table_set("S2journals", "journalName")
+        fields_set = self.read_table_set("S2fields", "fieldName")
+
+        # Aux function that will insert data into each table
+        # Defined here because it's the same whether multiple CPUs are used or not
+        def populate(file_data):
+            """ Aux function to insert data into database """
+            (
+                thisfile_papers,
+                thisfile_venues,
+                thisfile_journals,
+                thisfile_fields,
+            ) = file_data
+
+            # S2papers: keep only the papers whose ID is not yet in the database
+            columns = [
+                "S2paperID",
+                "title",
+                "lowertitle",
+                "paperAbstract",
+                "s2Url",
+                "pdfUrls",
+                "year",
+                "isDBLP",
+                "isMEDLINE",
+                "doi",
+                "doiUrl",
+                "pmid",
+                "magId",
+            ]
+            df = pd.DataFrame(thisfile_papers, columns=columns)
+            set_new_data = set(df["S2paperID"])
+            df = df[~df["S2paperID"].isin(papers_set)]
+            df.to_sql("S2papers", self.engine, if_exists="append", index=False)
+            papers_set.update(set_new_data)
+
+            # S2venues
+            columns = "venueName"
+            df = pd.DataFrame(thisfile_venues, columns=[columns])
+            set_new_data = set(df[columns])
+            df = df[~df[columns].isin(venues_set)]
+            df.to_sql("S2venues", self.engine, if_exists="append", index=False)
+            venues_set.update(set_new_data)
+
+            # S2journals
+            columns = "journalName"
+            df = pd.DataFrame(thisfile_journals, columns=[columns])
+            set_new_data = set(df[columns])
+            df = df[~df[columns].isin(journs_set)]
+            df.to_sql("S2journals", self.engine, if_exists="append", index=False)
+            journs_set.update(set_new_data)
+
+            # S2fields
+            columns = "fieldName"
+            df = pd.DataFrame(thisfile_fields, columns=[columns])
+            set_new_data = set(df[columns])
+            df = df[~df[columns].isin(fields_set)]
+            df.to_sql("S2fields", self.engine, if_exists="append", index=False)
+            fields_set.update(set_new_data)
+
+        if ncpu:
+            # Parallel processing
+            with tqdm(total=len(gz_files), leave=None) as pbar:
+                with Pool(ncpu) as p:
+                    for file_data in p.imap(process_paperFile, gz_files):
+                        # Populate tables with the new data
+                        populate(file_data)
+                        pbar.update()
+
+        else:
+            with tqdm(total=len(gz_files), leave=None) as pbar:
+                for gzf in gz_files:
+                    file_data = process_paperFile(gzf)
+                    # Populate tables with the new data
+                    populate(file_data)
+                    pbar.update()
+
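+    # Illustrative sketch of the dedup filter used in populate() (toy values):
+    #
+    #     papers_set = {"a", "b"}                       # IDs already in the table
+    #     df = pd.DataFrame({"S2paperID": ["b", "c"]})  # IDs found in this file
+    #     df[~df["S2paperID"].isin(papers_set)]         # -> keeps only "c"
+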
+    def importSources(self, ncpu, stype="references", chunksize=100000):
+        """ Imports reference/citation information through the Semantic Scholar API """
+
+        print("Obtaining S2paperIDs")
+
+        # Read tables to avoid repeated values
+        papers_set = self.read_table_set("S2papers", "S2paperID")
+
+        def chunks(l, n):
+            """Yields successive n-sized chunks from list l."""
+            for i in range(0, len(l), n):
+                yield l[i : i + n]
+
+        def populate(papers_references):
+            """ Aux function to insert data into database """
+
+            # Concat all references
+            df = pd.concat(papers_references)
+
+            # Remove papers not present in database
+            df = df[df["S2paperID2"].isin(papers_set)]
+
+            # Introduce new data
+            df.to_sql("citations", self.engine, if_exists="append", index=False)
+
+        ch_size = 100  # Number of papers to process at a time
+        remaining = len(papers_set)
+
+        with tqdm(total=np.ceil(len(papers_set) / ch_size), leave=None) as chunk_bar:
+            chunk_bar.set_description("Processing papers chunks")
+            for chk in chunks(list(papers_set), ch_size):
+                papers_references = []
+                if ncpu:
+                    # Parallel processing
+                    with tqdm(total=min(ch_size, remaining), leave=None) as pbar:
+                        with Pool(ncpu) as p:
+                            for df_references in p.imap(
+                                partial(get_sources, stype=stype), chk
+                            ):
+                                papers_references.append(df_references)
+                                pbar.update()
+                else:
+                    with tqdm(total=min(ch_size, remaining), leave=None) as pbar:
+                        for paper in chk:
+                            papers_references.append(get_sources(paper, stype=stype))
+                            pbar.update()
+
+                populate(papers_references)
+                remaining = remaining - ch_size
+                chunk_bar.update()
+
+    def importCitations(self, dir_data, ncpu, chunksize=100000):
+        """ Imports citation information from the local data files """
+
+        print("Obtaining S2paperIDs")
+
+        # Read tables to avoid repeated values
+        papers_set = self.read_table_set("S2papers", "S2paperID")
+
+        # Get data files
+        gz_files = get_gzfiles(dir_data)
+
+        def populate(cite_list):
+            """ Aux function to insert data into database """
+            # Ensure all papers exist in database
+            aux_list = [
+                (c0, c1)
+                for c0, c1 in cite_list
+                if c0 in papers_set and c1 in papers_set
+            ]
+
+            columns = ["S2paperID1", "S2paperID2"]
+            citations_df = pd.DataFrame(aux_list, columns=columns)
+
+            # Introduce new data
+            citations_df.to_sql(
+                "citations", self.engine, if_exists="append", index=False
+            )
+
+        if ncpu:
+            # Parallel processing
+            with tqdm(total=len(gz_files), leave=None) as pbar:
+                with Pool(ncpu) as p:
+                    for cite_list in p.imap(process_Citations, gz_files):
+                        populate(cite_list)
+                        pbar.update()
+
+        else:
+            with tqdm(total=len(gz_files), leave=None) as pbar:
+                for gzf in gz_files:
+                    cite_list = process_Citations(gzf)
+                    populate(cite_list)
+                    pbar.update()
+
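+    # Illustrative sketch (toy values) of the filter in populate() above:
+    # only pairs whose two endpoints already exist in S2papers are kept:
+    #
+    #     papers_set = {"a", "b"}
+    #     cite_list = [("a", "b"), ("a", "x")]
+    #     [(c0, c1) for c0, c1 in cite_list
+    #      if c0 in papers_set and c1 in papers_set]   # -> [("a", "b")]
+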
+    def importFields(self, dir_data, ncpu, chunksize=100000):
+        """ Imports the fields, journals and venues associated with each paper """
+
+        # We extract venues, journals and fields as name-to-ID dictionaries
+        print("Obtaining venues, journals and fields dictionaries")
+        venues_dict = pd.read_sql_table(
+            "S2venues", self.engine, columns=["venueName", "venueID"]
+        )
+        venues_dict = dict(venues_dict.values.tolist())
+
+        journals_dict = pd.read_sql_table(
+            "S2journals", self.engine, columns=["journalName", "journalID"]
+        )
+        journals_dict = dict(journals_dict.values.tolist())
+
+        fields_dict = pd.read_sql_table(
+            "S2fields", self.engine, columns=["fieldName", "fieldID"]
+        )
+        fields_dict = dict(fields_dict.values.tolist())
+
+        print("Obtaining S2paperIDs")
+        papers_set = self.read_table_set("S2papers", "S2paperID")
+
+        # Get data files
+        gz_files = get_gzfiles(dir_data)
+
+        def populate(all_data):
+            """ Aux function to insert data into database """
+            papers_fields, papers_journals, papers_venues = all_data
+
+            # Introduce new data
+            # FIELDS
+            df = pd.DataFrame(papers_fields)
+            df.to_sql("paperField", self.engine, if_exists="append", index=False)
+            # VENUES
+            df = pd.DataFrame(papers_venues)
+            df.to_sql("paperVenue", self.engine, if_exists="append", index=False)
+            # JOURNALS
+            df = pd.DataFrame(papers_journals)
+            df.to_sql("paperJournal", self.engine, if_exists="append", index=False)
+
+        print("Filling in venue, journal and field of study data...")
+        if ncpu:
+            # Parallel processing
+            with tqdm(total=len(gz_files), leave=None) as pbar:
+                with Pool(ncpu) as p:
+                    for all_data in p.imap(
+                        partial(
+                            process_Fields,
+                            venues_dict=venues_dict,
+                            journals_dict=journals_dict,
+                            fields_dict=fields_dict,
+                            papers_set=papers_set,
+                        ),
+                        gz_files,
+                    ):
+                        populate(all_data)
+                        pbar.update()
+        else:
+            with tqdm(total=len(gz_files), leave=None) as pbar:
+                for gzf in gz_files:
+                    all_data = process_Fields(
+                        gzf, venues_dict, journals_dict, fields_dict, papers_set
+                    )
+                    populate(all_data)
+                    pbar.update()
+
+    def importAuthorsData(self, dir_data, ncpu):
+        """ Imports authors' information """
+
+        print("Filling authors information")
+        # Get data files
+        gz_files = get_gzfiles(dir_data)
+
+        # Read tables to avoid repeated values
+        authors_set = self.read_table_set("S2authors", "S2authorID")
+
+        def chunks(l, n):
+            """Yields successive n-sized chunks from list l."""
+            for i in range(0, len(l), n):
+                yield l[i : i + n]
+
+        columns = ["S2authorID", "name"]
+        ch_size = 100  # Number of files to process at a time
+        remaining = len(gz_files)
+
+        with tqdm(total=np.ceil(len(gz_files) / ch_size), leave=None) as chunk_bar:
+            chunk_bar.set_description("Processing file chunks")
+            for gz_chunk in chunks(gz_files, ch_size):
+                author_counts = []
+                if ncpu:
+                    # Parallel processing
+                    with tqdm(total=min(ch_size, remaining), leave=None) as pbar:
+                        with Pool(ncpu) as p:
+                            for thisfile_authors in p.imap(process_Authors, gz_chunk):
+                                author_counts += thisfile_authors
+                                pbar.update()
+                else:
+                    with tqdm(total=min(ch_size, remaining), leave=None) as pbar:
+                        for gzf in gz_chunk:
+                            author_counts += process_Authors(gzf)
+                            pbar.update()
+                remaining = remaining - ch_size
+
+                # We need to get rid of duplicated IDs
+                # If an ID is repeated, keep the longest name
+                author_counts = set(author_counts)
+                author_counts = [
+                    author for author in author_counts if author[0] not in authors_set
+                ]
+                df = pd.DataFrame(author_counts, columns=columns)
+                df["length"] = df["name"].str.len()
+                df.sort_values("length", ascending=False, inplace=True)
+                df.drop_duplicates(subset="S2authorID", inplace=True)
+
+                if len(df):
+                    # Populate tables with the new data
+                    df[columns].to_sql(
+                        "S2authors", self.engine, if_exists="append", index=False
+                    )
+                    authors_set.update(df["S2authorID"].values)
+
+                chunk_bar.update()
+
+    def importAuthors(self, dir_data, ncpu):
+        """ Imports authorship information (paper-author data) """
+
+        print("Processing paper-authors information")
+
+        # Get data files
+        gz_files = get_gzfiles(dir_data)
+
+        # Get all papers and authors IDs present in database
+        print("Obtaining all papers and authors IDs")
+        papers_set = self.read_table_set("S2papers", "S2paperID")
+        authors_set = self.read_table_set("S2authors", "S2authorID")
+
+        def populate(lista_paper_author):
+            """ Aux function to insert data into database """
+
+            # Ensure all papers and authors exist in database
+            aux_list = [
+                (c0, c1)
+                for c0, c1 in lista_paper_author
+                if c0 in papers_set and c1 in authors_set
+            ]
+
+            columns = ["S2paperID", "S2authorID"]
+            df = pd.DataFrame(aux_list, columns=columns)
+
+            # Introduce new data
+            df.to_sql("paperAuthor", self.engine, if_exists="append", index=False)
+            authors_set.update(df["S2authorID"].values)
+
+        if ncpu:
+            # Parallel processing
+            with tqdm(total=len(gz_files), leave=None) as pbar:
+                with Pool(ncpu) as p:
+                    for lista_paper_author in p.imap(process_Authorship, gz_files):
+                        # Populate tables with the new data
+                        populate(lista_paper_author)
+                        pbar.update()
+
+        else:
+            with tqdm(total=len(gz_files), leave=None) as pbar:
+                for gzf in gz_files:
+                    lista_paper_author = process_Authorship(gzf)
+                    # Populate tables with the new data
+                    populate(lista_paper_author)
+                    pbar.update()
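For reference, a minimal end-to-end sketch of the class above (credentials are the defaults from config.cf; it assumes the s2-corpus-* files have already been downloaded to dir_data):

    from pathlib import Path
    from dbManager.S2manager import S2manager

    DB = S2manager("postgres", "root", "localhost", 5432, "prueba_work")
    DB.create_database("dbManager/create_sql.sql")  # build the empty schema
    DB.importPapers(Path("../SemanticScholar/"), ncpu=4)       # papers, venues, journals, fields
    DB.importAuthorsData(Path("../SemanticScholar/"), ncpu=4)  # authors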
"default", --[str] +-- "lowertitle" TEXT COLLATE "default", +-- "paperAbstract" TEXT COLLATE "default", --[str] +-- -- "entities" TEXT COLLATE "default", -- DEPRECATED --[list] +-- "s2Url" TEXT COLLATE "default", --[str] +-- "pdfUrls" TEXT COLLATE "default", --[list] +-- -- "s2PdfUrl" TEXT COLLATE "default", -- DEPRECATED --[str] +-- -- "authors" , -- DIFFERENT TABLE --[list] +-- -- "inCitations" , -- DIFFERENT TABLE --[list] +-- -- "outCitations" , -- DIFFERENT TABLE --[list] +-- -- "fieldsOfStudy" , -- DIFFERENT TABLE --[list] +-- "year" SMALLINT, --[int] +-- -- "venue" , -- DIFFERENT TABLE --[str] +-- -- "journalName" , -- DIFFERENT TABLE --[str] +-- -- "journalVolume" , -- DIFFERENT TABLE --[str] +-- -- "journalPages" , -- DIFFERENT TABLE --[str] +-- "isDBLP" TEXT COLLATE "default", -- ("sources" field) --[list] +-- "doi" VARCHAR(128) COLLATE "default", --[str] +-- "doiUrl" VARCHAR(256) COLLATE "default", --[str] +-- "pmid" VARCHAR(16) COLLATE "default", --[str] +-- "magId" BIGINT --[str] +-- ); +-- CREATE TABLE "S2papers"( +-- "paperID" SERIAL, +-- "S2paperID" CHAR(40) PRIMARY KEY, +-- "title" TEXT, +-- "lowertitle" TEXT, +-- "paperAbstract" TEXT, +-- "entities" TEXT, +-- "fieldsOfStudy" TEXT, +-- "s2PdfUrl" VARCHAR(77), +-- "pdfUrls" TEXT, +-- "year" SMALLINT, +-- "journalVolume" VARCHAR(300), +-- "journalPages" VARCHAR(100), +-- "isDBLP" BOOLEAN, +-- "isMedline" BOOLEAN, +-- "doi" VARCHAR(128), +-- "doiUrl" VARCHAR(256), +-- "pmid" VARCHAR(16), +-- "magid" BIGINT +-- -- "ESP_contri" BOOLEAN, +-- -- "AIselection" BOOLEAN, +-- -- "langid" VARCHAR(3) +-- ) ; +CREATE TABLE "S2papers"( + "paperID" SERIAL, + "S2paperID" CHAR(40) PRIMARY KEY, --[str] + "title" TEXT COLLATE "default", --[str] + "lowertitle" TEXT COLLATE "default", + "paperAbstract" TEXT COLLATE "default", --[str] + "s2Url" TEXT COLLATE "default", --[str] + "pdfUrls" TEXT COLLATE "default", --[list] + "year" SMALLINT, --[int] + "isDBLP" BOOLEAN, -- ("sources" field) --[list] + "isMEDLINE" BOOLEAN, -- ("sources" field) --[list] + "doi" VARCHAR(128) COLLATE "default", --[str] + "doiUrl" VARCHAR(256) COLLATE "default", --[str] + "pmid" INT, --[str] + "magId" BIGINT --[str] +); + +CREATE TABLE "paperLemas"( + "S2paperID" CHAR(40) COLLATE "default", + "lemas" TEXT COLLATE "default", + FOREIGN KEY ("S2paperID") REFERENCES "S2papers" ("S2paperID") ON DELETE CASCADE +); + +CREATE TABLE "S2authors"( + "authorID" SERIAL, + "S2authorID" INT PRIMARY KEY, + "orcidID" VARCHAR(20) COLLATE "default", + "orcidGivenName" VARCHAR(40) COLLATE "default", + "orcidFamilyName" VARCHAR(100) COLLATE "default", + "scopusID" BIGINT, + "name" VARCHAR(256) COLLATE "default", + "influentialCitationCount" SMALLINT, + "ESP_affiliation" BOOLEAN +); + +CREATE TABLE "paperAuthor"( + "paperAuthorID" SERIAL PRIMARY KEY, + "S2paperID" CHAR(40) COLLATE "default", + "S2authorID" INT, + FOREIGN KEY ("S2paperID") REFERENCES "S2papers" ("S2paperID") ON DELETE CASCADE, + FOREIGN KEY ("S2authorID") REFERENCES "S2authors" ("S2authorID") ON DELETE CASCADE +); + +CREATE TABLE "S2fields"( + "fieldID" SERIAL PRIMARY KEY, + "fieldName" VARCHAR(32) COLLATE "default" +); + +CREATE TABLE "paperField"( + "paperFieldID" SERIAL PRIMARY KEY, + "S2paperID" CHAR(40) COLLATE "default", + "fieldID" INT, + FOREIGN KEY ("S2paperID") REFERENCES "S2papers" ("S2paperID") ON DELETE CASCADE, + FOREIGN KEY ("fieldID") REFERENCES "S2fields" ("fieldID") ON DELETE CASCADE +); + +CREATE TABLE "S2venues"( + "venueID" SERIAL PRIMARY KEY, + "venueName" VARCHAR(320) COLLATE "default" +); + +CREATE 
TABLE "paperVenue"( + "paperVenueID" SERIAL PRIMARY KEY, + "S2paperID" CHAR(40) COLLATE "default", + "venueID" INT, + FOREIGN KEY ("S2paperID") REFERENCES "S2papers" ("S2paperID") ON DELETE CASCADE, + FOREIGN KEY ("venueID") REFERENCES "S2venues" ("venueID") ON DELETE CASCADE +); + +CREATE TABLE "S2journals"( + "journalID" SERIAL PRIMARY KEY, + "journalName" VARCHAR(320) COLLATE "default" +); + +CREATE TABLE "paperJournal"( + "paperJournalID" SERIAL PRIMARY KEY, + "S2paperID" CHAR(40) COLLATE "default", + "journalID" INT, + "journalVolume" VARCHAR(300) COLLATE "default", + "journalPages" VARCHAR(100) COLLATE "default", + FOREIGN KEY ("S2paperID") REFERENCES "S2papers" ("S2paperID") ON DELETE CASCADE, + FOREIGN KEY ("journalID") REFERENCES "S2journals" ("journalID") ON DELETE CASCADE +); + +CREATE TABLE "citations"( + "citationID" SERIAL PRIMARY KEY, + "S2paperID1" CHAR(40) COLLATE "default", + "S2paperID2" CHAR(40) COLLATE "default", + "isInfluential" BOOLEAN, + "BackgrIntent" BOOLEAN, + "MethodIntent" BOOLEAN, + "ResultIntent" BOOLEAN, + FOREIGN KEY ("S2paperID1") REFERENCES "S2papers" ("S2paperID") ON DELETE CASCADE, + FOREIGN KEY ("S2paperID2") REFERENCES "S2papers" ("S2paperID") ON DELETE CASCADE +); + +CREATE INDEX S2idPaper ON "S2papers" ("S2paperID"); +CREATE INDEX S2idAuth ON "S2authors" ("S2authorID"); +CREATE INDEX paper1 ON "citations" ("S2paperID1"); +CREATE INDEX paper2 ON "citations" ("S2paperID2"); +CREATE INDEX paper ON "paperAuthor" ("S2paperID"); +CREATE INDEX author ON "paperAuthor" ("S2authorID"); \ No newline at end of file diff --git a/importSS.py b/importSS.py new file mode 100644 index 0000000..2aed5a7 --- /dev/null +++ b/importSS.py @@ -0,0 +1,125 @@ +#################################################### +# Imports + +import argparse +import time +from configparser import ConfigParser +from pathlib import Path + +from dbManager.S2manager import S2manager + + +def main(): + """ + """ + + #################################################### + # Read connection parameters + + cf = ConfigParser() + cf.read("config.cf") + + dbuser = cf.get("database", "dbuser") + dbpass = cf.get("database", "dbpass") + dbhost = cf.get("database", "dbhost") + dbport = cf.get("database", "dbport") + dbname = cf.get("database", "dbname") + dbncpu = int(cf.get("data", "ncpu")) + dbchunksize = int(cf.get("data", "chunksize")) + + ######################### + # Datafiles + + dir_data = Path(cf.get("data", "dir_data")) + + #################################################### + # Database connection + + DB = S2manager( + dbname=dbname, dbhost=dbhost, dbuser=dbuser, dbpass=dbpass, dbport=dbport + ) + + #################################################### + # 1. If activated, display console + # print(interface) + # if interface: + while True: + print("\nSelect option:") + print("1. Reset database") + print("2. Import papers from data files") + print("3. Import authors from data files") + print("4. Import citations from data files") + print("5. Import fields, journals and volumes of study data from data files") + print("6. Import authorship from data files") + + print("0. Quit") + selection = input() + + if selection == "1": + print("Previous info will be deleted. Continue?\n[y]/[n]") + selection = input() + if selection == "y": + print("Regenerating the database. Existing data will be removed.") + # The following method deletes all existing tables, and create them + # again without data. 
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..da28abf
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+SQLAlchemy==1.4.7
+tqdm==4.61.2
+requests==2.25.1
+numpy==1.20.1
+pandas==1.2.4
+# A PostgreSQL DBAPI driver is also needed for SQLAlchemy's postgresql://
+# URLs; psycopg2 is SQLAlchemy's default choice
+psycopg2-binary