# Description


First exploratory data analysis (EDA) pass over the catbase database. This phase runs before Data_Processing.

-   Analyze tables
-   Remove uninformative tables


# Start


In [None]:
import json

import pandas as pd
import psycopg2
from sqlalchemy import create_engine, inspect

## Definitions


In [None]:
def create_engine_from_config():
    """Create a SQLAlchemy engine for the PostgreSQL database named in the config file.

    Reads ``../../util/config.json`` (resolved against the current working
    directory) and uses the ``host``, ``port``, ``dbname``, ``user`` and
    ``password`` entries of its ``relational_database`` section.

    The user name and password are percent-encoded before being placed in the
    URL, so they must be stored in the config file in raw (not pre-encoded) form.

    Returns:
        The object returned by ``create_engine`` for a
        ``postgresql+psycopg2`` URL.

    Raises:
        FileNotFoundError: If the config file does not exist.
        json.JSONDecodeError: If the config file is not valid JSON.
        KeyError: If the ``relational_database`` section or one of its keys is missing.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote

    with open("../../util/config.json", "r", encoding="utf-8") as file:
        config = json.load(file)

    db_config = config["relational_database"]
    rel_host = db_config["host"]
    rel_port = db_config["port"]
    rel_db_name = db_config["dbname"]

    # Credentials may contain URL delimiters ("@", ":", "/", "%"). Inserted raw,
    # they would shift where the URL parser splits user, password and host.
    # safe="" makes quote() escape "/" as well.
    rel_user = quote(str(db_config["user"]), safe="")
    rel_password = quote(str(db_config["password"]), safe="")

    return create_engine(f"postgresql+psycopg2://{rel_user}:{rel_password}@{rel_host}:{rel_port}/{rel_db_name}")

In [None]:
# Build the engine once; later cells pass it to inspect() and pd.read_sql_query().
engine = create_engine_from_config()

# EDA - First look at tables


## First look at tables


In [None]:
def get_all_tables(engine):
    """Return the table names reachable through ``engine`` as a DataFrame.

    Args:
        engine: Object handed to ``inspect``; the resulting inspector must
            provide ``get_table_names()``.

    Returns:
        A DataFrame with a single ``table_name`` column, one row per name
        returned by ``get_table_names()``.
    """
    table_names = inspect(engine).get_table_names()
    return pd.DataFrame(table_names, columns=["table_name"])


# Collect every table name into a DataFrame; the bare name on the last line
# makes the notebook render it as the cell output.
tables = get_all_tables(engine)
tables

We can immediately remove non-informative tables:

-   **adonis_schema**
-   **adonis_schema_versions**
-   **api_tokens**


In [None]:
def remove_non_informative_tables(tables):
    """Drop the framework/bookkeeping tables from the table listing.

    Args:
        tables: DataFrame with a ``table_name`` column.

    Returns:
        A new DataFrame without the rows whose ``table_name`` is
        ``adonis_schema``, ``adonis_schema_versions`` or ``api_tokens``,
        re-indexed from 0. The input is not modified.
    """
    excluded = ["adonis_schema", "adonis_schema_versions", "api_tokens"]
    keep_mask = ~tables["table_name"].isin(excluded)
    kept = tables[keep_mask]
    return kept.reset_index(drop=True)


# Rebind `tables` to the filtered listing and show it as the cell output.
tables = remove_non_informative_tables(tables)
tables

## Removing other non-informative tables


In [None]:
# Load a sample of at most 10,000 rows from each remaining table, one
# DataFrame per table, so the following cells can preview their contents.
cats_df = pd.read_sql_query(sql="SELECT * FROM cats LIMIT 10000", con=engine)
cat_informations_df = pd.read_sql_query(sql="SELECT * FROM cat_informations LIMIT 10000", con=engine)
cat_references_df = pd.read_sql_query(sql="SELECT * FROM cat_references LIMIT 10000", con=engine)
breeds_df = pd.read_sql_query(sql="SELECT * FROM breeds LIMIT 10000", con=engine)
links_df = pd.read_sql_query(sql="SELECT * FROM links LIMIT 10000", con=engine)
user_breeds_df = pd.read_sql_query(sql="SELECT * FROM user_breeds LIMIT 10000", con=engine)
cat_histories_df = pd.read_sql_query(sql="SELECT * FROM cat_histories LIMIT 10000", con=engine)
roles_df = pd.read_sql_query(sql="SELECT * FROM roles LIMIT 10000", con=engine)
users_df = pd.read_sql_query(sql="SELECT * FROM users LIMIT 10000", con=engine)

In [None]:
# Preview the first five rows of the cats sample.
cats_df.head(5)

In [None]:
# Preview the first five rows of the cat_informations sample.
cat_informations_df.head(5)

In [None]:
# Preview the first five rows of the cat_references sample.
cat_references_df.head(5)

In [None]:
# Preview the first five rows of the breeds sample.
breeds_df.head(5)

In [None]:
# Preview the first five rows of the links sample.
links_df.head(5)

In [None]:
# Preview the first five rows of the user_breeds sample.
user_breeds_df.head(5)

In [None]:
# Preview the first five rows of the cat_histories sample.
cat_histories_df.head(5)

In [None]:
# Preview the first five rows of the roles sample.
roles_df.head(5)

In [None]:
# Preview the first five rows of the users sample.
users_df.head(5)

We can next remove other non-informative tables:

-   **user_breeds** (no data)
-   **cat_histories** (no data)
-   **roles** (not needed for our analysis)
-   **users** (not needed for our analysis)
-   **links** (also dropped by the cell below; the reason is not recorded here)


In [None]:
def remove_non_informative_tables(tables):
    """Drop the second batch of tables excluded from the analysis.

    NOTE: this reuses the name of the earlier filter function in this
    notebook, so after this cell runs the name refers to this version.

    Args:
        tables: DataFrame with a ``table_name`` column.

    Returns:
        A new DataFrame without the rows whose ``table_name`` is
        ``user_breeds``, ``cat_histories``, ``roles``, ``users`` or ``links``,
        re-indexed from 0. The input is not modified.
    """
    dropped = ["user_breeds", "cat_histories", "roles", "users", "links"]
    is_dropped = tables["table_name"].isin(dropped)
    remaining = tables[~is_dropped]
    return remaining.reset_index(drop=True)


# Apply the second filter to the listing and show the tables that remain.
tables = remove_non_informative_tables(tables)
tables

## Further Analysis of remaining tables


**cats**

-   Holds general information about cats

**cat_informations**

-   Holds extra information about cats

**cat_references**

-   Holds references to other cats (e.g. parents)

**breeds**

-   List of breeds
