# Description


First EDA for catbase database. This phase is run before Data_Processing.

-   Analyze tables
-   Remove uninformative tables


# Start


In [4]:
import json

import pandas as pd
import psycopg2
from sqlalchemy import create_engine, inspect

## Definitons


In [5]:
def create_engine_from_config():
    with open("../../config/config.json", "r") as file:
        config = json.load(file)

    rel_host = config["relational_database"]["host"]
    rel_port = config["relational_database"]["port"]
    rel_db_name = config["relational_database"]["dbname"]
    rel_user = config["relational_database"]["user"]
    rel_password = config["relational_database"]["password"]

    return create_engine(f"postgresql+psycopg2://{rel_user}:{rel_password}@{rel_host}:{rel_port}/{rel_db_name}")

In [6]:
engine = create_engine_from_config()

# EDA - First look at tables


## First look at tables


In [7]:
def get_all_tables(engine):
    inspector = inspect(engine)
    tables = pd.DataFrame(inspector.get_table_names(), columns=["table_name"])
    return tables


tables = get_all_tables(engine)
tables

Unnamed: 0,table_name
0,adonis_schema
1,adonis_schema_versions
2,cats
3,cat_informations
4,cat_references
5,breeds
6,links
7,user_breeds
8,cat_histories
9,roles


We can immediately remove non-informative tables:

-   **adonis_schema**
-   **adonis_schema_versions**
-   **api_tokens**


In [8]:
def remove_non_informative_tables(tables):
    non_informative_tables = ["adonis_schema", "adonis_schema_versions", "api_tokens"]
    return tables[~tables["table_name"].isin(non_informative_tables)].reset_index(drop=True)


tables = remove_non_informative_tables(tables)
tables

Unnamed: 0,table_name
0,cats
1,cat_informations
2,cat_references
3,breeds
4,links
5,user_breeds
6,cat_histories
7,roles
8,users


## Removing other non-informative tables


In [9]:
cats_df = pd.read_sql_query("SELECT * FROM cats LIMIT 10000", engine)
cat_informations_df = pd.read_sql_query("SELECT * FROM cat_informations LIMIT 10000", engine)
cat_references_df = pd.read_sql_query("SELECT * FROM cat_references LIMIT 10000", engine)
breeds_df = pd.read_sql_query("SELECT * FROM breeds LIMIT 10000", engine)
links_df = pd.read_sql_query("SELECT * FROM links LIMIT 10000", engine)
user_breeds_df = pd.read_sql_query("SELECT * FROM user_breeds LIMIT 10000", engine)
cat_histories_df = pd.read_sql_query("SELECT * FROM cat_histories LIMIT 10000", engine)
roles_df = pd.read_sql_query("SELECT * FROM roles LIMIT 10000", engine)
users_df = pd.read_sql_query("SELECT * FROM users LIMIT 10000", engine)

In [10]:
cats_df.head(5)

Unnamed: 0,id,name,country_origin,country_current,color,color_code,date_of_birth,gender,reg_num_origin,reg_num_current,src_db,src_id,breed_id,created_at,updated_at,deleted_at
0,1,Aiva Sama,FI,FI,ruskeatabby/valkea,n 09 22,2010-11-14,F,,FI SRK LO 85933,Finland,1,12.0,2024-01-27 08:54:55.844265+00:00,2024-01-27 08:54:55.844265+00:00,
1,2,Legsby,FI,FI,ruskeanaamio,n,2010-11-26,M,,FI SRK LO 38165,Finland,2,45.0,2024-01-27 08:54:55.844265+00:00,2024-01-27 08:54:55.844265+00:00,
2,3,Ultimaatumi,FI,FI,suklaanaamio,b,2010-11-20,M,,FI SRK LO 38166,Finland,3,45.0,2024-01-27 08:54:55.844265+00:00,2024-01-27 08:54:55.844265+00:00,
3,4,Ukaasi,FI,FI,ruskeanaamio,n,2010-11-20,M,,FI SRK LO 38167,Finland,4,45.0,2024-01-27 08:54:55.844265+00:00,2024-01-27 08:54:55.844265+00:00,
4,5,Ukulele,FI,FI,ruskeanaamio,n,2010-11-20,M,,FI SRK LO 38168,Finland,5,45.0,2024-01-27 08:54:55.844265+00:00,2024-01-27 08:54:55.844265+00:00,


In [11]:
cat_informations_df.head(5)

Unnamed: 0,id,title_before,title_after,chip,verified_status,cattery,cat_id
0,1,,,,,FI*Hömpötin,1
1,2,,,,,FI*Twist-Tiramisun,2
2,3,,,985154000110482.0,,FI*Kelmikerhon,3
3,4,,,,,FI*Kelmikerhon,4
4,5,,,981098102723783.0,,FI*Kelmikerhon,5


In [12]:
cat_references_df.head(5)

Unnamed: 0,id,cat_id,father_id,mother_id,father_name,mother_name,father_reg_number,mother_reg_number
0,1,1,23276.0,32482.0,Torero,Britney,FIN SRK LO 82451,FIN SRK LO 82200
1,2,2,39075.0,31819.0,Kauko Kiito,Ilosofi,FI SRK LO 36077,FIN SRK LO 34506
2,3,3,39075.0,39061.0,Kauko Kiito,Unikonsiemen,FI SRK LO 36077,FI SRK LO 36119
3,4,4,39075.0,39061.0,Kauko Kiito,Unikonsiemen,FI SRK LO 36077,FI SRK LO 36119
4,5,5,39075.0,39061.0,Kauko Kiito,Unikonsiemen,FI SRK LO 36077,FI SRK LO 36119


In [13]:
breeds_df.head(5)

Unnamed: 0,id,code
0,1,EXO
1,2,PER
2,3,RAG
3,4,SBI
4,5,TUV


In [14]:
links_df.head(5)

Unnamed: 0,id,content,type,cat_id,created_at,updated_at
0,1,Selbstbewusst,NOTE,297070,2024-01-27 08:55:29.582031+00:00,2024-01-27 08:55:29.582031+00:00
1,2,Internationale Katzenausstellung Bassum 02.07....,AWARD,297070,2024-01-27 08:55:29.582031+00:00,2024-01-27 08:55:29.582031+00:00
2,3,Superschmusiger tollpatschiger Clown,NOTE,297071,2024-01-27 08:55:29.582031+00:00,2024-01-27 08:55:29.582031+00:00
3,4,Internationale Katzenausstellung Tostedt 02.03...,AWARD,297071,2024-01-27 08:55:29.582031+00:00,2024-01-27 08:55:29.582031+00:00
4,5,verspielte Kuschel(Frauchen)katze,NOTE,297072,2024-01-27 08:55:29.582031+00:00,2024-01-27 08:55:29.582031+00:00


In [15]:
user_breeds_df.head(5)

Unnamed: 0,user_id,breed_id


In [16]:
cat_histories_df.head(5)

Unnamed: 0,id,current,updated,cat_id,created_at


In [17]:
roles_df.head(5)

Unnamed: 0,id,name
0,1,USER
1,2,ADMIN
2,3,SUPERADMIN


In [18]:
users_df.head(5)

Unnamed: 0,id,fullname,email,password,remember_me_token,role_id,created_at,updated_at,verified,email_date,code
0,1,Monika Kováčová,monika.kovacova@stuba.sk,"$argon2id$v=19$t=3,m=4096,p=1$DHyBy+9NLsK/HiKJ...",,3,2024-01-27 08:53:58.693000+00:00,2024-01-27 08:53:58.693000+00:00,True,,


We can next remove other non-informative tables:

-   **user_breeds** (no data)
-   **cat_histories** (no data)
-   **roles** (not needed for our analysis)
-   **users** (not needed for our analysis)


In [19]:
def remove_non_informative_tables(tables):
    non_informative_tables = ["user_breeds", "cat_histories", "roles", "users", "links"]
    return tables[~tables["table_name"].isin(non_informative_tables)].reset_index(drop=True)


tables = remove_non_informative_tables(tables)
tables

Unnamed: 0,table_name
0,cats
1,cat_informations
2,cat_references
3,breeds


## Further Analysis of remaining tables


**cats**

-   Holds general information about cats

**cat_informations**

-   Hold extra information about cats

**cat_references**

-   Hold references to other cats (e.g. parents)

**breeds**

-   List of breeds
