# Setup PostgreSQL with Docker Desktop

1. Go to https://www.docker.com/ to download and install Docker Desktop  
"download docker desktop to install in windows"

2. In a terminal, pull the official PostgreSQL Docker image  
"docker pull postgres"

3. Start a Docker container for PostgreSQL with a username, password, and database name, publishing port 5432 (the default PostgreSQL port)  
"docker run --name postgres-container -e POSTGRES_USER=soravit -e POSTGRES_PASSWORD=wetprasit -e POSTGRES_DB=ninfinite -p 5432:5432 -d postgres"

4. Check if docker container is running  
"docker ps"

5. Open a shell inside the running postgres container  
"docker exec -it postgres-container bash"

6. Access PostgreSQL with username and database name  
"psql -h localhost -U soravit -d ninfinite"

7. List the databases inside PostgreSQL  
"\l"

8. Connect to the database  
"\c ninfinite"

9. SQL statement to create the table  
"CREATE TABLE user_info (  
    id SERIAL PRIMARY KEY,  
    age INTEGER,  
    gender CHAR(1),  
    location varchar(20)  
);"

10. Check that the table was created in the database  
"\dt"

11. Exit psql  
"\q"

12. Exit the container shell  
"exit"

# The following steps use PostgreSQL's COPY command to load the .csv file directly into the database
# The Python code further below performs the ETL process

13. Copy .csv file from local to postgres-container  
"docker cp "C:/Users/Lenovo/jupyter/MyOwnProject/data-engineer-test_ N-Infinite/ref/user_info.csv" postgres-container:/tmp/user_info.csv"

14. Access the Running Container  
"docker exec -it postgres-container bash"

15. Access PostgreSQL with username and database name  
"psql -h localhost -U soravit -d ninfinite"

16. Use PostgreSQL COPY to load the .csv file directly into the table  
"COPY user_info (id, age, gender, location)
FROM '/tmp/user_info.csv'
DELIMITER ','
CSV HEADER;"

In [62]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# Define database connection parameters
# NOTE(review): the credentials are hard-coded to match the container started
# in the setup steps; keep them out of shared or committed copies of this file.
db_url = "postgresql://soravit:wetprasit@localhost:5432/ninfinite"

# Create a SQLAlchemy engine
engine = create_engine(db_url)

# Read the CSV file into a DataFrame
csv_file_path = "ref/user_info.csv"
user_df = pd.read_csv(csv_file_path)
table_name = "user_info"

# user_id is treated as the primary key, so it must be non-null AND unique.
# Comparing nunique() with count() is not enough on its own: both skip NaN,
# so rows with a missing user_id would slip through. Check nulls explicitly.
user_ids = user_df["user_id"]
if user_ids.isna().any():
    print("Please cleansing user_id to have no missing value as it declare as primary key")
elif user_ids.duplicated().any():
    print("Please cleansing user_id to have no duplicate as it declare as primary key")
else:
    print("No duplicate key")

    # Data quality checks. Both are evaluated and reported independently so a
    # single run shows every problem instead of stopping at the first one.
    age_over_100 = user_df[user_df["age"] > 100]
    # isin() is False for NaN, so missing genders are flagged as invalid too.
    un_gender = user_df[~user_df["gender"].isin(["M", "F"])]

    if not age_over_100.empty:
        print("There are outlier of user age.")
        print(age_over_100)

    if not un_gender.empty:
        print("There are gender beside 'M' or 'F'.")
        print(un_gender)
    else:
        print("Gender is either 'M' or 'F'")

    # Load only when every check passed.
    if age_over_100.empty and un_gender.empty:
        # NOTE(review): if_exists="replace" drops any existing table with this
        # name and recreates it from the DataFrame, so a table created by hand
        # beforehand does not keep its own column definitions or constraints.
        # Confirm this is intended.
        user_df.to_sql(table_name, engine, if_exists="replace", index=False)
        print(f"Load data into PostgreSQL table '{table_name}' completed.")

No duplicate key
Gender is either 'M' or 'F'
Load data into PostgreSQL table 'user_info' completed.
