## Data Staging process for Phase 2 of CSI4142 Project

### 1. Extraction

In [43]:
# pip install and imports here
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [44]:
# Extraction step: load the base dataset from its CSV file into a pandas
# DataFrame and print it so the loaded rows and columns can be eyeballed.
base_pokemon_df = pd.read_csv(filepath_or_buffer="data/base_df_pokemon.csv")
print(base_pokemon_df)

        id          name       rank     generation evolves_from     type1  \
0        1     bulbasaur   ordinary   generation-i      nothing     grass   
1        2       ivysaur   ordinary   generation-i    bulbasaur     grass   
2        3      venusaur   ordinary   generation-i      ivysaur     grass   
3        4    charmander   ordinary   generation-i      nothing      fire   
4        5    charmeleon   ordinary   generation-i   charmander      fire   
...    ...           ...        ...            ...          ...       ...   
1020  1021   raging-bolt   ordinary  generation-ix      nothing  electric   
1021  1022  iron-boulder   ordinary  generation-ix      nothing      rock   
1022  1023    iron-crown   ordinary  generation-ix      nothing     steel   
1023  1024     terapagos  legendary  generation-ix      nothing    normal   
1024  1025     pecharunt   mythical  generation-ix      nothing    poison   

        type2   hp  atk  def  spatk  spdef  speed  total  height  weight  \

In [None]:
# Report the dtype pandas inferred for each column of the base dataset,
# framed by a heading line and a reminder of how string columns are reported.
print(
    "Now we verify the types pandas assigned to our columns",
    base_pokemon_df.dtypes,
    "dtype = object signifies a string",
    sep="\n",
)

### 2. Transformation

In [None]:
#pip install and imports here

In [46]:
# Add the two surrogate-key columns used by the dimensional model.
# Both keys end with the generation value stripped of its "generation" prefix
# (e.g. "generation-i" becomes "-i"), giving keys such as "1-i" / "ordinary-i".
print("Creating DexEntry Key column...")
generation_suffix = base_pokemon_df["generation"].str.replace("generation", "")
base_pokemon_df["dexEntry key"] = base_pokemon_df["id"].astype(str) + generation_suffix

print("Creating ContextInfo key column...")
base_pokemon_df["contextInfo key"] = base_pokemon_df["rank"] + generation_suffix

Creating DexEntry Key column...
Creating ContextInfo key column...


We create dataframes that match our model's dimensions

In [None]:
# Build the Pokedex Entry dimension: select its columns from the base dataset,
# rename them to the model's names, and convert height/weight to cm/kg.
print("Pokedex Entry dimension\n")

# Columns of the base dataset that belong to this dimension.
pkd_entry_dim_cols = ["dexEntry key","id","name","evolves_from","type1","type2","height","weight","abilities"]

print("Renaming the columns to match our model...")
pkd_entry_cols_names = {"id": "pokedex Id", "height":"height(cm)", "weight" : "weight(kg)"}
# errors="raise" makes a misspelled source column fail loudly instead of being ignored.
# rename() returns a new frame, so the conversions below never touch base_pokemon_df.
pkd_entry_df = base_pokemon_df[pkd_entry_dim_cols].rename(columns=pkd_entry_cols_names, errors="raise")

# The factors used here (x10 -> cm, /10 -> kg) correspond to source units of
# decimeters and hectograms, so the progress message names those units.
print("Converting height and weight from decimeters and hectograms to centimeters and kilograms...")
pkd_entry_df["height(cm)"] = pkd_entry_df["height(cm)"] * 10
# Divide by 10 rather than multiply by 0.1: 0.1 is not exactly representable in
# binary floating point, so `* 0.1` yields artifacts such as 0.30000000000000004.
pkd_entry_df["weight(kg)"] = pkd_entry_df["weight(kg)"] / 10

print("\n")
print(pkd_entry_df)

In [None]:
# Build the ContextInfo dimension: one row per distinct (rank, generation) key.
print("ContextInfo dimension\n")
cinfo_dim_cols = ["contextInfo key","rank","generation"]

# Each step returns a new DataFrame instead of using inplace=True: mutating a
# column selection of base_pokemon_df in place triggers SettingWithCopyWarning
# and makes the cell depend on pandas' view-vs-copy behaviour.
print("Dropping duplicates...")
cinfo_df = base_pokemon_df[cinfo_dim_cols].drop_duplicates(subset=["contextInfo key"])

print("Reseting indexes after removing duplicates...")
# drop=True discards the old (now gapped) index rather than keeping it as a column.
cinfo_df = cinfo_df.reset_index(drop=True)
print(cinfo_df)

We create a dataframe for our fact table

In [47]:
# Build the fact table: the two dimension keys plus the stat measures,
# with the measure columns renamed to the names used by the model.
print("Fact table\n")

# Source column -> model column, in the order the measures appear in the table.
fact_cols_names = {
    "hp": "Health Points",
    "atk": "Attack",
    "def": "Defense",
    "spatk": "Special Attack",
    "spdef": "Special Defense",
    "speed": "Speed",
    "total": "Total",
}
# Keys first, then every measure column listed in the rename mapping.
fact_table_cols = ["dexEntry key", "contextInfo key", *fact_cols_names]
fact_table_df = base_pokemon_df.loc[:, fact_table_cols]

print("Renaming columns to match our model...")
fact_table_df = fact_table_df.rename(columns=fact_cols_names, errors="raise")

# Per-column count of missing values, as a quick sanity check on the facts.
print("Checking for null values...")
print(fact_table_df.isna().sum())

print("\n")
print(fact_table_df)

Fact table

Renaming columns to match our model...
Checking for null values...
dexEntry key       0
contextInfo key    0
Health Points      0
Attack             0
Defense            0
Special Attack     0
Special Defense    0
Speed              0
Total              0
dtype: int64


     dexEntry key contextInfo key  Health Points  Attack  Defense  \
0             1-i      ordinary-i             45      49       49   
1             2-i      ordinary-i             60      62       63   
2             3-i      ordinary-i             80      82       83   
3             4-i      ordinary-i             39      52       43   
4             5-i      ordinary-i             58      64       58   
...           ...             ...            ...     ...      ...   
1020      1021-ix     ordinary-ix            125      73       91   
1021      1022-ix     ordinary-ix             90     120       80   
1022      1023-ix     ordinary-ix             90      72      100   
1023      1024-ix    legend

In [None]:
# If we want to add data from the datasets in data\extended datasets we can do that here

### 3. Loading

In [7]:
#pip install and imports here
%pip install psycopg2-binary

import psycopg2
import configparser

Note: you may need to restart the kernel to use updated packages.




Retrieving configuration and connecting to the database

In [53]:
# Read the connection settings from settings.ini and open a psycopg2
# connection (`conn`) plus a cursor (`cursor`) used by the loading cells.
print("Retrieving configuration...")
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the settings are missing.
if not config.read('settings.ini'):
    raise FileNotFoundError("settings.ini was not found or could not be read")
db_config = config['DB CONFIGURATION']

print("Connecting to the database...")

try:
    # Pass the parameters as keyword arguments instead of concatenating a DSN
    # string: psycopg2 then quotes each value itself, so a password containing
    # spaces, quotes or backslashes no longer corrupts the connection string.
    conn = psycopg2.connect(
        dbname=db_config['DB_NAME'],
        host=db_config['HOST'],
        user=db_config['USER'],
        password=db_config['PASSWORD'],
        port=db_config['PORT'],
    )
    cursor = conn.cursor()
    print(conn)
    print(cursor)
    print("Connected to the database !")
except psycopg2.OperationalError as e:
    # Report the failure with as much diagnostic detail as the exception carries.
    # type(e) and e.__traceback__ are the same objects sys.exc_info() would return here.
    print("Failed to connect to the database")
    print("\npsycopg error:",e)
    print ("\npsycopg2 traceback:", e.__traceback__, "-- type:", type(e))
    print ("\nextensions.Diagnostics:", e.diag)


Retrieving configuration...
Connecting to the database...
<connection object at 0x000001FEE0843340; dsn: 'user=postgres password=xxx dbname=postgres host=localhost port=5432', closed: 0>
<cursor object at 0x000001FEE3B103C0; closed: 0>
Connected to the database !


SQL Queries to load our data

1. Verifying that our tables exist and creating them if they do not


In [54]:
# Create the two dimension tables and the fact table if they do not exist yet,
# then list the tables known to the server as a visual confirmation.
print("Checking for tables...")
try:
    print("Checking ContextInfo_Dimension")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS ContextInfo_Dimension (
            ContextInfoKey varchar(255),
            Rank varchar(255),
            Generation varchar(255),
            PRIMARY KEY (ContextInfoKey)
        );
        """
    )

    print("Checking PokedexEntry_Dimension")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS PokedexEntry_Dimension (
            DexEntryKey varchar(255),
            PokedexId INT,
            Name varchar(255),
            EvolvesFrom varchar(255),
            Type1 varchar(255),
            Type2 varchar(255),
            Height_cm FLOAT,
            Weight_kg FLOAT,
            Abilities varchar(255),
            PRIMARY KEY (DexEntryKey)
        );
        """
    )

    # The fact table references both dimensions, so it is created last.
    print("Checking Fact Table")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS Fact_Table (
            DexEntryKey varchar(255),
            ContextInfoKey varchar(255),
            HP INT,
            ATK INT,
            DEF INT,
            SPATK INT,
            SPDEF INT,
            SPEED INT,
            TOTAL INT,
            FOREIGN KEY (DexEntryKey) REFERENCES PokedexEntry_Dimension(DexEntryKey),
            FOREIGN KEY (ContextInfoKey) REFERENCES ContextInfo_Dimension(ContextInfoKey)
        );
        """
    )

    # psycopg2 opens a transaction implicitly on the first statement; without a
    # commit the CREATE TABLEs are discarded when the connection is closed.
    conn.commit()

    cursor.execute("SELECT * FROM pg_catalog.pg_tables;")
    print(cursor.fetchall())
except Exception as e:
    print("\nError:",e)
    # A failed statement leaves the transaction aborted and every later command
    # on this connection would be rejected; roll back so it stays usable.
    conn.rollback()

Checking for tables...
Checking ContextInfo_Dimension
Checking PokedexEntry_Dimension
Checking Fact Table
[('public', 'pokedexentry_dimension', 'postgres', None, True, False, True, False), ('public', 'fact_table', 'postgres', None, False, False, True, False), ('public', 'contextinfo_dimension', 'postgres', None, True, False, True, False), ('pg_catalog', 'pg_statistic', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg_type', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg_foreign_table', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg_authid', 'postgres', 'pg_global', True, False, False, False), ('pg_catalog', 'pg_statistic_ext_data', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg_user_mapping', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg_subscription', 'postgres', 'pg_global', True, False, False, False), ('pg_catalog', 'pg_attribute', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg

2. Pushing our datasets to the appropriate tables

3. Making aggregates if needed