## Data Staging process for Phase 2 of CSI4142 Project

### 1. Extraction

In [1]:
# pip install and imports here
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Read base dataset in a pandas data frame and print
base_pokemon_df = pd.read_csv("data/base_df_pokemon.csv")
print(base_pokemon_df)

        id          name       rank     generation evolves_from     type1  \
0        1     bulbasaur   ordinary   generation-i      nothing     grass   
1        2       ivysaur   ordinary   generation-i    bulbasaur     grass   
2        3      venusaur   ordinary   generation-i      ivysaur     grass   
3        4    charmander   ordinary   generation-i      nothing      fire   
4        5    charmeleon   ordinary   generation-i   charmander      fire   
...    ...           ...        ...            ...          ...       ...   
1020  1021   raging-bolt   ordinary  generation-ix      nothing  electric   
1021  1022  iron-boulder   ordinary  generation-ix      nothing      rock   
1022  1023    iron-crown   ordinary  generation-ix      nothing     steel   
1023  1024     terapagos  legendary  generation-ix      nothing    normal   
1024  1025     pecharunt   mythical  generation-ix      nothing    poison   

        type2   hp  atk  def  spatk  spdef  speed  total  height  weight  \

In [3]:
print("Now we verify the types pandas assigned to our columns")
print(base_pokemon_df.dtypes)
print("dtype = object signifies a string")

Now we verify the types pandas assigned to our columns
id               int64
name            object
rank            object
generation      object
evolves_from    object
type1           object
type2           object
hp               int64
atk              int64
def              int64
spatk            int64
spdef            int64
speed            int64
total            int64
height           int64
weight           int64
abilities       object
desc            object
dtype: object
dtype = object signifies a string


### 2. Transformation

In [4]:
#pip install and imports here

In [5]:
#create needed columns for transformation
print("Creating DexEntry Key column...")
base_pokemon_df["DexEntryKey"] =  base_pokemon_df["id"].astype(str)+base_pokemon_df["generation"].str.replace("generation","")

print("Creating ContextInfo key column...")
base_pokemon_df["ContextInfoKey"] = base_pokemon_df["rank"]+base_pokemon_df["generation"].str.replace("generation","")
#print(base_pokemon_df)

Creating DexEntry Key column...
Creating ContextInfo key column...


We create dataframes that match our model's dimensions

In [6]:
print("Pokedex Entry dimension\n")

pkd_entry_dim_cols = ["DexEntryKey","id","name","evolves_from","type1","type2","height","weight","abilities"]
pkd_entry_df = base_pokemon_df[pkd_entry_dim_cols]

print("Renaming the columns to match our model...")
pkd_entry_cols_names = {"id": "pokedex Id", "height":"height_cm", "weight" : "weight_kg"}
pkd_entry_df= pkd_entry_df.rename(columns=pkd_entry_cols_names, errors="raise")

print("Converting height and weight from decameter and dekagram to centimeters and kilogram...")
pkd_entry_df["height_cm"] *= 10
pkd_entry_df["weight_kg"] *= 0.1

print("\n")
print(pkd_entry_df)

Pokedex Entry dimension

Renaming the columns to match our model...
Converting height and weight from decameter and dekagram to centimeters and kilogram...


     DexEntryKey  pokedex Id          name evolves_from     type1    type2  \
0            1-i           1     bulbasaur      nothing     grass   poison   
1            2-i           2       ivysaur    bulbasaur     grass   poison   
2            3-i           3      venusaur      ivysaur     grass   poison   
3            4-i           4    charmander      nothing      fire      NaN   
4            5-i           5    charmeleon   charmander      fire      NaN   
...          ...         ...           ...          ...       ...      ...   
1020     1021-ix        1021   raging-bolt      nothing  electric   dragon   
1021     1022-ix        1022  iron-boulder      nothing      rock  psychic   
1022     1023-ix        1023    iron-crown      nothing     steel  psychic   
1023     1024-ix        1024     terapagos      nothing    nor

In [7]:
print("ContextInfo dimension\n")
cinfo_dim_cols = ["ContextInfoKey","rank","generation"]
cinfo_df = base_pokemon_df[cinfo_dim_cols]

print("Dropping duplicates...")
cinfo_df.drop_duplicates(subset=["ContextInfoKey"],inplace=True)

print("Reseting indexes after removing duplicates...")
cinfo_df.reset_index(drop=True,inplace=True)
print(cinfo_df)

ContextInfo dimension

Dropping duplicates...
Reseting indexes after removing duplicates...
    ContextInfoKey       rank       generation
0       ordinary-i   ordinary     generation-i
1      legendary-i  legendary     generation-i
2       mythical-i   mythical     generation-i
3      ordinary-ii   ordinary    generation-ii
4          baby-ii       baby    generation-ii
5     legendary-ii  legendary    generation-ii
6      mythical-ii   mythical    generation-ii
7     ordinary-iii   ordinary   generation-iii
8         baby-iii       baby   generation-iii
9    legendary-iii  legendary   generation-iii
10    mythical-iii   mythical   generation-iii
11     ordinary-iv   ordinary    generation-iv
12         baby-iv       baby    generation-iv
13    legendary-iv  legendary    generation-iv
14     mythical-iv   mythical    generation-iv
15      mythical-v   mythical     generation-v
16      ordinary-v   ordinary     generation-v
17     legendary-v  legendary     generation-v
18     ordinary

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cinfo_df.drop_duplicates(subset=["ContextInfoKey"],inplace=True)


We create a dataframe for our fact table

In [8]:
print("Fact table\n")
fact_table_cols = ["DexEntryKey","ContextInfoKey","hp", "atk", "def", "spatk","spdef","speed","total"]
fact_table_df = base_pokemon_df[fact_table_cols]

print("Renaming columns to match our model...")
fact_cols_names = {"hp": "HP", "atk":"ATK", "def" : "DEF", "spatk":"SPATK","spdef":"SPDEF", "speed": "SPEED", "total":"TOTAL"}
fact_table_df= fact_table_df.rename(columns=fact_cols_names, errors="raise")

print("Checking for null values...")
print(fact_table_df.isnull().sum())

print("\n")
print(fact_table_df)

Fact table

Renaming columns to match our model...
Checking for null values...
DexEntryKey       0
ContextInfoKey    0
HP                0
ATK               0
DEF               0
SPATK             0
SPDEF             0
SPEED             0
TOTAL             0
dtype: int64


     DexEntryKey ContextInfoKey   HP  ATK  DEF  SPATK  SPDEF  SPEED  TOTAL
0            1-i     ordinary-i   45   49   49     65     65     45    318
1            2-i     ordinary-i   60   62   63     80     80     60    405
2            3-i     ordinary-i   80   82   83    100    100     80    525
3            4-i     ordinary-i   39   52   43     60     50     65    309
4            5-i     ordinary-i   58   64   58     80     65     80    405
...          ...            ...  ...  ...  ...    ...    ...    ...    ...
1020     1021-ix    ordinary-ix  125   73   91    137     89     75    590
1021     1022-ix    ordinary-ix   90  120   80     68    108    124    590
1022     1023-ix    ordinary-ix   90   72  100    1

In [9]:
# If we want to add data from the datasets in data\extended datasets we can do that here

### 3. Loading

In [10]:
#pip install and imports here
%pip install psycopg2-binary
%pip install SQLAlchemy

import psycopg2
import configparser
import sqlalchemy 

Note: you may need to restart the kernel to use updated packages.









Retrieving configuration and connecting to the database

In [11]:
print("Retrieving configuration...")
config = configparser.ConfigParser()
config.read('settings.ini')
db_config = config['DB CONFIGURATION']

print("Connecting to the database...")

try:
    conn = psycopg2.connect(
        "dbname="+ db_config['DB_NAME']+
        " host="+db_config['HOST']+
        " user="+ db_config['USER']+
        " password="+ db_config['PASSWORD']+
        " port="+db_config['PORT']
    )
    cursor = conn.cursor()
    print(conn)
    print(cursor)
    print("Connected to the database !")
except psycopg2.OperationalError as e:
    #print("Error connecting to the database!\n{0}").format(e)
    print("Failed to connect to the database")
    import sys
    err_type, err_obj, traceback = sys.exc_info()
    print("\npsycopg error:",e)
    print ("\npsycopg2 traceback:", traceback, "-- type:", err_type)
    print ("\nextensions.Diagnostics:", e.diag)


Retrieving configuration...
Connecting to the database...
<connection object at 0x0000012C58CA57A0; dsn: 'user=postgres password=xxx dbname=postgres host=localhost port=5432', closed: 0>
<cursor object at 0x0000012C58DBCF20; closed: 0>
Connected to the database !


SQL Queries to load our data

1. Verifying that our tables exist and creating them if they do not


In [12]:
print("Checking for tables...")
try:
    print("Checking ContextInfo_Dimension")
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS "+
        "ContextInfo_Dimension ("+
        "ContextInfoKey varchar(255), "+
        "Rank varchar(255), "+
        "Generation varchar(255), "+
        "PRIMARY KEY (ContextInfoKey) "+
        ");"
    )
    
    print("Checking PokedexEntry_Dimension")
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS "+
        "PokedexEntry_Dimension ("+
        "DexEntryKey varchar(255), "+
        "PokedexId INT, "+
        "Name varchar(255), "+
        "EvolvesFrom varchar(255), "+
        "Type1 varchar(255), "+
        "Type2 varchar(255), "+
        "Height_cm FLOAT, "+
        "Weight_kg FLOAT, "+
        "Abilities varchar(255), "+
        "PRIMARY KEY (DexEntryKey) "+
        ");"
    )
    
    print("Checking Fact Table")
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS "+
        "Fact_Table ("+
        "DexEntryKey varchar(255), "+
        "ContextInfoKey varchar(255), "+
        "HP INT, "+
        "ATK INT, "+
        "DEF INT, "+
        "SPATK INT, "+
        "SPDEF INT, "+
        "SPEED INT, "+
        "TOTAL INT, "+
        "FOREIGN KEY (DexEntryKey) REFERENCES PokedexEntry_Dimension(DexEntryKey), "+
        "FOREIGN KEY (ContextInfoKey) REFERENCES ContextInfo_Dimension(ContextInfoKey) "+
        ");"
    )
    
    cursor.execute("SELECT * FROM pg_catalog.pg_tables;")
    print(cursor.fetchall())
except Exception as e:
    print("\nError:",e)

Checking for tables...
Checking ContextInfo_Dimension
Checking PokedexEntry_Dimension
Checking Fact Table
[('pg_catalog', 'pg_statistic', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg_type', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg_foreign_table', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg_authid', 'postgres', 'pg_global', True, False, False, False), ('pg_catalog', 'pg_statistic_ext_data', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg_user_mapping', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg_subscription', 'postgres', 'pg_global', True, False, False, False), ('pg_catalog', 'pg_attribute', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg_proc', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg_class', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg_attrdef', 'postgres', None, True, False, False, False), ('pg_catalog', 'pg_constraint', 

2. Pushing our datasets to the appropriate tables

In [13]:
%pip install psycopg2
try:
    conn_string = 'postgresql://'+db_config['USER']+':'+db_config['PASSWORD']+'@'+db_config['HOST']+'/'+db_config['DB_NAME']
    db = sqlalchemy.create_engine(conn_string, pool_pre_ping=True)
    print(db)
    conn2 = db.connect()
except Exception as e:
    print("\nError:",e)
    

Note: you may need to restart the kernel to use updated packages.
Engine(postgresql://%27postgres%27:***@'localhost'/'postgres')

Error: (psycopg2.OperationalError) 
(Background on this error at: https://sqlalche.me/e/20/e3q8)




In [None]:
sys.exit(1)

cinfo_df.to_sql('ContextInfo_Dimension',conn2,if_exists='append')
pkd_entry_df.to_sql('PokedexEntry_Dimension',conn2,if_exists='append')
fact_table_df.to_sql('Fact_Table',conn2,if_exists='append')

3. Making aggregates if needed

Closing operations

In [None]:
conn.close()