## Data Staging process for Phase 2 of CSI4142 Project

### 1. Extraction

In [None]:
# pip install and imports here
import re

import pandas as pd

In [None]:
# Load the base Pokemon dataset from its CSV file into a pandas DataFrame,
# then print the frame so the extraction result can be inspected.
base_pokemon_csv = "data/base_df_pokemon.csv"
base_pokemon_df = pd.read_csv(base_pokemon_csv)
print(base_pokemon_df)

In [None]:
# Report the dtype pandas inferred for every column of the base frame.
# One print call with a newline separator emits the same three-part output.
print(
    "Now we verify the types pandas assigned to our columns",
    base_pokemon_df.dtypes,
    "dtype = object signifies a string",
    sep="\n",
)

### 2. Transformation

In [None]:
#pip install and imports here

In [None]:
# Build the two surrogate-key columns used by the dimension and fact tables.
# Both keys end with the generation value stripped of the literal text
# "generation"; DexEntryKey prefixes it with the id, ContextInfoKey with the rank.
print("Creating DexEntry Key column...")
generation_suffix = base_pokemon_df["generation"].str.replace("generation","")
pokedex_id_text = base_pokemon_df["id"].astype(str)
base_pokemon_df["DexEntryKey"] = pokedex_id_text + generation_suffix

print("Creating ContextInfo key column...")
# "rank" is concatenated as-is, so it must already hold strings.
base_pokemon_df["ContextInfoKey"] = base_pokemon_df["rank"] + generation_suffix

We create dataframes that match our model's dimensions

In [None]:
# Build the Pokedex Entry dimension: select its columns from the base frame,
# rename them to the model's names and convert height/weight to cm/kg.
print("Pokedex Entry dimension\n")

pkd_entry_dim_cols = ["DexEntryKey","id","name","evolves_from","type1","type2","height","weight","abilities"]
# NOTE(review): the CREATE TABLE statement later in this notebook names this
# column "PokedexId" (no space) -- confirm which spelling the model expects.
pkd_entry_cols_names = {"id": "pokedex Id", "height":"height_cm", "weight" : "weight_kg"}

print("Renaming the columns to match our model...")
# rename() returns a new frame, so the edits below never touch base_pokemon_df.
pkd_entry_df = base_pokemon_df[pkd_entry_dim_cols].rename(columns=pkd_entry_cols_names, errors="raise")

# The factors below (x10 to reach cm, /10 to reach kg) imply the source units
# are decimeters and hectograms; the previous message named the wrong units.
print("Converting height and weight from decimeters and hectograms to centimeters and kilograms...")
pkd_entry_df["height_cm"] = pkd_entry_df["height_cm"] * 10
# Dividing by 10 instead of multiplying by 0.1 avoids binary-float artefacts
# such as 3 * 0.1 == 0.30000000000000004.
pkd_entry_df["weight_kg"] = pkd_entry_df["weight_kg"] / 10


print("\n")
print(pkd_entry_df)

In [None]:
# Read the variant bridge table and collect, in lower case, the distinct
# values of its 'Non-Variant Pkm Name' column.
print("isVariant column\n")

df_variant = pd.read_csv('data/extended datasets/bridge_pokemon_pokemon_HAS_VARIANT.csv')
unique_variant = df_variant['Non-Variant Pkm Name'].unique()
unique_variant_lower = [variant_name.lower() for variant_name in unique_variant]
print(unique_variant_lower)


In [None]:
# Flag every Pokedex entry whose name contains one of the names listed in the
# variant CSV file.
# Each name is escaped so that regex metacharacters it may contain (".", "+",
# "(", ...) are matched literally instead of being interpreted as a pattern.
variant_pattern = '|'.join(re.escape(variant_name) for variant_name in unique_variant_lower)

if variant_pattern:
    # na=False keeps the column strictly boolean when a name is missing.
    pkd_entry_df['is_there_variant'] = pkd_entry_df['name'].str.contains(variant_pattern, regex=True, na=False)
else:
    # An empty pattern would match every row, so with no variants nothing is flagged.
    pkd_entry_df['is_there_variant'] = False

# NOTE(review): this is still a substring match (a short name also flags longer
# names that contain it) -- confirm whether an exact match (isin) is intended.
pkd_entry_df

In [None]:
# Build the ContextInfo dimension: one row per distinct ContextInfoKey.
print("ContextInfo dimension\n")
cinfo_dim_cols = ["ContextInfoKey","rank","generation"]

print("Dropping duplicates...")
# Work on the frame returned by drop_duplicates() instead of mutating a slice
# of base_pokemon_df in place, which triggers pandas' SettingWithCopyWarning.
cinfo_df = base_pokemon_df[cinfo_dim_cols].drop_duplicates(subset=["ContextInfoKey"])

print("Reseting indexes after removing duplicates...")
cinfo_df = cinfo_df.reset_index(drop=True)
print(cinfo_df)

We create a dataframe for our fact table

In [None]:
# Build the fact table: the two dimension keys plus the stat columns,
# with the stat columns renamed to upper case.
print("Fact table\n")
fact_table_cols = ["DexEntryKey","ContextInfoKey","hp", "atk", "def", "spatk","spdef","speed","total"]
selected_stats = base_pokemon_df[fact_table_cols]

print("Renaming columns to match our model...")
# Every column after the two keys is a stat whose model name is its upper-case form.
fact_cols_names = {stat_col: stat_col.upper() for stat_col in fact_table_cols[2:]}
fact_table_df = selected_stats.rename(columns=fact_cols_names, errors="raise")

print("Checking for null values...")
null_counts = fact_table_df.isna().sum()
print(null_counts)

print("\n")
print(fact_table_df)

In [None]:
# If we want to add data from the datasets in data\extended datasets we can do that here

### 3. Loading

In [None]:
#pip install and imports here
%pip install psycopg2-binary
%pip install SQLAlchemy

import psycopg2
import configparser
import sqlalchemy 

Retrieving configuration and connecting to the database

In [None]:
# Read the database settings from settings.ini and open the psycopg2
# connection (`conn`) and cursor (`cursor`) used by the SQL cells below.
print("Retrieving configuration...")
config = configparser.ConfigParser()
# ConfigParser.read() silently skips unreadable files and returns the list of
# files it parsed, so an empty result means the settings were not loaded.
if not config.read('settings.ini'):
    raise FileNotFoundError("settings.ini was not found or could not be read")
db_config = config['DB CONFIGURATION']

print("Connecting to the database...")

try:
    # Keyword arguments let psycopg2 quote each value itself, so values with
    # spaces or special characters no longer break a hand-built DSN string.
    # Surrounding single quotes are stripped because the old DSN form
    # accepted quoted values.
    conn = psycopg2.connect(
        dbname=db_config['DB_NAME'].strip("'"),
        host=db_config['HOST'].strip("'"),
        user=db_config['USER'].strip("'"),
        password=db_config['PASSWORD'].strip("'"),
        port=db_config['PORT'].strip("'"),
    )
    cursor = conn.cursor()
    print(conn)
    print(cursor)
    print("Connected to the database !")
except psycopg2.OperationalError as e:
    print("Failed to connect to the database")
    print("\npsycopg error:",e)
    print ("\npsycopg2 traceback:", e.__traceback__, "-- type:", type(e))
    print ("\nextensions.Diagnostics:", e.diag)
    # Later cells need `conn` and `cursor`; stop here instead of letting them
    # fail with an unrelated NameError.
    raise


SQL Queries to load our data

1. Verifying that our tables exist and creating them if they do not


In [None]:
# Create the two dimension tables and the fact table when they are missing,
# commit the DDL, then print the content of pg_catalog.pg_tables.
# NOTE(review): the identifiers below are unquoted, while the to_sql/SELECT/DROP
# cells of this notebook use quoted mixed-case names such as
# "ContextInfo_Dimension". PostgreSQL folds unquoted identifiers to lower case,
# so these may not be the same tables -- confirm which naming is intended.
print("Checking for tables...")

table_statements = [
    (
        "Checking ContextInfo_Dimension",
        """
        CREATE TABLE IF NOT EXISTS ContextInfo_Dimension (
            ContextInfoKey varchar(255),
            Rank varchar(255),
            Generation varchar(255),
            PRIMARY KEY (ContextInfoKey)
        );
        """,
    ),
    (
        "Checking PokedexEntry_Dimension",
        """
        CREATE TABLE IF NOT EXISTS PokedexEntry_Dimension (
            DexEntryKey varchar(255),
            PokedexId INT,
            Name varchar(255),
            EvolvesFrom varchar(255),
            Type1 varchar(255),
            Type2 varchar(255),
            Height_cm FLOAT,
            Weight_kg FLOAT,
            Abilities varchar(255),
            PRIMARY KEY (DexEntryKey)
        );
        """,
    ),
    (
        "Checking Fact Table",
        """
        CREATE TABLE IF NOT EXISTS Fact_Table (
            DexEntryKey varchar(255),
            ContextInfoKey varchar(255),
            HP INT,
            ATK INT,
            DEF INT,
            SPATK INT,
            SPDEF INT,
            SPEED INT,
            TOTAL INT,
            FOREIGN KEY (DexEntryKey) REFERENCES PokedexEntry_Dimension(DexEntryKey),
            FOREIGN KEY (ContextInfoKey) REFERENCES ContextInfo_Dimension(ContextInfoKey)
        );
        """,
    ),
]

try:
    # The dimensions are created before the fact table that references them.
    for progress_message, create_statement in table_statements:
        print(progress_message)
        cursor.execute(create_statement)

    # psycopg2 opens a transaction implicitly; without a commit the tables are
    # rolled back when the connection closes and stay invisible to other
    # connections.
    conn.commit()

    cursor.execute("SELECT * FROM pg_catalog.pg_tables;")
    print(cursor.fetchall())
except Exception as e:
    print("\nError:",e)
    # Leave the connection usable: a failed statement aborts the current
    # transaction until it is rolled back.
    conn.rollback()

2. Pushing our datasets to the appropriate tables

In [None]:
# Open the SQLAlchemy engine (`db`) and connection (`conn2`) that pandas uses
# to push the DataFrames to PostgreSQL.
# The driver is already installed by the `%pip install psycopg2-binary` line
# of the imports cell, so it is not installed a second time here.
try:
    # URL.create() escapes special characters in the credentials and carries
    # the configured port, which the hand-built URL string left out.
    # Single quotes are removed from every setting, as the previous string
    # replace did.
    engine_url = sqlalchemy.engine.URL.create(
        drivername="postgresql",
        username=db_config['USER'].replace("'",""),
        password=db_config['PASSWORD'].replace("'",""),
        host=db_config['HOST'].replace("'",""),
        port=int(db_config['PORT'].replace("'","")),
        database=db_config['DB_NAME'].replace("'",""),
    )
    db = sqlalchemy.create_engine(engine_url, pool_pre_ping=True)
    conn2 = db.connect()
    print(conn2)
except Exception as e:
    print("\nError:",e)
    # The loading cells need `conn2`; stop here instead of letting them fail
    # with an unrelated NameError.
    raise
    

In [None]:


# Append each DataFrame to its table, dimensions first and the fact table last.
# index=False keeps the pandas row index out of the tables: it is not part of
# the model and would otherwise be written as an extra "index" column.
frames_by_table = (
    ('ContextInfo_Dimension', cinfo_df),
    ('PokedexEntry_Dimension', pkd_entry_df),
    ('Fact_Table', fact_table_df),
)
for table_name, frame in frames_by_table:
    frame.to_sql(table_name, conn2, if_exists='append', index=False)

In [None]:
# Print every row of each loaded table to verify the load.
for loaded_table in ("ContextInfo_Dimension", "PokedexEntry_Dimension", "Fact_Table"):
    # The name is double-quoted so the mixed-case identifier is used as written.
    cursor.execute('SELECT * FROM "' + loaded_table + '";')
    print(cursor.fetchall())


In [None]:
# Drop the loaded tables so the notebook can be re-run from a clean state.
# The fact table goes first so that it can never block the removal of a
# dimension it references.
cursor.execute('DROP TABLE IF EXISTS "Fact_Table";')
cursor.execute('DROP TABLE IF EXISTS "PokedexEntry_Dimension";')
cursor.execute('DROP TABLE IF EXISTS "ContextInfo_Dimension";')
# psycopg2 runs these statements inside a transaction; without a commit the
# drops are rolled back when the connection is closed.
conn.commit()

3. Making aggregates if needed

Closing operations

In [None]:
# Release every database resource opened by this notebook.
try:
    cursor.close()
    conn.close()
finally:
    # Runs even if closing the psycopg2 side fails, so the SQLAlchemy
    # connection and its engine pool are always released.
    conn2.close()
    db.dispose()