#### Phase 4 Data Mining

## Part A

In [None]:
import pandas as pd
import configparser
import sqlalchemy 
import sklearn
import matplotlib.pyplot as plt

In [None]:
# Retrieve data from database
# Connection settings come from the [DB CONFIGURATION] section of ../settings.ini.
config = configparser.ConfigParser()
config.read('../settings.ini')
db_config = config['DB CONFIGURATION']
try:
    conn_string = "postgresql://"+db_config['USER']+":"+db_config['PASSWORD']+"@"+db_config['HOST']+"/"+db_config['DB_NAME']
    # Strip single quotes that may surround the values in the ini file
    conn_string = conn_string.replace("'","")
    db = sqlalchemy.create_engine(conn_string, pool_pre_ping=True)
    # The context manager closes the connection even when one of the reads fails
    with db.connect() as conn2:
        fact_table_df = pd.read_sql(sql='Fact_Table',con=conn2)
        cinfo_df = pd.read_sql(sql='ContextInfo_Dimension',con=conn2)
        pkd_entry_df = pd.read_sql(sql='PokedexEntry_Dimension',con=conn2)
except Exception as e:
    print("\nError:",e)
    # Every later cell needs these three frames, so stop here instead of
    # continuing and failing further down with an unrelated NameError.
    raise

General graphs showing the distribution of total stats across all Pokémon

In [None]:

# Left-join the fact table with the context-info dimension, then look at how
# the TOTAL column is distributed: first as a histogram, then as a box plot.
df_tmp = fact_table_df.merge(cinfo_df, how='left', on='ContextInfoKey')
total_stats = df_tmp['TOTAL']

plt.hist(total_stats, bins=50)
plt.show()

plt.boxplot(total_stats, patch_artist=True)
plt.show()

Generation-specific stats

In [None]:
# TOTAL stat per generation: a scatter plot followed by a summary table.
df_tmp = pd.merge(fact_table_df, cinfo_df, on='ContextInfoKey', how='left')

plt.scatter(df_tmp['generation'], df_tmp['TOTAL'])
plt.xlabel('Generation')
plt.xticks(rotation=90)
plt.ylabel('Total stats')
plt.show()

# Mean TOTAL and number of rows per generation, computed in one groupby.
# The previous version merged the mean frame with a value_counts() Series on
# 'generation'; that only lines up on pandas >= 2.0, where the Series index is
# named 'generation'. On older pandas the Series itself is named 'generation'
# and holds the counts, so the join key was wrong.
type_avg = (
    df_tmp.groupby('generation')['TOTAL']
    .agg(TOTAL='mean', count='size')
    .reset_index()
    .sort_values(by='TOTAL', ascending=True)
)
print("average total stats for each generation")
print(type_avg)

Stats by type

In [None]:
# TOTAL stat broken down by type, looking at both the type1 and type2 columns.
df_tmp = pd.merge(fact_table_df, cinfo_df, on='ContextInfoKey', how='left')
df_tmp = pd.merge(df_tmp, pkd_entry_df, on='DexEntryKey', how='left')
# Sort by TOTAL so the points are plotted in ascending order of total stats
df_tmp = df_tmp.sort_values(by='TOTAL', ascending=True)

# Replace missing type2 values with the placeholder 'None' so the column can be
# plotted as a category. Only type2 is filled: a frame-wide fillna('None')
# would also turn missing values in every other column into strings.
df_tmp['type2'] = df_tmp['type2'].fillna('None')

fig = plt.figure()
ax = fig.add_subplot(111)
fig.set_size_inches(15, 5)

# Labels are required for the legend to have anything to show
ax.scatter(df_tmp['type1'], df_tmp['TOTAL'], color='r', label='type1')
ax.scatter(df_tmp['type2'], df_tmp['TOTAL'], color='b', label='type2')
ax.set_xlabel('Type')
ax.set_ylabel('Total stats')
ax.legend(loc='upper left')

plt.show()

# Calculate the average of the total stats for each type.
# Each row contributes once for its type1 and, when it has a real type2, once
# more for that type. Stacking both columns into one long frame avoids the two
# problems of the previous approach: mean() was handed a second GroupBy object
# as its positional argument, and adding the two value_counts() Series yielded
# NaN for any type that appears in only one of the columns.
has_type2 = df_tmp['type2'] != 'None'
type_long = pd.concat(
    [
        df_tmp[['type1', 'TOTAL']],
        df_tmp.loc[has_type2, ['type2', 'TOTAL']].rename(columns={'type2': 'type1'}),
    ],
    ignore_index=True,
)

# Output columns keep the original names: type1, TOTAL (mean), amount (row count)
type_avg = (
    type_long.groupby('type1')['TOTAL']
    .agg(TOTAL='mean', amount='size')
    .reset_index()
    .sort_values(by='TOTAL', ascending=True)
)
print("average total stats for each type")
print(type_avg)





Stats by rank

In [None]:
# TOTAL stat per rank: a scatter plot followed by a summary table.
df_tmp = pd.merge(fact_table_df, cinfo_df, on='ContextInfoKey', how='left')
df_tmp = pd.merge(df_tmp, pkd_entry_df, on='DexEntryKey', how='left')

plt.scatter(df_tmp['rank'], df_tmp['TOTAL'])
plt.xlabel('Rank')
plt.ylabel('Total stats')
plt.show()

# Mean TOTAL and number of rows per rank, computed in one groupby.
# Merging the mean frame with a value_counts() Series on 'rank' only lines up
# on pandas >= 2.0; on older versions the Series is named 'rank' and holds the
# counts, so the join would match rank labels against counts.
type_avg = (
    df_tmp.groupby('rank')['TOTAL']
    .agg(TOTAL='mean', count='size')
    .reset_index()
    .sort_values(by='TOTAL', ascending=True)
)
print("average total stats for each rank")
print(type_avg)

Individual stat comparison

In [None]:
import numpy as np

# Fact rows left-joined with the context-info and Pokedex-entry dimensions
df_tmp = (
    fact_table_df
    .merge(cinfo_df, how='left', on='ContextInfoKey')
    .merge(pkd_entry_df, how='left', on='DexEntryKey')
)

# Six stacked panels, one per stat column, sharing both axes.
# Each panel plots the stat value on x against the generation on y.
fig, ax = plt.subplots(nrows=6, ncols=1, sharex=True, sharey=True)
fig.set_size_inches(10, 20)  # (width, height)

for panel, stat in zip(ax, ('HP', 'ATK', 'DEF', 'SPATK', 'SPDEF', 'SPEED')):
    panel.scatter(df_tmp[stat], df_tmp['generation'])
    panel.set_title(stat + ' per Generation')

plt.show()




## Part B

In [None]:
#pip install and imports here
%pip install requests
import requests
import csv

Download competitive Pokémon data from https://smogonapi.herokuapp.com and save it to a CSV file

In [None]:
#Download competitive pokemon data
generations = ["rb","gs","rs","dp","bw","xy","sm","ss","sv"]
smogonapi_getPokemonByGen_url = "https://smogonapi.herokuapp.com/GetPokemonByGen/"
# Only the last generation code in the list is requested
rqst = smogonapi_getPokemonByGen_url+generations[-1]
# requests has no default timeout; without one an unresponsive server would
# block this cell indefinitely
response = requests.get(rqst, timeout=30)
# Fail here on an HTTP error status rather than trying to parse an error page as JSON
response.raise_for_status()
response_json = response.json()

In [None]:
#Save response to csv file
# Show the record count and a short preview instead of dumping the whole payload
print(len(response_json))
print(response_json[:3])

# Column names are the union of the keys of all records, in first-seen order,
# so a record with missing or reordered keys cannot shift values into the
# wrong column.
fieldnames = list(dict.fromkeys(key for data in response_json for key in data))

# Forward slashes work on every platform and match the path the next cell
# reads from; the backslash form contained invalid escape sequences and named
# a different file outside Windows. UTF-8 matches the default of pd.read_csv.
# The with-block closes the file even if writing fails.
with open('../data/extended datasets/competitive_df_pokemon.csv', 'w', newline='', encoding='utf-8') as csv_file:
    if fieldnames:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(response_json)

Clean up the competitive data into a data frame

In [None]:
import pandas as pd

# Load the CSV written by the previous cell and show it as read
competitive_pokemon_df = pd.read_csv("../data/extended datasets/competitive_df_pokemon.csv")
print(competitive_pokemon_df)

# Remove the listed columns and show the result
columns_to_drop = ['hp','atk','def','spa','spd','spe','weight','height','types','abilities']
competitive_pokemon_df = competitive_pokemon_df.drop(columns=columns_to_drop)
print(competitive_pokemon_df)

# Keep only the rows whose isNonstandard value is not 'CAP'
competitive_pokemon_df = competitive_pokemon_df.loc[
    competitive_pokemon_df['isNonstandard'].ne('CAP')
]
print(competitive_pokemon_df)

## Part C