# Import Data

<br>
<br>

Below are brief explanations of what data we are importing and what these variables represent
* `ALL_DATA` _(data frame)_ - is our main data set 
* `poke_types` _(data frame)_ - is a seperate data set that contains a mapping of Pokemon IDs to pokemon name and types. This will be important as our main dependant variable will be **Pokemon type**
* `pokemonId_ALL` _(list/array)_ - of all unique pokemon IDs that exist in `ALL_DATA` from smallest to biggest number. 

<br>
<br>
<br>
<br>


In [1]:
import pandas as pd # data frames
import numpy as np # ____number generation
import statsmodels.formula.api as smf # for linear modeling
import matplotlib.pyplot as plt # plotting

In [2]:
ALL_DATA = pd.read_csv('data/300k.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
poke_types = pd.read_csv('data/pokeId.csv')
poke_types = poke_types[['#', "Name", "Type 1", "Type 2"]]

pokemonId_ALL = set(ALL_DATA.pokemonId)

In [9]:
# Create a mapping of IDs to type. We need to do this in order to add the
# appropriate type to each pokemon in our main data set.

pokeId_toType = {}
for index, row in poke_types.iterrows():
    if row["#"] in pokemonId_ALL: #only add poke that are in main data set
        pokeId_toType[row["#"]] = [row["Name"] ,row["Type 1"], row["Type 2"]]
        
        
pokeId_toType

{1: ['Bulbasaur', 'Grass', 'Poison'],
 2: ['Ivysaur', 'Grass', 'Poison'],
 3: ['VenusaurMega Venusaur', 'Grass', 'Poison'],
 4: ['Charmander', 'Fire', nan],
 5: ['Charmeleon', 'Fire', nan],
 6: ['CharizardMega Charizard Y', 'Fire', 'Flying'],
 7: ['Squirtle', 'Water', nan],
 8: ['Wartortle', 'Water', nan],
 9: ['BlastoiseMega Blastoise', 'Water', nan],
 10: ['Caterpie', 'Bug', nan],
 11: ['Metapod', 'Bug', nan],
 12: ['Butterfree', 'Bug', 'Flying'],
 13: ['Weedle', 'Bug', 'Poison'],
 14: ['Kakuna', 'Bug', 'Poison'],
 15: ['BeedrillMega Beedrill', 'Bug', 'Poison'],
 16: ['Pidgey', 'Normal', 'Flying'],
 17: ['Pidgeotto', 'Normal', 'Flying'],
 18: ['PidgeotMega Pidgeot', 'Normal', 'Flying'],
 19: ['Rattata', 'Normal', nan],
 20: ['Raticate', 'Normal', nan],
 21: ['Spearow', 'Normal', 'Flying'],
 22: ['Fearow', 'Normal', 'Flying'],
 23: ['Ekans', 'Poison', nan],
 24: ['Arbok', 'Poison', nan],
 25: ['Pikachu', 'Electric', nan],
 26: ['Raichu', 'Electric', nan],
 27: ['Sandshrew', 'Ground', 

In [13]:
        
# Add the new columns
ALL_DATA["Name"] = ""
ALL_DATA["Type"] = "" 

In [None]:
# Update empty columns
total_rows = len(ALL_DATA)
for index, row in ALL_DATA.iterrows():
    
    printPercent(index, 200, total_rows)
    tempTypes = pokeId_toType[row["pokemonId"]]
    ALL_DATA.loc[index, ["Name", "Type"]] = tempTypes[0], tempTypes[1]

<br>
<br>
<br>

## Small Helper Functions

In [32]:
def printAllFeatures():
    cols = ALL_DATA.columns
    for col in cols:
        print(col)

In [7]:
def printPercent(index, breakPoint, total):
    if (index % breakPoint) == 0:
        percent = round(100 * index / total, 2)
        print(str(percent)  + "% done")


<br>
<br>
<br>

## Correlations
Visualize the correlations of features to our dependant variable: pokemon type.

In [None]:
from matplotlib.pyplot import figure

correlation = ALL_DATA.drop([''], axis=1) # .corr()
corr_sorted = correlation.sort_values(by=['total_cases'])
features = list(correlation.columns.values)
corr_nums = list(correlation.total_cases)

figure(figsize=(10,10))
plt.barh(features, corr_nums, align='center', alpha=0.5)
plt.yticks(features, features)
plt.xlabel('Correlation')
plt.title('Correlation of environmental variables to total cases of Dengue')

plt.show()

<br>
<br>
<br>

### Pokemon Types based on weather features

In [12]:
col_pokeID = ["pokemonId"]
cols_time = ["appearedTimeOfDay", "appearedHour", "appearedMinute", "appearedDayOfWeek", "appearedDay", "appearedYear"]
cols_weather = ["weather", "windSpeed", "windBearing", "pressure", "weatherIcon", "sunriseMinutesMidnight", "sunriseHour", "sunriseMinute", "sunriseMinutesSince", "sunsetMinutesMidnight", "sunsetHour", "sunsetMinute", "sunsetMinutesBefore"]

cols = col_pokeID + cols_weather
geo_weather_data = ALL_DATA[cols]
geo_weather_data

Unnamed: 0,pokemonId,weather,windSpeed,windBearing,pressure,weatherIcon,sunriseMinutesMidnight,sunriseHour,sunriseMinute,sunriseMinutesSince,sunsetMinutesMidnight,sunsetHour,sunsetMinute,sunsetMinutesBefore
0,16,Foggy,4.79,269,1018.02,fog,436,7,16,941,1181,19,41,-196
1,133,Foggy,4.79,269,1018.02,fog,436,7,16,941,1181,19,41,-196
2,16,Clear,4.29,218,1015.29,clear-night,404,6,44,1033,1171,19,31,-266
3,13,PartlyCloudy,5.84,160,1020.52,partly-cloudy-night,398,6,38,858,1179,19,39,-77
4,133,PartlyCloudy,5.84,160,1020.52,partly-cloudy-night,398,6,38,858,1179,19,39,-77
5,21,PartlyCloudy,6.39,218,1024.44,partly-cloudy-day,385,6,25,330,1085,18,5,370
6,66,PartlyCloudy,6.40,218,1024.45,partly-cloudy-day,385,6,25,330,1085,18,5,370
7,27,Clear,11.26,142,1016.69,clear-night,436,7,16,939,1187,19,47,-188
8,35,Foggy,4.79,269,1018.02,fog,436,7,16,941,1181,19,41,-196
9,19,Clear,3.94,253,1020.12,clear-night,437,7,17,997,1195,19,55,-239
