In [1]:
# based on https://www.kaggle.com/michaelkang/predicting-identity-of-pokemon-given-features

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

data = pd.read_csv('../datasets/pokemonGo/300k.csv', low_memory=False)

In [3]:
data[['city','latitude', 'longitude', 'appearedLocalTime']].head(5)

Unnamed: 0,city,latitude,longitude,appearedLocalTime
0,Mexico_City,20.525745,-97.460829,2016-09-08T03:57:45
1,Mexico_City,20.523695,-97.461167,2016-09-08T03:57:37
2,New_York,38.90359,-77.19978,2016-09-08T03:57:25
3,Los_Angeles,47.665903,-122.312561,2016-09-08T03:56:22
4,Los_Angeles,47.666454,-122.311628,2016-09-08T03:56:08


In [4]:
egenskapliste = list(data)
print egenskapliste

['pokemonId', 'latitude', 'longitude', 'appearedLocalTime', '_id', 'cellId_90m', 'cellId_180m', 'cellId_370m', 'cellId_730m', 'cellId_1460m', 'cellId_2920m', 'cellId_5850m', 'appearedTimeOfDay', 'appearedHour', 'appearedMinute', 'appearedDayOfWeek', 'appearedDay', 'appearedMonth', 'appearedYear', 'terrainType', 'closeToWater', 'city', 'continent', 'weather', 'temperature', 'windSpeed', 'windBearing', 'pressure', 'weatherIcon', 'sunriseMinutesMidnight', 'sunriseHour', 'sunriseMinute', 'sunriseMinutesSince', 'sunsetMinutesMidnight', 'sunsetHour', 'sunsetMinute', 'sunsetMinutesBefore', 'population_density', 'urban', 'suburban', 'midurban', 'rural', 'gymDistanceKm', 'gymIn100m', 'gymIn250m', 'gymIn500m', 'gymIn1000m', 'gymIn2500m', 'gymIn5000m', 'pokestopDistanceKm', 'pokestopIn100m', 'pokestopIn250m', 'pokestopIn500m', 'pokestopIn1000m', 'pokestopIn2500m', 'pokestopIn5000m', 'cooc_1', 'cooc_2', 'cooc_3', 'cooc_4', 'cooc_5', 'cooc_6', 'cooc_7', 'cooc_8', 'cooc_9', 'cooc_10', 'cooc_11', 'co

In [5]:
train = data
train = train.drop(['_id', 'cellId_90m', 'cellId_180m', 'cellId_370m', 'cellId_730m', 'cellId_1460m', 'cellId_2920m', 'cellId_5850m'],1)
train = train.drop(['gymIn100m', 'gymIn250m', 'gymIn500m', 'gymIn1000m', 'gymIn2500m', 'gymIn5000m', 'pokestopIn100m', 'pokestopIn250m', 'pokestopIn500m', 'pokestopIn1000m', 'pokestopIn2500m', 'pokestopIn5000m'],1)
train = train.drop(['appearedDayOfWeek'],1)

In [6]:
#Noticed that the appeared Hour/Minute/Day/Month/Year weren't consisted with appearedLocalTime. Removed them all in favor of appearedLocalTime
train = train.drop(['appearedHour', 'appearedMinute', 'appearedDay', 'appearedMonth', 'appearedYear'],1)
#Convert appearedLocalTime string to DateTime
train['appearedLocalTime'] =  pd.to_datetime(train['appearedLocalTime'], format='%Y-%m-%dT%H:%M:%S')        #Note that %y is a 2digit, while %Y is 4digits for the year
#Now reinstate the appeared Hour/Minute/Day/Month/Year, then drop appearedLocalTime
train['appearedHour'] = train['appearedLocalTime'].dt.hour
train['appearedMinute'] = train['appearedLocalTime'].dt.minute
train['appearedDay'] = train['appearedLocalTime'].dt.day
train['appearedMonth'] = train['appearedLocalTime'].dt.month
train['appearedYear'] = train['appearedLocalTime'].dt.year
train = train.drop(['appearedLocalTime'],1)
#Now use 1-of-K encoding using pd.get_dummies()
Hour = pd.get_dummies(train.appearedHour, drop_first=True, prefix='hour')
Minute = pd.get_dummies(train.appearedMinute, drop_first=True, prefix='minute')
Day = pd.get_dummies(train.appearedDay, drop_first=True, prefix='day')
Month = pd.get_dummies(train.appearedMonth, drop_first=True, prefix='month')
Year = pd.get_dummies(train.appearedYear, drop_first=True, prefix='year')
train = train.join(Hour)         #To avoid dummy variable trap
train = train.join(Minute)
train = train.join(Day)
train = train.join(Month)
train = train.join(Year)
#Now we drop the appearedTimeX feature
train = train.drop(['appearedHour', 'appearedMinute', 'appearedDay', 'appearedMonth', 'appearedYear'],1)

In [7]:
#Converting appearedTimeofDay into ordinal
time_mapping = {"morning": 0, "afternoon": 1, "evening": 2, "night": 3}
train['appearedTimeOfDay'] = train['appearedTimeOfDay'].map(time_mapping)

In [8]:
#Same for terrainType
Terr = pd.get_dummies(train.terrainType, drop_first=True, prefix='terr')
#train = train.join(Terr)         #To avoid dummy variable trap
#Now we drop the terrain feature
#train = train.drop(['terrainType'],1)

In [9]:
#Get dummies on cities
City = pd.get_dummies(train.city, drop_first=True, prefix='city')
train = train.join(City)         #To avoid dummy variable trap
#Now we drop the city feature
train = train.drop(['city'],1)

In [10]:
#redefining continents such that they correspond to the main 7 continents (no Antartica, yes Indian)
train.continent[train['continent']=='America/Indiana']='America'
train.continent[train['continent']=='America/Kentucky']='America'
train.continent[train['continent']=='Pacific']='Australia'
train.continent[train['continent']=='Atlantic']='Europe'
train.continent[train['continent']=='America/Argentina']='CentralAmerica'
#Then change them to dummies
Continent = pd.get_dummies(train.continent, drop_first=True, prefix='continent')
train = train.join(Continent)         #To avoid dummy variable trap
#Now we drop the continent feature
train = train.drop(['continent'],1)

In [11]:
#Comparing weather columns and choosing to drop weatherIcon. Then use dummies for weather
train['weather'].value_counts()
train['weatherIcon'].value_counts()             #These weather icons are based on time of day as well, making me inclined to not use them.
Weather = pd.get_dummies(train.weather, drop_first=True, prefix='weather')
train = train.join(Weather)         #To avoid dummy variable trap
#Now we drop both weather features
train = train.drop(['weatherIcon', 'weather'],1)

In [12]:
#Want to band windBearing into the 8 cardinal directions. (Probably used azimuth degrees where blowing north is 0 degrees and blowing west is 90 degrees)
#We define North as 0, NW as 1, W as 2, etc...
train.loc[(train['windBearing'] >= 337.5), 'windBearing'] = 0
train.loc[(train['windBearing'] < 22.5), 'windBearing'] = 0
train.loc[(train['windBearing'] >= 22.5) & (train['windBearing'] < 67.5), 'windBearing'] = 1
train.loc[(train['windBearing'] >= 67.5) & (train['windBearing'] < 112.5), 'windBearing'] = 2
train.loc[(train['windBearing'] >= 112.5) & (train['windBearing'] < 157.5), 'windBearing'] = 3
train.loc[(train['windBearing'] >= 157.5) & (train['windBearing'] < 202.5), 'windBearing'] = 4
train.loc[(train['windBearing'] >= 202.5) & (train['windBearing'] < 247.5), 'windBearing'] = 5
train.loc[(train['windBearing'] >= 247.5) & (train['windBearing'] < 292.5), 'windBearing'] = 6
train.loc[(train['windBearing'] >= 292.5) & (train['windBearing'] < 337.5), 'windBearing'] = 7
#Now make them into dummies
WindBearing = pd.get_dummies(train.windBearing, drop_first=True, prefix='windBearing')
train = train.join(WindBearing)         #To avoid dummy variable trap
#Now we drop the wind direction feature
train = train.drop(['windBearing'],1)

In [13]:
#Some quick functions for converting minutes for sunrise/sunset minute standardization
def OnlyPositiveTime(x):
    if x<0:
        return x+1440                   #Where 1440 = minutes per day
    else:
        return x
    
def OnlyNegativeTime(x):
    if x>0:
        return x-1440                   #Where 1440 = minutes per day
    else:
        return x

In [14]:
#Turned Sunrise/set Hour & Minute into dummies. Made sure that minutes since midnight for sunrise/set is positive (no negative minutes)
SunriseHour = pd.get_dummies(train.sunriseHour, drop_first=True, prefix='sunriseHour')
SunriseMinute = pd.get_dummies(train.sunriseMinute, drop_first=True, prefix='sunriseMinute')
SunsetHour = pd.get_dummies(train.sunsetHour, drop_first=True, prefix='sunsetHour')
SunsetMinute = pd.get_dummies(train.sunsetMinute, drop_first=True, prefix='sunsetMinute')
train = train.join(SunriseHour)         #To avoid dummy variable trap
train = train.join(SunriseMinute)
train = train.join(SunsetHour)
train = train.join(SunsetMinute)
#Now we drop the sunrise/set time features
train = train.drop(['sunriseHour', 'sunriseMinute', 'sunsetHour', 'sunsetMinute'],1)
train['sunriseMinutesMidnight'].apply(OnlyPositiveTime)
train['sunsetMinutesMidnight'].apply(OnlyPositiveTime)
#Make sure that each sighting's minutes since sunrise (sunriseMinutesSince) is positive & that sunsetMinutesBefore is negative
train['sunriseMinutesSince'].apply(OnlyPositiveTime)
train['sunsetMinutesBefore'].apply(OnlyNegativeTime)

0         -196
1         -196
2         -266
3          -77
4          -77
5        -1070
6        -1070
7         -188
8         -196
9         -239
10        -239
11        -239
12        -188
13        -196
14        -188
15         -77
16        -196
17        -101
18        -196
19       -1070
20        -249
21        -188
22        -196
23        -105
24        -196
25        -188
26         -77
27        -105
28        -631
29        -196
          ... 
295991   -1265
295992   -1265
295993    -246
295994    -246
295995    -251
295996   -1167
295997   -1310
295998   -1294
295999   -1145
296000   -1335
296001    -231
296002   -1356
296003    -191
296004    -251
296005   -1335
296006   -1310
296007    -252
296008    -227
296009   -1321
296010   -1294
296011   -1278
296012   -1335
296013    -183
296014    -183
296015    -234
296016   -1337
296017    -227
296018   -1286
296019   -1265
296020    -251
Name: sunsetMinutesBefore, dtype: int64

In [15]:
#Change urban-suburban-urban into numeric values. 0=urban, 1=midurban, 2=suburban, 3=rural
#Dropping suburban and midurban columns, since they dont seem to be accurate. A sighting can't be both urban, suburban, and midurban if they are partitioned bands of population density
#Instead banding to get the urban, suburban, midurban, rural categorization, then changing to ordinal
train = train.drop(['urban', 'suburban', 'midurban', 'rural'],1)
train.loc[train['population_density'] < 200, 'population_density'] = 0
train.loc[(train['population_density'] >= 200) & (train['population_density'] < 400), 'population_density'] = 1
train.loc[(train['population_density'] >= 400) & (train['population_density'] < 800), 'population_density'] = 2
train.loc[train['population_density'] > 800, 'population_density'] = 3
#Just changing the name to show that I processed
train.rename(columns={'population_density' : 'Urbanity'}, inplace = True)

In [16]:
#Changing pokestopDistanceKm from a str to a float
PokestopDistance = pd.to_numeric(train['pokestopDistanceKm'], errors='coerce')
temporary = pd.concat([train, PokestopDistance], axis=1)
#This ends up dropping 39 instances. I'll find out what is causing the NaN's later (Note: errors='coerce' made them NaN's)
train = temporary.dropna()

In [17]:
#Making sure that pokemonID (the first column)) and class (the last column) are the same
row_ids = train[train['class'] != train.pokemonId].index        #This yields an empty set --> identical columns
#So now drop one of them and keep the other (for now) to use as the labels
train.drop(['class'],1)

Unnamed: 0,pokemonId,latitude,longitude,appearedTimeOfDay,terrainType,closeToWater,temperature,windSpeed,pressure,sunriseMinutesMidnight,...,sunsetMinute_51,sunsetMinute_52,sunsetMinute_53,sunsetMinute_54,sunsetMinute_55,sunsetMinute_56,sunsetMinute_57,sunsetMinute_58,sunsetMinute_59,pokestopDistanceKm
0,16,20.525745,-97.460829,3,14,False,25.5,4.79,1018.02,436,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.081776
1,133,20.523695,-97.461167,3,14,False,25.5,4.79,1018.02,436,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.195622
2,16,38.903590,-77.199780,3,13,False,24.2,4.29,1015.29,404,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.338602
3,13,47.665903,-122.312561,3,0,True,15.6,5.84,1020.52,398,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109479
4,133,47.666454,-122.311628,3,0,True,15.6,5.84,1020.52,398,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040364
5,21,-31.954980,115.853609,3,13,False,16.5,6.39,1024.44,385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011718
6,66,-31.954245,115.852038,3,13,False,16.5,6.40,1024.45,385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040814
7,27,26.235257,-98.197591,3,13,False,28.0,11.26,1016.69,436,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.958176
8,35,20.525554,-97.458800,3,14,False,25.5,4.79,1018.02,436,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116773
9,19,32.928558,-84.340278,3,8,False,23.7,3.94,1020.12,437,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.300859


In [18]:
train = train.drop(['pokemonId'],1)
train.to_csv('../datasets/pokemonGo/processed_pokemon_go.csv')

features = train.drop(['class'],1).as_matrix()

labels = train['class'].as_matrix()
feature_names = list( train.drop(['class'],1))
print feature_names
print "Number of features: %d" %len(feature_names)

['latitude', 'longitude', 'appearedTimeOfDay', 'terrainType', 'closeToWater', 'temperature', 'windSpeed', 'pressure', 'sunriseMinutesMidnight', 'sunriseMinutesSince', 'sunsetMinutesMidnight', 'sunsetMinutesBefore', 'Urbanity', 'gymDistanceKm', 'pokestopDistanceKm', 'cooc_1', 'cooc_2', 'cooc_3', 'cooc_4', 'cooc_5', 'cooc_6', 'cooc_7', 'cooc_8', 'cooc_9', 'cooc_10', 'cooc_11', 'cooc_12', 'cooc_13', 'cooc_14', 'cooc_15', 'cooc_16', 'cooc_17', 'cooc_18', 'cooc_19', 'cooc_20', 'cooc_21', 'cooc_22', 'cooc_23', 'cooc_24', 'cooc_25', 'cooc_26', 'cooc_27', 'cooc_28', 'cooc_29', 'cooc_30', 'cooc_31', 'cooc_32', 'cooc_33', 'cooc_34', 'cooc_35', 'cooc_36', 'cooc_37', 'cooc_38', 'cooc_39', 'cooc_40', 'cooc_41', 'cooc_42', 'cooc_43', 'cooc_44', 'cooc_45', 'cooc_46', 'cooc_47', 'cooc_48', 'cooc_49', 'cooc_50', 'cooc_51', 'cooc_52', 'cooc_53', 'cooc_54', 'cooc_55', 'cooc_56', 'cooc_57', 'cooc_58', 'cooc_59', 'cooc_60', 'cooc_61', 'cooc_62', 'cooc_63', 'cooc_64', 'cooc_65', 'cooc_66', 'cooc_67', 'cooc_

In [None]:
#features = features[:,:10]

N = len(labels)

test_N = 0.5*N
train_N = N-test_N

idx = np.random.permutation(N)
train_idx = idx[:train_N]
test_idx = idx[train_N:]

train_data   = features[train_idx]
train_labels = labels[train_idx]

test_data   = features[test_idx]
test_labels = labels[test_idx]

In [None]:
from sklearn.svm import SVC, LinearSVC


clf = SVC()
clf.fit(train_data,train_labels)
predictions = clf.predict(test_data)
print "predicted labels"
print predictions
print "true labels"
print test_labels

print "Accuracy score: %f" %(predictions==test_labels).mean()


In [None]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf.fit(train_data,train_labels)

In [None]:
predictions = clf.predict(test_data)
print "predicted labels"
print predictions
print "true labels"
print test_labels

print "Accuracy score: %f" %(predictions==test_labels).mean()

feature importance

In [None]:
print clf.feature_importances_

In [None]:
importances = clf.feature_importances_

indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(train_data.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(train_data.shape[1]), importances[indices],
       color="r", align="center")
plt.xticks(range(train_data.shape[1]), indices)
plt.xlim([-1, train_data.shape[1]])
plt.show()

In [None]:
most_important_feature_idx = np.argmax(importances)
print most_important_feature_idx
most_important_feature = feature_names[most_important_feature_idx]
print most_important_feature