In [46]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ghouls-goblins-and-ghosts-boo/train.csv.zip
/kaggle/input/ghouls-goblins-and-ghosts-boo/test.csv.zip
/kaggle/input/ghouls-goblins-and-ghosts-boo/sample_submission.csv.zip


# Load data

In [47]:
train_data = pd.read_csv('/kaggle/input/ghouls-goblins-and-ghosts-boo/train.csv.zip')
test_data = pd.read_csv('/kaggle/input/ghouls-goblins-and-ghosts-boo/test.csv.zip')

# Analyze data

**Describe data**

* look at columns
* check shape
* check null columns

In [48]:
print("Colums: ", train_data.columns.values)
print("Shape: ", train_data.shape)

Colums:  ['id' 'bone_length' 'rotting_flesh' 'hair_length' 'has_soul' 'color'
 'type']
Shape:  (371, 7)


In [49]:
print("Missing values:")
print(train_data.isnull().sum())

Missing values:
id               0
bone_length      0
rotting_flesh    0
hair_length      0
has_soul         0
color            0
type             0
dtype: int64


**Cool, there are no missing values =)**

# All colors

In [50]:
train_data.color.unique()

array(['clear', 'green', 'black', 'white', 'blue', 'blood'], dtype=object)

# Feature Engineering

* Categorical: 'color'
* Numerical: 'bone_length' 'rotting_flesh' 'hair_length'


Transform color column to binary columns

In [51]:
test_data = pd.concat([test_data,
                pd.get_dummies(test_data.color, prefix="color", drop_first = True)
                 ], axis=1)
train_data = pd.concat([train_data,
                pd.get_dummies(train_data.color, prefix="color", drop_first = True)
                 ], axis=1)

In [52]:
test_id = test_data['id'].copy()
test_data.drop(['color','id'], axis=1, inplace=True)
train_data.drop(['color','id'], axis=1, inplace=True)

In [53]:
print("Colums: ", train_data.columns.values)

Colums:  ['bone_length' 'rotting_flesh' 'hair_length' 'has_soul' 'type'
 'color_blood' 'color_blue' 'color_clear' 'color_green' 'color_white']


# Init MLP

In [54]:
y=train_data['type']
X=train_data.copy()
del X['type']
print(X)
np.shape(X)

     bone_length  rotting_flesh  hair_length  has_soul  color_blood  \
0       0.354512       0.350839     0.465761  0.781142            0   
1       0.575560       0.425868     0.531401  0.439899            0   
2       0.467875       0.354330     0.811616  0.791225            0   
3       0.776652       0.508723     0.636766  0.884464            0   
4       0.566117       0.875862     0.418594  0.636438            0   
..           ...            ...          ...       ...          ...   
366     0.458132       0.391760     0.660590  0.635689            0   
367     0.331936       0.564836     0.539216  0.551471            0   
368     0.481640       0.501147     0.496446  0.544003            0   
369     0.294943       0.771286     0.583503  0.300618            0   
370     0.670200       0.768469     0.737274  0.608384            0   

     color_blue  color_clear  color_green  color_white  
0             0            1            0            0  
1             0            0     

(371, 9)

# Optimize labels

In [55]:
y.unique()
my_map = {'Ghoul': 1, 'Goblin': 2, 'Ghost': 3}
inv_map = {1: 'Ghoul', 2: 'Goblin', 3: 'Ghost'}
y = y.map(my_map)
print(y)


0      1
1      2
2      1
3      1
4      3
      ..
366    2
367    3
368    1
369    3
370    1
Name: type, Length: 371, dtype: int64


In [56]:
print(X)

     bone_length  rotting_flesh  hair_length  has_soul  color_blood  \
0       0.354512       0.350839     0.465761  0.781142            0   
1       0.575560       0.425868     0.531401  0.439899            0   
2       0.467875       0.354330     0.811616  0.791225            0   
3       0.776652       0.508723     0.636766  0.884464            0   
4       0.566117       0.875862     0.418594  0.636438            0   
..           ...            ...          ...       ...          ...   
366     0.458132       0.391760     0.660590  0.635689            0   
367     0.331936       0.564836     0.539216  0.551471            0   
368     0.481640       0.501147     0.496446  0.544003            0   
369     0.294943       0.771286     0.583503  0.300618            0   
370     0.670200       0.768469     0.737274  0.608384            0   

     color_blue  color_clear  color_green  color_white  
0             0            1            0            0  
1             0            0     

In [57]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [58]:
from sklearn.preprocessing import normalize
X_train_norm=normalize(X_train)
X_test_norm=normalize(X_test)

# GradientBoostingClassifier

In [59]:
from sklearn.ensemble import GradientBoostingClassifier



clf = GradientBoostingClassifier(learning_rate=0.1,
                                 n_estimators=700,
                                 max_depth=2)

clf.fit(X_train_norm, y_train)
print("RF Accuracy: " + repr(round(clf.score(X_test_norm, y_test) * 100, 2)) + "%")


RF Accuracy: 66.67%


# RandomForestClassifier

In [60]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(criterion='entropy',
                             n_estimators=700,
                             min_samples_split=5,
                             min_samples_leaf=1,
                             max_features = "auto",
                             oob_score=True,
                             random_state=0,
                             n_jobs=-1)

clf.fit(X_train_norm, y_train)
print("RF Accuracy: " + repr(round(clf.score(X_test_norm, y_test) * 100, 2)) + "%")

RF Accuracy: 69.33%


# MLPClassifier

In [61]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
clf=MLPClassifier(solver='adam',hidden_layer_sizes=350, alpha=1e-04, max_iter =120000)
clf.fit(X_train_norm,y_train)

preds=pd.Series(clf.predict(X_test_norm))
print(accuracy_score(y_test,preds))

0.64


# LogisticRegression [BEST]

In [62]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1000000, max_iter=120000)
clf.fit(X_train_norm,y_train)
preds=pd.Series(clf.predict(X_test_norm))
print(accuracy_score(y_test,preds))

0.68


# KNeighborsClassifier

In [63]:
from sklearn.neighbors import KNeighborsClassifier

clf= KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train_norm,y_train)
preds=pd.Series(clf.predict(X_test_norm))
print(accuracy_score(y_test,preds))


0.68


In [64]:
X_pred=test_data
print(X_pred)

     bone_length  rotting_flesh  hair_length  has_soul  color_blood  \
0       0.471774       0.387937     0.706087  0.698537            0   
1       0.427332       0.645024     0.565558  0.451462            0   
2       0.549602       0.491931     0.660387  0.449809            0   
3       0.638095       0.682867     0.471409  0.356924            0   
4       0.361762       0.583997     0.377256  0.276364            0   
..           ...            ...          ...       ...          ...   
524     0.377573       0.390158     0.696465  0.355373            0   
525     0.229161       0.601265     0.191282  0.475115            0   
526     0.510497       0.498347     0.708020  0.714154            0   
527     0.331472       0.765835     0.338207  0.193431            0   
528     0.256789       0.691802     0.415197  0.348971            0   

     color_blue  color_clear  color_green  color_white  
0             0            0            0            0  
1             0            0     

In [65]:
from sklearn.preprocessing import normalize

result = pd.Series(clf.predict(normalize(X_pred)), name='type')
result = result.map(inv_map)
result = pd.concat([test_id,result], axis=1)
df=pd.DataFrame(result)
df.index+=1
print(result.shape)
filename = 'Prediction.csv'
df.to_csv(filename,index=False)
print('Saved file: ' + filename)

(529, 2)
Saved file: Prediction.csv
