In [15]:
%matplotlib inline
from PIL import Image
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from collections import defaultdict

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.metrics import fbeta_score, precision_score, make_scorer, average_precision_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
import cv2
import warnings
import mahotas as mh

In [17]:
n_samples = 5000
rescaled_dim = 32

path = '/Users/theisenm'
df = pd.read_csv(path + '/Dropbox/Kaggle/train_v2.csv')
sample = pd.read_csv(path + '/Dropbox/Kaggle/sample_submission_v2.csv')

lb = MultiLabelBinarizer()

In [18]:
def set_output(df):
    df['split_tags'] = df['tags'].map(lambda row: row.split(" "))
    y = lb.fit_transform(df['split_tags'])
    y = y[:n_samples]
    return y

def set_input(train_features):
    X = np.squeeze(np.array(train_features))
    X = MinMaxScaler().fit_transform(X)
    return X

def calculate_f2(y_test, predicted):
    score = fbeta_score(y_test, predicted, beta=2, average=None)
    avg_sample_score = fbeta_score(y_test, predicted, beta=2, average='samples')
    print('Average F2 test score {}'.format(avg_sample_score))
    print('F2 test scores per tag:')
    print [(lb.classes_[l], score[l]) for l in score.argsort()[::-1]]

In [19]:
# FEATURE EXTRACTION

# Raw features
raw_features_train = [cv2.resize(plt.imread(path + '/Kaggle/train-jpg/{}.jpg'.format(name)), (rescaled_dim, rescaled_dim)).reshape(1, -1) for name in df.head(n_samples)['image_name'].values]
raw_features_test = [cv2.resize(plt.imread(path + '/Kaggle/test-jpg/{}.jpg'.format(name)), (rescaled_dim, rescaled_dim)).reshape(1, -1) for name in sample['image_name'].values]

In [45]:
# Haralick features
haralick_features_train = [mh.features.haralick(
                cv2.resize(plt.imread('/Users/theisenm/Kaggle/train-jpg/{}.jpg'.format(name)),(rescaled_dim, rescaled_dim)),
                ignore_zeros=False, 
                preserve_haralick_bug=False, 
                compute_14th_feature=False)
         .reshape(1, -1) for name in df.head(n_samples)['image_name'].values]

#haralick_features_test = [mh.features.haralick(plt.imread('/Users/theisenm/Kaggle/test-jpg/{}.jpg'.format(name)), ignore_zeros=False, preserve_haralick_bug=False, compute_14th_feature=False).reshape(1, -1) for name in sample['image_name'].values]

In [20]:
# SET X's & Y's
y = set_output(df)
X = set_input(raw_features_train)
X_sub = set_input(raw_features_test)
#X = set_input(haralick_features_train)
#X_sub = set_input(haralick_features_test)

print(X.shape, y.shape, lb.classes_)
print X_sub.shape

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)


Data with input dtype uint8 was converted to float64 by MinMaxScaler.



((5000, 4096), (5000, 17), array(['agriculture', 'artisinal_mine', 'bare_ground', 'blooming',
       'blow_down', 'clear', 'cloudy', 'conventional_mine', 'cultivation',
       'habitation', 'haze', 'partly_cloudy', 'primary', 'road',
       'selective_logging', 'slash_burn', 'water'], dtype=object))
(61191, 4096)


In [21]:
# LINEAR REGRESSION

clf = OneVsRestClassifier(LogisticRegression(C=10, penalty='l2'))
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    clf.fit(X_train, y_train)

# Calculate F2 score    
calculate_f2(y_test, clf.predict(X_test))

# Classify test set & prepare submission
y_sub = clf.predict(X_sub)

all_test_tags = []
for index in range(y_sub.shape[0]):
    all_test_tags.append(' '.join(list(lb.classes_[np.where(y_sub[index, :] == 1)[0]])))

sample['tags'] = all_test_tags
print sample.head()


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.


F-score is ill-defined and being set to 0.0 in labels with no true samples.


F-score is ill-defined and being set to 0.0 in samples with no predicted labels.



Average F2 test score 0.670601792821
F2 test scores per tag:
[('primary', 0.95782420181316508), ('clear', 0.8552052140214903), ('cloudy', 0.61475409836065564), ('agriculture', 0.36767760697964275), ('road', 0.26181353767560667), ('partly_cloudy', 0.22889842632331903), ('water', 0.2014652014652015), ('habitation', 0.17492711370262393), ('artisinal_mine', 0.11627906976744186), ('cultivation', 0.081453634085213042), ('bare_ground', 0.033112582781456956), ('slash_burn', 0.0), ('conventional_mine', 0.0), ('haze', 0.0), ('selective_logging', 0.0), ('blow_down', 0.0), ('blooming', 0.0)]
  image_name                      tags
0     test_0                   primary
1     test_1     partly_cloudy primary
2     test_2  clear haze primary water
3     test_3             clear primary
4     test_4  bare_ground cloudy water


In [24]:
# Save output to CSV
sample.to_csv('ovr_f2_0.67.csv', index=False)

In [25]:
# MLP - SGD

mlp = MLPClassifier(solver='sgd', alpha=1e-3, hidden_layer_sizes=(20, 50, 20), random_state=1, max_iter=1000)
mlp.fit(X_train, y_train)

# Calculate F2 score 
calculate_f2(y_test, mlp.predict(X_test))

# Classify test set & prepare submission
y_sub = mlp.predict(X_sub)

all_test_tags = []
for index in range(y_sub.shape[0]):
    all_test_tags.append(' '.join(list(lb.classes_[np.where(y_sub[index, :] == 1)[0]])))

sample['tags'] = all_test_tags
print sample.head()

Average F2 test score 0.674997173453
F2 test scores per tag:
[('primary', 0.98335067637877205), ('clear', 0.91012018812053641), ('agriculture', 0.42238421955403088), ('cloudy', 0.41079812206572769), ('road', 0.22237196765498651), ('partly_cloudy', 0.044247787610619461), ('artisinal_mine', 0.0), ('bare_ground', 0.0), ('blooming', 0.0), ('blow_down', 0.0), ('water', 0.0), ('conventional_mine', 0.0), ('slash_burn', 0.0), ('habitation', 0.0), ('haze', 0.0), ('selective_logging', 0.0), ('cultivation', 0.0)]
  image_name           tags
0     test_0  clear primary
1     test_1  clear primary
2     test_2        primary
3     test_3  clear primary
4     test_4        primary


In [26]:
# Save output to CSV
sample.to_csv('mlp_f2_0.675.csv', index=False)

In [27]:
# MLP - lbfgs

mlpa = MLPClassifier(solver='lbfgs', alpha=1e-3, hidden_layer_sizes=(20, 50, 20), random_state=1, max_iter=1000)
mlpa.fit(X_train, y_train)

# Calculate F2 score 
calculate_f2(y_test, mlpa.predict(X_test))

# Classify test set & prepare submission
y_sub = mlpa.predict(X_sub)

all_test_tags = []
for index in range(y_sub.shape[0]):
    all_test_tags.append(' '.join(list(lb.classes_[np.where(y_sub[index, :] == 1)[0]])))

sample['tags'] = all_test_tags
print sample.head()

Average F2 test score 0.68390101209
F2 test scores per tag:
[('primary', 0.96037785358173722), ('clear', 0.81992131616595143), ('cloudy', 0.54766734279918872), ('haze', 0.43513957307060758), ('agriculture', 0.41183774834437081), ('partly_cloudy', 0.37812288993923027), ('road', 0.29374999999999996), ('habitation', 0.14471780028943562), ('cultivation', 0.05992010652463383), ('water', 0.016778523489932883), ('conventional_mine', 0.0), ('slash_burn', 0.0), ('selective_logging', 0.0), ('blow_down', 0.0), ('blooming', 0.0), ('bare_ground', 0.0), ('artisinal_mine', 0.0)]
  image_name                                       tags
0     test_0                      partly_cloudy primary
1     test_1                              clear primary
2     test_2  agriculture clear habitation primary road
3     test_3                              clear primary
4     test_4             agriculture clear primary road


In [28]:
# ENSEMBLE
ensemble = (clf.predict(X_sub) + mlp.predict(X_sub) + mlpa.predict(X_sub)) / 2
ensemble2 = (clf.predict(X_test) + mlp.predict(X_test) + mlpa.predict(X_test)) / 2
calculate_f2(y_test, ensemble2)

all_test_tags = []
for index in range(ensemble.shape[0]):
    all_test_tags.append(' '.join(list(lb.classes_[np.where(ensemble[index, :] == 1)[0]])))

sample['tags'] = all_test_tags
print sample.head()

Average F2 test score 0.676024611776
F2 test scores per tag:
[('primary', 0.97320261437908506), ('clear', 0.87858021437357237), ('cloudy', 0.57483731019522777), ('agriculture', 0.3963153384747215), ('road', 0.21433355659745482), ('partly_cloudy', 0.16399694889397406), ('habitation', 0.039808917197452227), ('cultivation', 0.0070721357850070726), ('water', 0.0042122999157540014), ('slash_burn', 0.0), ('conventional_mine', 0.0), ('haze', 0.0), ('selective_logging', 0.0), ('blow_down', 0.0), ('blooming', 0.0), ('bare_ground', 0.0), ('artisinal_mine', 0.0)]
  image_name           tags
0     test_0        primary
1     test_1  clear primary
2     test_2  clear primary
3     test_3  clear primary
4     test_4        primary


In [50]:
ensemble2 = (clf.predict(X_test) + mlp.predict(X_test) + mlpa.predict(X_test)) / 2
calculate_f2(y_test, ensemble2)

Average F2 test score 0.788693823641
F2 test scores per tag:
[('primary', 0.98671355998436883), ('clear', 0.93002812939521806), ('agriculture', 0.74329652996845419), ('cloudy', 0.67717996289424853), ('road', 0.56768558951965065), ('partly_cloudy', 0.47799696509863426), ('haze', 0.44444444444444448), ('water', 0.22004889975550121), ('habitation', 0.21771771771771772), ('slash_burn', 0.0), ('conventional_mine', 0.0), ('selective_logging', 0.0), ('blow_down', 0.0), ('blooming', 0.0), ('bare_ground', 0.0), ('artisinal_mine', 0.0), ('cultivation', 0.0)]
