In [1]:
%matplotlib inline
from PIL import Image
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from collections import defaultdict

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.metrics import fbeta_score, precision_score, make_scorer, average_precision_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
import cv2
import warnings
import mahotas as mh

In [4]:
# Experiment settings: number of training images to use, and the side length
# (in pixels) every image is resized to before feature extraction.
n_samples, rescaled_dim = 5000, 50

# Training labels and the sample-submission template.
# NOTE(review): these paths live under /Users/mereltheisen while the image
# folders used later live under /Users/theisenm -- confirm both are intended.
train_csv_path = '/Users/mereltheisen/Kaggle/train.csv'
submission_csv_path = '/Users/mereltheisen/Dropbox/Kaggle/sample_submission.csv'
df = pd.read_csv(train_csv_path)
sample = pd.read_csv(submission_csv_path)

# Shared label encoder: fitted in set_output, then reused (via lb.classes_)
# to turn predicted indicator rows back into tag names.
lb = MultiLabelBinarizer()

0                                          [haze, primary]
1                     [agriculture, clear, primary, water]
2                                         [clear, primary]
3                                         [clear, primary]
4          [agriculture, clear, habitation, primary, road]
5                                   [haze, primary, water]
6        [agriculture, clear, cultivation, primary, water]
7                                          [haze, primary]
8               [agriculture, clear, cultivation, primary]
9         [agriculture, clear, cultivation, primary, road]
10        [agriculture, clear, primary, slash_burn, water]
11                                 [clear, primary, water]
12                                                [cloudy]
13                                        [clear, primary]
14                                                [cloudy]
15                                        [clear, primary]
16                                        [clear, primar

In [5]:
def set_output(df):
    """Build the multi-label target matrix from the space-separated ``tags`` column.

    Side effect: a ``split_tags`` column (one list of tag strings per row) is
    added to ``df``. The module-level ``lb`` binarizer is fitted on every row
    of ``df``; only the first ``n_samples`` rows of the encoded matrix are
    returned.
    """
    def _split_on_space(tag_string):
        return tag_string.split(" ")

    df['split_tags'] = df['tags'].map(_split_on_space)
    encoded = lb.fit_transform(df['split_tags'])
    return encoded[:n_samples]

def set_input(train_features, scaler=None):
    """Stack per-image feature arrays into a 2-D matrix and min-max scale it.

    Parameters
    ----------
    train_features : sequence of array-like
        One feature array per image (the cells in this notebook pass arrays
        of shape ``(1, n_features)``).
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When given, the features are transformed
        with it, which lets test/submission features share the scale learned
        on the training features. When omitted, a fresh ``MinMaxScaler`` is
        fitted on ``train_features`` itself.

    Returns
    -------
    numpy.ndarray
        Scaled matrix with one row per entry of ``train_features``.
    """
    stacked = np.array(train_features)
    # Flatten everything after the first axis. Unlike np.squeeze this keeps
    # the sample axis even when only a single image is passed.
    X = stacked.reshape(stacked.shape[0], -1)
    if scaler is None:
        return MinMaxScaler().fit_transform(X)
    return scaler.transform(X)

def calculate_f2(y_test, predicted):
    """Print and return F2 scores for a set of multi-label predictions.

    Prints the sample-averaged F2 score, then the per-tag F2 scores sorted
    from highest to lowest, each labelled with the matching entry of the
    module-level ``lb.classes_``.

    Parameters
    ----------
    y_test : array-like
        True label indicator matrix.
    predicted : array-like
        Predicted label indicator matrix, same layout as ``y_test``.

    Returns
    -------
    float
        The sample-averaged F2 score, so callers can reuse it (for example in
        an output file name) instead of only seeing it printed.
    """
    score = fbeta_score(y_test, predicted, beta=2, average=None)
    avg_sample_score = fbeta_score(y_test, predicted, beta=2, average='samples')
    print('Average F2 test score {}'.format(avg_sample_score))
    print('F2 test scores per tag:')
    # Single-argument print(...) behaves identically as a Python 2 statement
    # and as a Python 3 function call.
    print([(lb.classes_[tag_idx], score[tag_idx]) for tag_idx in score.argsort()[::-1]])
    return avg_sample_score

In [57]:
# FEATURE EXTRACTION

def _raw_pixel_row(jpg_path):
    """Read one image, resize it to rescaled_dim x rescaled_dim and flatten it to one row."""
    pixels = plt.imread(jpg_path)
    shrunk = cv2.resize(pixels, (rescaled_dim, rescaled_dim))
    return shrunk.reshape(1, -1)

# Raw features: one flattened pixel row per image, for the first n_samples
# training images and for every image listed in the submission template.
raw_features_train = [
    _raw_pixel_row('/Users/theisenm/Kaggle/train-jpg/{}.jpg'.format(name))
    for name in df.head(n_samples)['image_name'].values
]
raw_features_test = [
    _raw_pixel_row('/Users/theisenm/Kaggle/test-jpg/{}.jpg'.format(name))
    for name in sample['image_name'].values
]

In [45]:
# Haralick features
def _haralick_row(jpg_path):
    """Haralick descriptors of one resized image, flattened to a single row."""
    resized = cv2.resize(plt.imread(jpg_path), (rescaled_dim, rescaled_dim))
    descriptors = mh.features.haralick(
        resized,
        ignore_zeros=False,
        preserve_haralick_bug=False,
        compute_14th_feature=False)
    return descriptors.reshape(1, -1)

# One descriptor row per image for the first n_samples training images.
haralick_features_train = [
    _haralick_row('/Users/theisenm/Kaggle/train-jpg/{}.jpg'.format(name))
    for name in df.head(n_samples)['image_name'].values
]

#haralick_features_test = [mh.features.haralick(plt.imread('/Users/theisenm/Kaggle/test-jpg/{}.jpg'.format(name)), ignore_zeros=False, preserve_haralick_bug=False, compute_14th_feature=False).reshape(1, -1) for name in sample['image_name'].values]

In [58]:
# SET X's & Y's
y = set_output(df)

# Fit the min-max scaler on the training features only and reuse it for the
# submission features, so both matrices share the same per-feature scale.
# (Fitting a second scaler on the submission set would map the same pixel
# value to different scaled values in X and X_sub.)
feature_scaler = MinMaxScaler()
X = feature_scaler.fit_transform(np.squeeze(np.array(raw_features_train)))
X_sub = feature_scaler.transform(np.squeeze(np.array(raw_features_test)))
# Alternative: Haralick features instead of raw pixels.
#X = feature_scaler.fit_transform(np.squeeze(np.array(haralick_features_train)))
#X_sub = feature_scaler.transform(np.squeeze(np.array(haralick_features_test)))

print(X.shape, y.shape, lb.classes_)
print(X_sub.shape)

# Fixed seed so the hold-out split, and therefore the reported F2 scores,
# can be reproduced across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

((5000, 10000), (5000, 17), array(['agriculture', 'artisinal_mine', 'bare_ground', 'blooming',
       'blow_down', 'clear', 'cloudy', 'conventional_mine', 'cultivation',
       'habitation', 'haze', 'partly_cloudy', 'primary', 'road',
       'selective_logging', 'slash_burn', 'water'], dtype=object))
(40669, 10000)


In [60]:
# LOGISTIC REGRESSION (one-vs-rest)

clf = OneVsRestClassifier(LogisticRegression(C=10, penalty='l2'))
with warnings.catch_warnings():
    # Warnings raised while fitting are deliberately silenced for this call only.
    warnings.simplefilter('ignore')
    clf.fit(X_train, y_train)

# Calculate F2 score
calculate_f2(y_test, clf.predict(X_test))

# Classify test set & prepare submission
y_sub = clf.predict(X_sub)

# One space-separated tag string per submission row: keep the class names
# whose indicator in that row equals 1.
all_test_tags = [' '.join(lb.classes_[row == 1]) for row in y_sub]

sample['tags'] = all_test_tags
print(sample.head())

Average F2 test score 0.674446374977
F2 test scores per tag:
[('primary', 0.94967007374822088), ('clear', 0.84690839026099141), ('cloudy', 0.60869565217391297), ('agriculture', 0.34178847807394663), ('road', 0.25499655410062028), ('partly_cloudy', 0.22959183673469385), ('water', 0.2209550962223806), ('bare_ground', 0.15432098765432098), ('habitation', 0.13066202090592333), ('cultivation', 0.074324324324324328), ('haze', 0.023771790808240881), ('slash_burn', 0.0), ('conventional_mine', 0.0), ('selective_logging', 0.0), ('blow_down', 0.0), ('blooming', 0.0), ('artisinal_mine', 0.0)]
  image_name                                   tags
0     test_0                          clear primary
1     test_1                          clear primary
2     test_2  habitation partly_cloudy primary road
3     test_3                     clear haze primary
4     test_4                habitation primary road


In [None]:
# Save output to CSV
# avg_sample_score only exists as a local inside calculate_f2, so the
# sample-averaged F2 of the logistic-regression model is recomputed here to
# build the file name.
avg_sample_score = fbeta_score(y_test, clf.predict(X_test), beta=2, average='samples')
sample.to_csv('ovr_f2_{}.csv'.format(avg_sample_score), index=False)

In [61]:
# MLP - SGD

# Three hidden layers (20, 50, 20), trained with stochastic gradient descent;
# fit() returns the estimator itself, so construction and training are chained.
mlp = MLPClassifier(
    solver='sgd',
    alpha=1e-3,
    hidden_layer_sizes=(20, 50, 20),
    random_state=1,
    max_iter=1000).fit(X_train, y_train)

# Calculate F2 score
calculate_f2(y_test, mlp.predict(X_test))

# Classify test set & prepare submission (currently disabled)
#y_sub = mlp.predict(X_sub)

#all_test_tags = []
#for index in range(y_sub.shape[0]):
#    all_test_tags.append(' '.join(list(lb.classes_[np.where(y_sub[index, :] == 1)[0]])))

#sample['tags'] = all_test_tags
#print sample.head()

Average F2 test score 0.67907946449
F2 test scores per tag:
[('primary', 0.98829069619447607), ('clear', 0.91761265909872725), ('agriculture', 0.40730955526199908), ('road', 0.27896995708154509), ('cloudy', 0.19117647058823528), ('artisinal_mine', 0.0), ('bare_ground', 0.0), ('blooming', 0.0), ('blow_down', 0.0), ('water', 0.0), ('conventional_mine', 0.0), ('slash_burn', 0.0), ('habitation', 0.0), ('haze', 0.0), ('partly_cloudy', 0.0), ('selective_logging', 0.0), ('cultivation', 0.0)]


In [None]:
# Save output to CSV
# avg_sample_score only exists as a local inside calculate_f2, so the
# sample-averaged F2 of the SGD-trained MLP is recomputed here to build the
# file name.
avg_sample_score = fbeta_score(y_test, mlp.predict(X_test), beta=2, average='samples')
# NOTE(review): sample['tags'] is only rewritten by cells that assign it, and
# the MLP cell's submission code is commented out -- confirm the column holds
# MLP predictions before uploading this file.
sample.to_csv('mlp_f2_{}.csv'.format(avg_sample_score), index=False)

In [49]:
# MLP - lbfgs

# Same architecture as the SGD model, but optimised with L-BFGS;
# fit() returns the estimator itself, so construction and training are chained.
mlpa = MLPClassifier(
    solver='lbfgs',
    alpha=1e-3,
    hidden_layer_sizes=(20, 50, 20),
    random_state=1,
    max_iter=1000).fit(X_train, y_train)

# Calculate F2 score
calculate_f2(y_test, mlpa.predict(X_test))

# Classify test set & prepare submission (currently disabled)
#y_sub = mlpa.predict(X_sub)

#all_test_tags = []
#for index in range(y_sub.shape[0]):
#    all_test_tags.append(' '.join(list(lb.classes_[np.where(y_sub[index, :] == 1)[0]])))

#sample['tags'] = all_test_tags
#print sample.head()

Average F2 test score 0.81684646498
F2 test scores per tag:
[('primary', 0.97987980141102715), ('clear', 0.94286219706350605), ('agriculture', 0.77613104524180965), ('cloudy', 0.75045207956600357), ('partly_cloudy', 0.67383512544802859), ('road', 0.61323618700667881), ('haze', 0.53542009884678754), ('water', 0.36098310291858676), ('habitation', 0.25297619047619047), ('cultivation', 0.015128593040847202), ('slash_burn', 0.0), ('conventional_mine', 0.0), ('selective_logging', 0.0), ('blow_down', 0.0), ('blooming', 0.0), ('bare_ground', 0.0), ('artisinal_mine', 0.0)]


In [38]:
# ENSEMBLE
def _majority_vote(features):
    """Return a 0/1 matrix marking the tags predicted by at least two of clf, mlp and mlpa."""
    votes = clf.predict(features) + mlp.predict(features) + mlpa.predict(features)
    # Comparing the vote count explicitly gives the same 0/1 result as integer
    # division by 2, without depending on Python 2 "/" semantics (true
    # division would yield 0.5 / 1.5 and break the "== 1" decoding below).
    return (votes >= 2).astype(int)

ensemble = _majority_vote(X_sub)
ensemble2 = _majority_vote(X_test)
calculate_f2(y_test, ensemble2)

# One space-separated tag string per submission row.
all_test_tags = [' '.join(lb.classes_[row == 1]) for row in ensemble]

sample['tags'] = all_test_tags
print(sample.head())

Average F2 test score 0.729821284845
F2 test scores per tag:
[('primary', 0.98483879222108495), ('clear', 0.91188262390173158), ('agriculture', 0.52162328345972531), ('cloudy', 0.51045510455104559), ('road', 0.3513606613847744), ('partly_cloudy', 0.34575489819439115), ('haze', 0.27027027027027029), ('habitation', 0.027910685805422646), ('water', 0.016835016835016835), ('conventional_mine', 0.0), ('slash_burn', 0.0), ('selective_logging', 0.0), ('blow_down', 0.0), ('blooming', 0.0), ('bare_ground', 0.0), ('artisinal_mine', 0.0), ('cultivation', 0.0)]
  image_name                                    tags
0     test_0                           clear primary
1     test_1                           clear primary
2     test_2                                 primary
3     test_3                           clear primary
4     test_4  agriculture partly_cloudy primary road


In [50]:
# Majority vote of the three models on the hold-out split: a tag is kept when
# at least two models predict it. The explicit comparison gives the same 0/1
# matrix as integer division by 2 without relying on Python 2 "/" semantics.
holdout_votes = clf.predict(X_test) + mlp.predict(X_test) + mlpa.predict(X_test)
ensemble2 = (holdout_votes >= 2).astype(int)
calculate_f2(y_test, ensemble2)

Average F2 test score 0.788693823641
F2 test scores per tag:
[('primary', 0.98671355998436883), ('clear', 0.93002812939521806), ('agriculture', 0.74329652996845419), ('cloudy', 0.67717996289424853), ('road', 0.56768558951965065), ('partly_cloudy', 0.47799696509863426), ('haze', 0.44444444444444448), ('water', 0.22004889975550121), ('habitation', 0.21771771771771772), ('slash_burn', 0.0), ('conventional_mine', 0.0), ('selective_logging', 0.0), ('blow_down', 0.0), ('blooming', 0.0), ('bare_ground', 0.0), ('artisinal_mine', 0.0), ('cultivation', 0.0)]
