In [2]:
import re

import pandas as pd

%matplotlib inline

# Load a common image analysis dataset

In [47]:
# index_col=0 promotes the first CSV column to the row index of each frame.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("image_train.csv", "image_test.csv")
)

## Convert the ~turi-create~ dataset into a more applicable dataset

In [144]:
def extract_features(dataframe: pd.DataFrame, column: str) -> pd.DataFrame:
    """Expand a column of bracketed, whitespace-separated numbers into float columns.

    Each cell of ``column`` must be a string containing a ``[...]`` span on a
    single line (e.g. ``"[73 77 58]"``). The text between the first ``[`` and
    the last ``]`` is split on whitespace and every token is parsed as a float.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame holding the serialized vectors.
    column : str
        Name of the column to expand.

    Returns
    -------
    pd.DataFrame
        A frame with the same index as ``dataframe`` and one column per vector
        position, named ``f"{column}{position}"``.

    Raises
    ------
    ValueError
        If a cell is not a string or contains no ``[...]`` span, or if one of
        its tokens cannot be parsed as a float.
    """
    bracketed = re.compile(r"\[(.*)\]")
    parsed_rows = []
    for row_label, cell in dataframe.loc[:, column].items():
        # Guard both failure modes explicitly: a non-string cell would make
        # the regex search raise TypeError, and a string without brackets
        # would yield None and an unhelpful AttributeError on `.group`.
        match = bracketed.search(cell) if isinstance(cell, str) else None
        if match is None:
            raise ValueError(
                f"column {column!r}, row {row_label!r}: "
                "expected a string containing a '[...]' vector"
            )
        parsed_rows.append([float(token) for token in match.group(1).split()])
    # add_prefix turns the positional labels 0, 1, 2, ... into "<column>0", ...
    return pd.DataFrame(parsed_rows, index=dataframe.index).add_prefix(column)


In [163]:
# fix the train dataset
# Expand each serialized vector column into one float column per element.
image_array_df, deep_features_df = (
    extract_features(train, vector_col)
    for vector_col in ('image_array', 'deep_features')
)

# Swap the two string columns for their expanded counterparts
# (remaining columns first, then pixels, then deep features).
train = pd.concat(
    [
        train.drop(columns=['deep_features', 'image_array']),
        image_array_df,
        deep_features_df,
    ],
    axis=1,
)

In [164]:
# fix the test dataset
# Expand each serialized vector column into one float column per element.
image_array_df, deep_features_df = (
    extract_features(test, vector_col)
    for vector_col in ('image_array', 'deep_features')
)

# Swap the two string columns for their expanded counterparts
# (remaining columns first, then pixels, then deep features).
test = pd.concat(
    [
        test.drop(columns=['deep_features', 'image_array']),
        image_array_df,
        deep_features_df,
    ],
    axis=1,
)

In [225]:
# convert label to category
# Cast the class label in both splits to pandas' categorical dtype.
train['label'] = train['label'].astype('category')
test['label'] = test['label'].astype('category')

In [167]:
# Peek at the first five rows of the reshaped training frame.
train.head(n=5)

Unnamed: 0_level_0,image,label,image_array0,image_array1,image_array2,image_array3,image_array4,image_array5,image_array6,image_array7,...,deep_features4086,deep_features4087,deep_features4088,deep_features4089,deep_features4090,deep_features4091,deep_features4092,deep_features4093,deep_features4094,deep_features4095
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24,Height: 32 Width: 32,bird,73.0,77.0,58.0,71.0,68.0,50.0,77.0,69.0,...,0.0,0.0,0.421911,0.90445,1.10052,0.0,1.31538,0.0,0.0,0.0
33,Height: 32 Width: 32,cat,7.0,5.0,8.0,7.0,5.0,8.0,5.0,4.0,...,1.14279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.50845
36,Height: 32 Width: 32,cat,169.0,122.0,65.0,131.0,108.0,75.0,193.0,196.0,...,0.0,0.445423,0.0,1.38121,0.0,0.0,0.0,0.0,0.0,0.0
70,Height: 32 Width: 32,dog,154.0,179.0,152.0,159.0,183.0,157.0,165.0,189.0,...,0.188081,0.0,0.0,2.0697,0.0,0.0,1.6333,0.0,0.0,0.0
90,Height: 32 Width: 32,bird,216.0,195.0,180.0,201.0,178.0,160.0,210.0,184.0,...,1.44738,0.198865,0.0,1.67262,0.0,0.0,0.0,0.0,0.0,0.0


In [227]:
# Raw-pixel predictors: every column whose name starts with 'image_array'.
simple_feature_cols = [name for name in train.columns if name.startswith('image_array')]
X = train.loc[:, simple_feature_cols]
y = train['label']

# Exploring the image data

# Train a classifier on the raw image pixels

In [238]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the raw pixel columns; lbfgs is capped at 500 iterations.
raw_pixel_model = LogisticRegression(max_iter=500, solver='lbfgs')

raw_pixel_model.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=500)

# Make a prediction with the simple model based on raw pixels

In [239]:
# True labels of the first three test images.
test['label'].head(3)

id
0           cat
6    automobile
8           cat
Name: label, dtype: category
Categories (4, object): ['automobile', 'bird', 'cat', 'dog']

In [240]:
# Predict the first three test images with the raw-pixel model fitted above.
# The previous name `logreg` is never defined in this notebook, so the cell
# raised NameError on a fresh kernel.
raw_pixel_model.predict(test.head(3)[simple_feature_cols])

array(['dog', 'cat', 'bird'], dtype=object)

# Evaluating raw pixel model on test data

In [241]:
from sklearn.metrics import accuracy_score

# Predict the whole test split with the raw-pixel model (`raw_pixel_model`).
# `logreg` is never defined in this notebook and fails on a fresh kernel.
y_pred = raw_pixel_model.predict(test[simple_feature_cols])

In [242]:
# Share of test images whose predicted label matches the true label.
accuracy_score(test['label'], y_pred)

0.42525

# Can we improve the model using deep features?

In [243]:
# Deep-feature predictors: every column whose name starts with 'deep_features'.
deep_learning_features = [name for name in train.columns if name.startswith('deep_features')]

# Rebind X to the deep features; the target column is unchanged.
X = train.loc[:, deep_learning_features]
y = train['label']

# Given the deep features, let's train a classifier

In [247]:
# Same estimator settings as the raw-pixel model, now fitted on the deep features.
deep_learning_model = LogisticRegression(max_iter=500, solver="lbfgs")

deep_learning_model.fit(X, y)

## Apply the deep features model to the first few images of the test data

In [253]:
# True labels of the first three test rows (positional slice).
test.iloc[:3]['label']

id
0           cat
6    automobile
8           cat
Name: label, dtype: category
Categories (4, object): ['automobile', 'bird', 'cat', 'dog']

In [254]:
# Deep-feature model predictions for the same first three test rows.
deep_learning_model.predict(test.iloc[:3][deep_learning_features])

array(['cat', 'automobile', 'cat'], dtype=object)

## Compute test_data accuracy of deep_features_model

In [256]:
# Predict every test image from its deep-feature columns.
y_pred = deep_learning_model.predict(test.loc[:, deep_learning_features])

In [257]:
# Share of test images the deep-feature model labels correctly.
accuracy_score(test['label'], y_pred)

0.79875