In [1]:
import cStringIO
import numpy as np
import os
from PIL import Image
from sklearn import cross_validation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.pipeline import make_pipeline

IMAGE_SIZE = 128, 128
IMAGE_ARRAY_SIZE = IMAGE_SIZE[0] * IMAGE_SIZE[1]

def ela_from_image(file_path, quality=95):
    source_image = Image.open(file_path)
    string_io = cStringIO.StringIO()
    source_image = source_image.resize(IMAGE_SIZE)
    source_image.save(string_io, 'JPEG', quality=quality)
    output_image_io = Image.open(string_io)
    output_image = Image.new('RGB', source_image.size)
    output_image.putdata(map(generator.calculate,
                             source_image.getdata(),
                             output_image_io.getdata()))
    output_image = output_image.convert('L')
    return np.r_[output_image].reshape(-1)

class ELAGenerator():
    def __init__(self, trigger=10, enhance=20, coloronly=False):
        self.trigger = trigger
        self.enhance = enhance
        self.coloronly = coloronly

    def calculate(self, pixelA, pixelB):
        pixelDiff = map(lambda x, y: abs(x - y), pixelA, pixelB)
        if sum(pixelDiff) > self.trigger and (not self.coloronly or pixelDiff[0] != pixelDiff[1] or pixelDiff[0] != pixelDiff[2]):
            return tuple([x * self.enhance for x in pixelDiff])
        else:
            return (0, 0, 0)



np.random.seed(0)
generator = ELAGenerator(trigger=1, enhance=50)

## Load images

In [2]:
def image_files_in_folder(folder):
    all_files = os.listdir(folder)
    path_list = []
    for file in all_files:
        if file.endswith('.jpg'):
            path_list.append('%s/%s' % (folder, file))
    return path_list



psed_file_names = image_files_in_folder('../data/psed-by-email')[:100]
nonpsed_file_names = image_files_in_folder('../data/non-psed')[:100]
X = np.concatenate((psed_file_names,
                    nonpsed_file_names))
y = np.concatenate((np.repeat(1, len(psed_file_names)),
                    np.repeat(0, len(nonpsed_file_names))))

X_train, X_test, y_train, y_test = \
    cross_validation.train_test_split(X, y, random_state=0, train_size=.8)

## Pipeline

In [3]:
ela_from_image(X_train[124])

array([ 76,  50,  44, ..., 134, 243, 225], dtype=uint8)

In [4]:
class ELATransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        ela_X = np.empty((len(X), IMAGE_ARRAY_SIZE))
        for index, file_name in enumerate(X):
            print(index)
            ela_X[index] = ela_from_image(file_name)
        return ela_X



model = make_pipeline(
    ELATransformer(),
    RandomForestClassifier(),
)
model.fit_transform(X_train, y_train)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148


IOError: cannot write mode P as JPEG

In [5]:
def plot_roc_curve(model, X, y):
    false_positive_rate, true_positive_rate, thresholds = \
        roc_curve(y, model.predict(X))
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate,
             true_positive_rate,
             'blue',
             label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

    print(false_positive_rate, true_positive_rate)

In [6]:
print(model.predict(X_test))
print(y_test)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


NotFittedError: Estimator not fitted, call `fit` before exploiting the model.

In [None]:
plot_roc_curve(model, X=X_test, y=y_test)