In [1]:
import numpy
import matplotlib.pyplot as plt
import cv2
import pandas
import os
import random
import json
from PIL import Image
import requests
from tqdm.notebook import tqdm
# keras lib
from tensorflow import keras
from keras import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, LSTM
import tensorflow
# Sanity check: list the GPU devices TensorFlow can see (shown as the cell output).
tensorflow.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

# prepare data

In [2]:
def get_image_from_url(url, timeout=30):
    """Download the image at ``url`` and return it as a PIL image.

    Args:
        url: HTTP(S) address of the image file.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The ``PIL.Image.Image`` opened from the downloaded bytes.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status (4xx/5xx).
    """
    from io import BytesIO  # local import: only this helper needs it

    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of handing HTML to the image decoder.
    response.raise_for_status()
    # `response.content` is fully downloaded and transfer-decoded, so the
    # connection is released and PIL reads from an in-memory buffer.
    im = Image.open(BytesIO(response.content))
    return im

# Read the raw image metadata; the file is only needed for the JSON parse,
# so the handle is released before any further processing.
with open('devset_images_metadata.json') as file:
    metadata = json.load(file)

data = pandas.DataFrame(metadata['images'])
label = pandas.read_csv('devset_images_gt.csv')
# Make image_id numeric so it can be matched against `id` from the label file.
data['image_id'] = pandas.to_numeric(data['image_id'])
data = pandas.merge(data, label, how='inner', left_on=['image_id'], right_on=['id'])
data = data[['id','label','description','user_tags','title','image_url','image_extension_original']]
# fillna returns a new frame, so the result must be assigned for the '-1'
# placeholder to actually replace missing values.
data = data.fillna('-1')
data.head()

Unnamed: 0,id,label,description,user_tags,title,image_url,image_extension_original
0,3519864665,0,,"[2009 road trip, obrero road trip]",Biltmore Estate,http://www.flickr.com/photos/95156977@N00/3519...,jpg
1,4896119055,0,,"[daulatabad, daulatabad fort, ellora, road trip]",Chand Minar,http://www.flickr.com/photos/24574470@N00/4896...,jpg
2,3468473862,0,"After the flood, the boarded up stores bear up...","[cedarrapids, createsouthroadtrip2009, disaste...",Uplifting Graffiti,http://www.flickr.com/photos/73451168@N00/3468...,jpg
3,4120853942,0,,"[cork, enchente, flood, ireland, irlanda]",DSCF6487,http://www.flickr.com/photos/12947023@N00/4120...,jpg
4,4436083254,0,,"[athens georgia, brown, current, flood, mud, r...",Oconoe river - flooded,http://www.flickr.com/photos/60704492@N00/4436...,jpg


In [3]:
# Pull the columns used by the following cells out of `data` as individual Series.
img_id = data['id']
des = data['description']
tag = data['user_tags']
title = data['title']
label = data['label']

In [4]:
# Convert text to a word-frequency list.
def vector(txt : str):
    """Count the words of ``txt``.

    The text is lowercased, '.' and ',' are removed, and the remainder is
    split on whitespace.

    Args:
        txt: Text to analyse. Any non-string value (for example a NaN that
            stands for a missing description) yields an empty list.

    Returns:
        A list of ``[word, count]`` pairs sorted by count, highest first.
        Words with equal counts keep the order of their first appearance,
        so the result is deterministic.
    """
    if not isinstance(txt, str):
        return []
    words = txt.lower().replace('.', '').replace(',', '').split()
    # One pass over the words; dicts preserve insertion order.
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    pairs = [[word, count] for word, count in counts.items()]
    # list.sort is stable (also with reverse=True), which keeps tie order fixed.
    pairs.sort(key = lambda pair: pair[1], reverse = True)
    return pairs

In [5]:
# Total word counts over the descriptions of the rows whose label equals 1.
# A dict gives constant-time lookups (the list-based `in` / `.index` scan was
# quadratic in the vocabulary size) and preserves first-seen order.
word_totals = {}
print('vectorize data:')
for i in tqdm(range(len(des))):
    if label[i] != 1:
        continue
    for word, count in vector(des[i]):
        word_totals[word] = word_totals.get(word, 0) + count
print('preparing data!')
# Parallel lists kept under their original names, in first-seen order.
big_mat_words = list(word_totals)
big_mat_num = list(word_totals.values())
big_mat_check = [[word, count] for word, count in word_totals.items()]
big_mat_check.sort(key = lambda x: x[1], reverse = True)
print('complete')
len(big_mat_check)

vectorize data:


  0%|          | 0/5280 [00:00<?, ?it/s]

preparing data!
complete


6457

In [6]:
# Keyword vocabulary for the count features: one feature column per keyword.
keywords = 'flood flooding water floods flooded river wet rain rains'.split()
keywords

['flood',
 'flooding',
 'water',
 'floods',
 'flooded',
 'river',
 'wet',
 'rain',
 'rains']

In [7]:
def encode(text, keywords):
    """Count how often each keyword occurs as a whitespace-separated token of ``text``.

    Matching is exact and case-sensitive: no lowercasing or punctuation
    stripping is applied, so 'Flood' or 'flood.' do not match 'flood'.

    Args:
        text: String to scan. Any non-string value (NaN, list, None, ...)
            produces an all-zero vector.
        keywords: Sequence of keyword strings; fixes the length and order of
            the result.

    Returns:
        A 1-D numpy array holding one count per keyword, in keyword order.
    """
    if not isinstance(text, str):
        # Explicit guard instead of a blanket `except Exception`, so genuine
        # errors are no longer hidden.
        # NOTE(review): list-valued fields (user_tags is displayed as a list in
        # the merged frame) end up here and contribute nothing - confirm that
        # this is intended.
        return numpy.array([0 for _ in keywords])
    tokens = text.split()
    return numpy.array([tokens.count(keyword) for keyword in keywords])

In [8]:
# Build one keyword-count row per sample from its description, tags and title.
big_vector = []
print('create big vector:')
# Each row is the summed keyword counts multiplied by len(keywords); the
# encodings are computed once per sample instead of once per keyword.
# NOTE(review): the multiplication by len(keywords) looks unintentional (it is
# why the displayed rows contain 9s), but a saved text model may have been
# trained on features scaled this way - confirm before removing it.
scale = len(keywords)
for i in tqdm(range(len(des))):
    counts = encode(des[i], keywords) + encode(tag[i], keywords) + encode(title[i], keywords)
    big_vector.append(scale * counts)
print('preparing')
big_vector = numpy.array(big_vector)
print('complete')
pandas.DataFrame(big_vector).head()

create big vector:


  0%|          | 0/5280 [00:00<?, ?it/s]

preparing
complete


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0
4,0,0,0,0,9,9,0,0,0


In [9]:
# Convert the label column into a numpy array; the bare name below displays it
# as the cell output.
label = numpy.array(label)
label

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [14]:
# Load the test metadata. fillna returns a new frame, so it is chained onto the
# read for the '-1' placeholder to actually replace missing values.
test_data = pandas.read_csv('test.csv').fillna('-1')
test_tt = test_data['title']
test_des = test_data['description']
test_tag = test_data['user_tags']
test_id = test_data['image_id']

In [15]:
# Build the keyword-count feature matrix X for the test set, one row per sample.
big_vector = []
print('create big vector:')
# Each row is the summed keyword counts multiplied by len(keywords); the
# encodings are computed once per sample instead of once per keyword.
# NOTE(review): the multiplication by len(keywords) looks unintentional, but
# the saved text model may expect features scaled this way - confirm before
# removing it.
scale = len(keywords)
for i in tqdm(range(len(test_tt))):
    counts = encode(test_tag[i], keywords) + encode(test_des[i], keywords) + encode(test_tt[i], keywords)
    big_vector.append(scale * counts)
print('preparing')
X = numpy.array(big_vector)

create big vector:


  0%|          | 0/1320 [00:00<?, ?it/s]

preparing


In [16]:
# NOTE(review): machine-specific absolute path; consider moving it to a
# configurable data-directory setting so the notebook runs elsewhere.
path = 'C:\\Users\\ADMIN\\Desktop\\document\\projects\\TDH-prj\\testset_images\\testset_images\\'
# Listing the directory fails early if the image folder is missing.
list_image = os.listdir(path)
# One file path per test row, named after its image_id.
test_images = [path + str(val)+'.jpg' for val in test_data['image_id']]
# cv2.imread returns None (it does not raise) when a file cannot be read.
preview = cv2.imread(test_images[0])
if preview is None:
    raise FileNotFoundError(f'could not read image: {test_images[0]}')
# OpenCV loads pixels in BGR order while matplotlib expects RGB; convert so the
# preview is shown with correct colours.
plt.imshow(cv2.cvtColor(preview, cv2.COLOR_BGR2RGB))

TypeError: Can't convert object to 'str' for 'filename'

In [13]:
# Run the saved image model over every test image, one image per predict call.
image_model = keras.models.load_model('image.h5')
prediction_image = []
failed_images = []  # (path, error) for every image that fell back to [0, 0]
for val in tqdm(test_images):
    try:
        img = cv2.imread(val)
        if img is None:
            # cv2.imread signals an unreadable or missing file by returning None.
            raise FileNotFoundError(val)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (224,224))
        prediction_image.append(image_model.predict(numpy.array([img]),verbose=False)[0])
    except Exception as exc:
        # Best-effort: keep one row per test image so the rows stay aligned
        # with the test table, but record the failure instead of hiding it.
        # NOTE(review): the [0,0] placeholder assumes a two-value model output.
        failed_images.append((val, repr(exc)))
        prediction_image.append([0,0])
if failed_images:
    print(f'{len(failed_images)} of {len(test_images)} images fell back to [0, 0]; first failure: {failed_images[0]}')

  0%|          | 0/1320 [00:00<?, ?it/s]

In [19]:
# Stack the per-image prediction rows into a single numpy array.
prediction_image = numpy.array(prediction_image)
# Load the saved text model from 'text.h5'.
model = keras.models.load_model('text.h5')

In [26]:
# Score the text features, blend them with the image scores using equal
# (5/10) weights, and take the index of the larger blended score per row.
prediction_text = model.predict(X)
image_share = (prediction_image)*5/10
text_share = (prediction_text)*5/10
prediction = list(map(numpy.argmax, image_share + text_share))



In [28]:
# Write the id/label pairs to op5.csv without the index column.
submission = pandas.DataFrame({'id':test_data['image_id'], 'label':prediction})
submission.to_csv('op5.csv', index=False)