# Severstal Dataset Study

In [None]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Dataset root directory (relative path, with a trailing slash). The cells
# below build every file path by string-concatenating onto this value.
ROOT = "datasets/severstal-steel-defect-detection/"

In [None]:
# Read the comma-separated annotation table; the preview below shows one
# (ImageId, ClassId, EncodedPixels) record per row.
df = pd.read_csv(ROOT + '/train.csv', sep=',')
df.head()

Unnamed: 0,ImageId,ClassId,EncodedPixels
0,0002cc93b.jpg,1,29102 12 29346 24 29602 24 29858 24 30114 24 3...
1,0007a71bf.jpg,3,18661 28 18863 82 19091 110 19347 110 19603 11...
2,000a4bcdd.jpg,1,37607 3 37858 8 38108 14 38359 20 38610 25 388...
3,000f6bf48.jpg,4,131973 1 132228 4 132483 6 132738 8 132993 11 ...
4,0014fce06.jpg,3,229501 11 229741 33 229981 55 230221 77 230468...


In [None]:
# Number of annotation rows for each defect class, most frequent first.
df['ClassId'].value_counts()

3    5150
1     897
4     801
2     247
Name: ClassId, dtype: int64

In [None]:
# The ten images with the most annotation rows (an image appears once per
# defect class annotated on it).
df['ImageId'].value_counts().head(10)

ef24da2ba.jpg    3
db4867ee8.jpg    3
b68a9259a.jpg    2
db3ce9c95.jpg    2
e30a322d7.jpg    2
9cd1d622e.jpg    2
c44784905.jpg    2
20964a4f0.jpg    2
b5b99c878.jpg    2
bab5271b6.jpg    2
Name: ImageId, dtype: int64

In [None]:
# Names of all entries in the train_images directory. Later cells compare
# these against the ImageId column to find images without any annotation row.
fs = os.listdir( ROOT+"train_images/")
len(fs)

12568

## Train splits

In [None]:
import random
import csv

In [None]:
random.seed(10)

# Number of validation images drawn per class. Key 0 is the pool of files in
# train_images that have no row in train.csv.
VALID_SAMPLES_PER_CLASS = {0: 1200, 1: 190, 2: 45, 3: 1000, 4: 170}

# Stratified split: draw a fixed number of validation images from each class.
# The draws happen in the order 1, 2, 3, 4, 0 right after seeding, so the
# order of these statements is part of what makes the split reproducible.
# An image annotated with several classes can be drawn in more than one list.
c1 = df[df['ClassId'] == 1]
c1_list = random.sample(list(c1['ImageId']), VALID_SAMPLES_PER_CLASS[1])

c2 = df[df['ClassId'] == 2]
c2_list = random.sample(list(c2['ImageId']), VALID_SAMPLES_PER_CLASS[2])

c3 = df[df['ClassId'] == 3]
c3_list = random.sample(list(c3['ImageId']), VALID_SAMPLES_PER_CLASS[3])

c4 = df[df['ClassId'] == 4]
c4_list = random.sample(list(c4['ImageId']), VALID_SAMPLES_PER_CLASS[4])

c0 = set(fs) - set(df['ImageId'])
# Sort before sampling: the iteration order of a set of strings changes from
# one interpreter run to the next (hash randomization), so sampling from
# list(c0) would give a different validation set on every kernel restart even
# with a fixed seed. Sorting also removes any dependence on os.listdir order.
c0_list = random.sample(sorted(c0), VALID_SAMPLES_PER_CLASS[0])

### Classifier dataset

Each image is assigned a defect class ID in the range 0-4. A value of 0 means the image has no defect.

If more than one defect class is present in an image, the label is formed by writing the class digits next to each other in ascending order. For example, an image that has defects of classes two, four and one simultaneously is labelled 124.

In [None]:
def _encode_classes(class_ids):
    """Pack defect class ids into a single integer label.

    The ids are sorted ascending and used as decimal digits, so classes
    2, 4 and 1 become 124, a single class c becomes c, and an empty
    collection becomes 0 (no defect).

    Args:
        class_ids: iterable of single-digit integer class ids.

    Returns:
        The packed label as a plain Python int.
    """
    code = 0
    for class_id in sorted(class_ids):
        code = code * 10 + int(class_id)
    return code


# Not used by any cell shown here; kept so a later cell that refers to it
# still finds the name.
exempt = []

# Build the ImageId -> [ClassId, ...] lookup once, instead of rebuilding a list
# of all ImageIds and filtering the whole frame for every file.
classes_by_image = df.groupby('ImageId')['ClassId'].apply(list).to_dict()

# One [file name, packed label] pair per file, in the same order as `fs`.
# Files without any annotation row get label 0.
class_data = [[k, _encode_classes(classes_by_image.get(k, ()))] for k in fs]

In [None]:
# All image ids sampled for validation, collected once into a set so each
# membership test is O(1). Concatenating the five lists inside the loop made
# every iteration copy and scan thousands of entries.
valid_ids = set(c0_list + c1_list + c2_list + c3_list + c4_list)

train_split = []
valid_split = []

# Route each [file name, label] pair to exactly one split, preserving the
# order of class_data within each split.
for c in class_data:
    if c[0] in valid_ids:
        valid_split.append(c)
    else:
        train_split.append(c)

In [None]:
def _write_split(csv_path, rows):
    """Write one split to `csv_path` as CSV with an ImageId,Class header.

    Args:
        csv_path: destination file; overwritten if it already exists.
        rows: iterable of [image file name, class label] pairs.
    """
    # newline='' lets the csv module control line endings itself; without it
    # an extra blank line appears after every row on Windows.
    with open(csv_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['ImageId', 'Class'])
        writer.writerows(rows)


_write_split(ROOT+'/steel_train.csv', train_split)
_write_split(ROOT+'/steel_valid.csv', valid_split)

### Segmentation GT generation

The RLE-to-mask conversion is performed on the fly when the dataloader samples an image, which avoids the storage overhead of saving the masks.

The code below shows the conversion for a single sample.


In [None]:
import numpy as np

In [None]:
def rle_to_matrix(arr, idx, rle):
    """Mark the pixels of one run-length-encoded mask in `arr`.

    Args:
        arr: 2-D array indexed as arr[class_row, flat_pixel]; modified in place.
        idx: 1-based class id; the mask is written into row ``idx - 1``.
        rle: whitespace-separated string "start length start length ...".
            Start positions are 1-based, as in the competition's encoding
            (pixel 1 is the first pixel of the flattened image).

    Returns:
        The same `arr` object, with the encoded pixels of row ``idx - 1``
        set to 1.

    Raises:
        ValueError: if `rle` holds an odd number of values or a start
            position below 1.
    """
    values = [int(token) for token in rle.split()]
    if len(values) % 2:
        raise ValueError("RLE string must contain start/length pairs")

    row = idx - 1
    for start, length in zip(values[0::2], values[1::2]):
        if start < 1:
            raise ValueError("RLE start positions are 1-based")
        # Convert the 1-based start position to a 0-based array offset.
        begin = start - 1
        arr[row, begin:begin + length] = 1
    return arr

In [None]:

path = ROOT+'/SampleGT/'
# exist_ok avoids the separate existence check (and the race it implies).
os.makedirs(path, exist_ok=True)

for i in df['ImageId'].unique():
    rs = df[df['ImageId'] == i]
    # One flat binary plane per defect class; uint8 is enough for a 0/1 mask.
    new = np.zeros((4, 256*1600), dtype=np.uint8)
    for _, r in rs.iterrows():
        new = rle_to_matrix(new, r['ClassId'], r['EncodedPixels'])
    # The flat pixel index is unfolded as (1600, 256) and the last two axes are
    # then swapped, giving a (class, 256, 1600) array.
    new = new.reshape(4, 1600, 256).transpose(0, 2, 1)
    # np.savetxt only accepts 1-D or 2-D arrays and raises on this 3-D mask;
    # np.save writes the binary .npy format that the file name promises.
    np.save(path + os.path.splitext(i)[0] + '.npy', new)
    ## stop after single sample
    break