# Imports

In [1]:
import pandas as pd
import cv2
import os
import glob
import xlrd
import numpy as np
from pathlib import Path
from datetime import datetime
import numpy as np
import imgaug as ia
import imgaug.augmenters as iaa
import logging
import random
import tqdm

import sys

pd.options.display.max_rows = 4000
pd.options.display.max_seq_items = 2000

# Extract information from excel to dataframe

Data are distributed in multiple excel files (one excel sheet). I need to combine those excel files together to dataframe.

In [2]:
list_dfs = []

# ignore bad excels
#ignore_excels = ['Log files/CCTV survey 115732.xlsx', 'Log files/CCTV survey 11424.xlsx',
#                'Log files/CCTV survey 114458.xlsx', 'Log files/CCTV survey 118757.xlsx', 'Log files/CCTV survey 117306.xlsx']

# allow good excels only
#allowed_excels = ['Log files/CCTV survey 120320.xlsx', 'Log files/CCTV survey 115480.xlsx', 'Log files/CCTV survey 120392.xlsx']

for file in glob.iglob('Log files/*.xlsx'):
    if ('~' in file):
        continue
    print(file)
    #if not (file in allowed_excels):
    #    continue
        
    xls = xlrd.open_workbook(file, on_demand=True)
    for sheet_name in xls.sheet_names():
        df = pd.read_excel(file, sheet_name, engine='xlrd')
        df['Sheet Name'] = sheet_name
        df['Source File'] = file
        df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
        list_dfs.append(df)
    
df = pd.concat(list_dfs, axis=0, sort=False)
df.reset_index(inplace=True)

Log files/CCTV SterteAvenue West.xlsx
Log files/CCTV SterteAvenue.xlsx
Log files/CCTV survey 11065.xlsx
Log files/CCTV survey 11424.xlsx
Log files/CCTV survey 114246.xlsx
Log files/CCTV survey 114458.xlsx
Log files/CCTV survey 115480.xlsx
Log files/CCTV survey 115732.xlsx
Log files/CCTV survey 116758.xlsx
Log files/CCTV survey 116814.xlsx
Log files/CCTV survey 116892.xlsx
Log files/CCTV survey 117306.xlsx
Log files/CCTV survey 118060.xlsx
Log files/CCTV survey 118757.xlsx
Log files/CCTV survey 118880.xlsx
Log files/CCTV survey 119452.xlsx
Log files/CCTV survey 120320.xlsx
Log files/CCTV survey 120392.xlsx
Log files/CCTV survey 120681.xlsx


Only some column are needed for processing.

In [3]:
# get valid columns and rename them

df = df[['ns1:VideoRef', 'ns1:Distance', 'ns1:ContinuousDefect', 'ns1:Code', 'ns1:Remarks', 'Source File']]
df.columns = ['Time', 'Distance', 'ContinuousDefect', 'DefectType', 'Remarks', 'Source File']

In [4]:
df.head()

Unnamed: 0,Time,Distance,ContinuousDefect,DefectType,Remarks,Source File
0,0:00:00,0.0,,VVR,SZ00919605_SZ00919603_U_150915_1.mpg,Log files/CCTV SterteAvenue West.xlsx
1,0:00:00,0.0,,MH,,Log files/CCTV SterteAvenue West.xlsx
2,0:00:03,0.01,,WL,,Log files/CCTV SterteAvenue West.xlsx
3,0:00:30,0.98,,DEE,,Log files/CCTV SterteAvenue West.xlsx
4,0:00:51,2.38,S01,DEE,,Log files/CCTV SterteAvenue West.xlsx


# Append video file names to dataframe

For every record, video file names are noted in column Remarks but only in 1st occurence. I need to extract those data and create new column with full path to source video file for every record.

In [5]:
video_files_dir = 'Video files/'

In [6]:
# read video files names

videos = []

for entry in os.listdir(video_files_dir):
    if os.path.isfile(os.path.join(video_files_dir, entry)):
        videos.append(entry)

In [7]:
len(videos)

285

In [8]:
# create column with video name path for every defect

df['Video file'] = df['Remarks']

not_found_videos = []

for index in range(0, df.shape[0]):
    video_file = df.at[index, 'Video file']
    if video_file is np.NaN or not video_file.endswith('.mpg'):
        video_file = df.at[index-1, 'Video file']
        if not video_file.endswith('.mpg'):
            raise Exception("Previous video info is not video file: " + video_file)
        else:
            df.at[index, 'Video file'] = video_file
    else:
        if video_file in videos:
            df.at[index, 'Video file'] = video_file
        else:
            not_found_videos.append(video_file)

In [9]:
len(not_found_videos)

103

In [10]:
# remove defects with missing video file

df = df[~df['Video file'].isin(not_found_videos)]
df.reset_index(drop=True, inplace=True)

In [11]:
len(videos)

285

In [12]:
# remove useless column

df.drop('Remarks', axis=1, inplace=True)

In [13]:
df.shape[0]

4103

In [14]:
df.head(10)

Unnamed: 0,Time,Distance,ContinuousDefect,DefectType,Source File,Video file
0,0:00:00,0.0,,VVR,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg
1,0:00:00,0.0,,MH,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg
2,0:00:03,0.0,,WL,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg
3,0:00:19,0.0,S01,DEG,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg
4,0:00:51,1.66,,DER,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg
5,0:01:12,3.63,S02,DER,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg
6,0:01:19,3.63,,WL,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg
7,0:01:35,5.62,,WL,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg
8,0:01:44,6.22,F02,DER,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg
9,0:03:08,8.56,,H,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg


# Fix invalid time in data

Some data record contains 0 timetamps. I recognize these records as invalid. Get rid of them.

In [15]:
# NOTE: I cant remove those record because when recognizing healthy images, I have to check against those times.
#df = df[df['Time'] != "0:00:00"]

In [16]:
df.shape[0]

4103

In [17]:
df.head()

Unnamed: 0,Time,Distance,ContinuousDefect,DefectType,Source File,Video file
0,0:00:00,0.0,,VVR,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg
1,0:00:00,0.0,,MH,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg
2,0:00:03,0.0,,WL,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg
3,0:00:19,0.0,S01,DEG,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg
4,0:00:51,1.66,,DER,Log files/CCTV survey 11065.xlsx,ST52223602_ST52222702_D_091012_1.mpg


# Extract images with defects

For classification, it is necessary to extract images with defects and place them in corresponding folders.

In [18]:
df_continuous_defects = pd.DataFrame(columns=['StartTime', 'EndTime', 'DefectType', 'SourceFile', 'Video file'])

In [19]:
video_root_dir = 'Video files/'
defects_root_dir = 'defect_images/'

In [20]:
df_group_videofile = df.groupby('Video file')

In dataset, there are continous defects in continous frames. I need to create new dataframe with they starting and ending times.

In [21]:
for name, group in df_group_videofile:
    
    video = cv2.VideoCapture(video_root_dir + name)
    fps = video.get(cv2.CAP_PROP_FPS)
    group = group[group['ContinuousDefect'].notnull()]
    
    for index, row in group.iterrows():
        group.loc[index,'ContinuousDefect'] = group.loc[index,'ContinuousDefect'][1:]
    
    defect_group = group.groupby('ContinuousDefect', sort='Time')
    
    for t, group in defect_group:
        
        # check if pairing is missing
        if len(group) == 1:
            logging.warning(f'Continous defect without start/end {t} in video file {video_root_dir + name}')
            continue
        first = group.iloc[0]
        second = group.iloc[1]
        row = [first['Time'], second['Time'], first['DefectType'], first['Source File'], first['Video file']]
        df_continuous_defects.loc[len(df_continuous_defects)] = row

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [22]:
len(df_continuous_defects)

279

In [23]:
df_continuous_defects.head()

Unnamed: 0,StartTime,EndTime,DefectType,SourceFile,Video file
0,0:00:00,0:00:00,DEF,Log files/CCTV survey 115732.xlsx,001_SU14056002_SU14056001_14112013_0936.mpg
1,0:00:00,0:00:00,DEG,Log files/CCTV survey 115732.xlsx,001_SU14056002_SU14056001_14112013_0936.mpg
2,0:00:00,0:00:00,DES,Log files/CCTV survey 115732.xlsx,001_SU14056002_SU14056001_14112013_0936.mpg
3,0:00:00,0:00:00,DEG,Log files/CCTV survey 11424.xlsx,001_SZ01900402_SZ01900405_28022013_0101.mpg
4,0:00:00,0:00:00,DES,Log files/CCTV survey 115732.xlsx,002_SU14056001_SU14047901_14112013_0944.mpg


In [24]:
defect_images = []

From those continous defects, I extract only some frames.

In [25]:
def time_to_sec(time):
    return sum(x * int(t) for x, t in zip([3600, 60, 1], time.split(":")))

In [26]:
period_of_extraction_in_sec = 5

df_group_continuous_defects_video_file = df_continuous_defects.groupby('Video file')

for name, group in df_group_continuous_defects_video_file:
    video = cv2.VideoCapture(video_root_dir + name)
    fps = video.get(cv2.CAP_PROP_FPS)
    for index, row in group[['StartTime', 'EndTime', 'DefectType']].iterrows():
        seconds_start = time_to_sec(row['StartTime'])
        seconds_end = time_to_sec(row['EndTime'])
        
        if seconds_start < 25:
            seconds_start = 25
        
        for seconds in range(seconds_start, seconds_end, period_of_extraction_in_sec):
            video.set(cv2.CAP_PROP_POS_FRAMES, seconds * fps);
            ret, frame = video.read()
            if not ret:
                logging.warning(f'Unexpected error at capturing image at {seconds} seconds in video file {video_root_dir + name}')
                continue
            defect_dir = defects_root_dir + row['DefectType']
            if not os.path.exists(defect_dir):
                os.makedirs(defect_dir)
            out_name = defect_dir + os.sep + name + '_continous_' + str(seconds) + '.jpg'
            defect_images.append(out_name)
            cv2.imwrite(out_name, frame)
    video.release()

In [27]:
len(defect_images)

13119

Extract noncontinous defects.

In [28]:
for name, group in df_group_videofile:
    video = cv2.VideoCapture(video_root_dir + name)
    fps = video.get(cv2.CAP_PROP_FPS)
    for index, row in group[['Time', 'ContinuousDefect', 'DefectType']].iterrows():
        if row['ContinuousDefect'] is not np.NaN:
            continue
        seconds = time_to_sec(row['Time'])
        
        if seconds < 20:
            continue
        
        video.set(cv2.CAP_PROP_POS_FRAMES, seconds * fps);
        ret, frame = video.read()
        
        # ignore invalid images
        if not ret:
            logging.warning(f'Unexpected error at capturing image at {seconds} seconds in video file {video_root_dir + name}')
            continue
        defect_dir = defects_root_dir
        if not os.path.exists(defect_dir):
            os.makedirs(defect_dir)
        out_name = defect_dir + os.sep + name + '_' + row['Time'] + '.jpg'
        defect_images.append(out_name)
        cv2.imwrite(out_name, frame)
        break
    video.release()




In [29]:
len(defect_images)

13311

In [30]:
del defect_images

# Extraction of healthy images

Extraction of healthy images extract all images except those in restricted times where defects appear.

In [31]:
healthy_images_root_dir = 'healthy_images'
period_of_extraction_in_sec = 1

In [32]:
# extraction times bounds

excluded_seconds_before_after_continous = 5
excluded_seconds_before_after_noncontinous = 45

In [33]:
healthy_images = []

for name, group in df_group_videofile:
    video = cv2.VideoCapture(video_root_dir + name)
    fps = video.get(cv2.CAP_PROP_FPS)
    invalid_times = []
    invalid_sequences = []
    
    # create invalid intervals
    for index, row in group[['Time', 'DefectType']].iterrows():
        seconds = time_to_sec(row['Time'])
        invalid_times.append(seconds)
        
    if (name in df_group_continuous_defects_video_file.groups.keys()):
        for i in df_group_continuous_defects_video_file.get_group(name).iterrows():
            invalid_sequences.append([time_to_sec(i[1]['StartTime']), time_to_sec(i[1]['EndTime'])])  
    
    video_length = int(int(video.get(cv2.CAP_PROP_FRAME_COUNT))/fps)
    
    # first 25 seconds are info about record
    invalid_sequences.append([0, 25])
    
    invalid_times.sort()
    
    # ignore everyting from last defect
    invalid_sequences.append([invalid_times[len(invalid_times) - 1], video_length])
    
    healthy_sec = []
    
    sec = .5
    while sec < video_length:
        fail = False
        
        for time in invalid_times:
            if (abs(time - sec) < excluded_seconds_before_after_noncontinous):
                fail = True
                break
                
        for t in invalid_sequences:
            if (sec > t[0] - excluded_seconds_before_after_continous and sec < t[1] + excluded_seconds_before_after_continous):
                fail = True
                break
        
        if not fail:
            video.set(cv2.CAP_PROP_POS_FRAMES, sec * fps);
            ret, frame = video.read()
            if not ret:
                logging.warning(f'Unexpected error at capturing image at {seconds} seconds in video file {video_root_dir + name}')
                continue
            if not os.path.exists(healthy_images_root_dir):
                os.makedirs(healthy_images_root_dir)
            out_name = healthy_images_root_dir + '/' + name + '_' + str(sec) + '.jpg'
            healthy_images.append(out_name)
            cv2.imwrite(out_name, frame)
            healthy_sec.append(sec)
        sec += 1.4
    video.release()

In [34]:
len(healthy_images)

13157