<a href="https://colab.research.google.com/github/shadfdz/CCTV/blob/main/CCTV_anomalydetection_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PreProcessing and Data Exploration (Supervised)


> The CCTV dataset is composed of 11 folders containing flood control inspection footage of different areas in Imperial City, grouped by year, although some years have multiple folders. Some folders contain video screenshots of anomalies in the structure (e.g., roots, cracks, vermin), which were then used to generate PDF reports that are also contained in the folder. The folder 'CCTV_2014_10_Downstream' was selected since it contains detailed PDF reports of each manhole's footage as well as better-quality screenshots compared to other years (some years do not have detailed reports and are unstructured). The folder contains 25 separate video clips ranging from 6 to 18 minutes.

In [None]:
# import libraries
# %config Completer.use_jedi = False
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import re
import pickle
import pandas as pd
import glob

from google.colab import drive
from google.colab.patches import cv2_imshow
from random import sample

drive.mount('/content/drive',force_remount=True)
# !ls drive/MyDrive/CCTV/CCTV_2014_10_Downstream

Mounted at /content/drive


In [None]:
# Collect the .wmv inspection clips stored in the 2014 Downstream subfolder.
cctv_folder_path = 'drive/MyDrive/CCTV/'
cctv_subfolder = "CCTV_2014_10_Downstream/"

# Keep the os.listdir order: later cells pick clips from this list by position.
cctv_filenames = [
    entry
    for entry in os.listdir(cctv_folder_path + cctv_subfolder)
    if entry.endswith(".wmv")
]
for clip_name in cctv_filenames:
  print(clip_name)

1_DE_107_346.wmv
1_578_346.wmv
1_346_332.wmv
1_DE_111_597.wmv
1_597_688.wmv
1_348_360.wmv
1_350_362.wmv
1_352_363.wmv
1_354_364.wmv
1_DE_101_336.wmv
1_336_337.wmv
1_341_302.wmv
1_DE_110_355.wmv
1_355_342.wmv
1_DE_165_552.wmv
1_552_531.wmv
1_531_530.wmv
1_530_367.wmv
1_342_341.wmv
1_341_302_1.wmv
1_341_302_1_2.wmv
1_523_425.wmv
1_524_523.wmv
1_525_524.wmv
1_529_528.wmv
1_528_526.wmv


# Data Exploration and Sampling

> Sample each video every second and export to google drive



In [None]:
# Open one sample clip (position 20 of the listing) for a closer look.
f_name = cctv_filenames[20]
sample_clip_path = f"{cctv_folder_path}{cctv_subfolder}{f_name}"
vid_raw = cv2.VideoCapture(sample_clip_path)

# TODO: define function to import multiple mpg files - scale up

In [None]:
# Report the basic properties (frame count, FPS, duration, resolution) of the
# sample clip opened above.
if not vid_raw.isOpened():
  print('Video Import Not Successfull')

else:
  frame_count = vid_raw.get(cv2.CAP_PROP_FRAME_COUNT)
  print('Frame Count: {}'.format(frame_count))

  fps = vid_raw.get(cv2.CAP_PROP_FPS)
  print('FPS: {}'.format(fps))

  # frame_count / fps is the duration in SECONDS; divide by 60 so the value
  # matches the "minutes" unit printed below. A reported FPS of 0 would
  # otherwise raise ZeroDivisionError.
  length = frame_count / (fps * 60) if fps else 0.0
  print('Length: {:.2f} minutes'.format(length))

  width = vid_raw.get(cv2.CAP_PROP_FRAME_WIDTH)
  height = vid_raw.get(cv2.CAP_PROP_FRAME_HEIGHT)
  print('Width: {}px, Height: {}px'.format(width, height))


Frame Count: 10874.0
FPS: 29.97002997002997
Length: 362.83 minutes
Width: 640.0px, Height: 480.0px


In [None]:
# create a function to capture one frame per second of a video
def video_to_frames(vid_cap, directory, output_folder, f_name, subdirectory=False):
  """
  Save one frame per second of a video as JPEG files and describe them in a DataFrame.

  For every whole second ``i`` of the clip the capture is positioned at
  ``i * 1000`` ms, the next frame is read and written as ``<f_name>_<i>.jpg``.
  Seconds for which no frame can be decoded are skipped (no file, no row).

  :param vid_cap: instance of cv2 video capture
  :param directory: str directory path (concatenated as-is, so it should end with '/')
  :param output_folder: str of output folder name, appended to ``directory``
  :param f_name: str cctv filename, used as prefix of every frame name
  :param subdirectory: if True, frames are written to
      ``directory + output_folder + f_name + '/'``, which is created when missing
  :return: dataframe with columns file_name, frame_name, width, height, seconds
      (one row per frame written)
  """
  col_names = ['file_name', 'frame_name', 'width', 'height', 'seconds']

  # get vid properties
  width = vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)
  height = vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
  fps = vid_cap.get(cv2.CAP_PROP_FPS)
  if not fps:
    # Without a usable FPS the duration cannot be computed; dividing would
    # raise ZeroDivisionError.
    print('Could not read FPS for {}; no frames exported'.format(f_name))
    return pd.DataFrame(columns=col_names)
  length = int(vid_cap.get(cv2.CAP_PROP_FRAME_COUNT) / fps)

  path = directory + output_folder
  if subdirectory:
    path = path + f_name + '/'
    # exist_ok: re-running the export must not fail on an existing folder
    os.makedirs(path, exist_ok=True)

  # to hold data for df
  frames_data_array = []

  # loop through the clip one second at a time
  for i in range(length):
    vid_cap.set(cv2.CAP_PROP_POS_MSEC, i * 1000)
    frame_exists, frame = vid_cap.read()
    if not frame_exists:
      # nothing decoded at this position; writing an empty frame would fail
      continue

    # save in drive
    frame_name = f_name + '_' + str(i) + '.jpg'
    cv2.imwrite(path + frame_name, frame)

    frames_data_array.append([f_name, frame_name, width, height, i])

  # create df
  return pd.DataFrame(data=frames_data_array, columns=col_names)

In [None]:
# Export the frames of the first listed clip into its own subdirectory.
in_directory_path = 'drive/MyDrive/CCTV/CCTV_2014_10_Downstream/'
out_directory_path = 'drive/MyDrive/CCTV_Frames_2014_Downstream/'
f_name = cctv_filenames[0]
# '<clip>.wmv' -> '<clip>/': one output folder per clip
sub_dir = f_name.replace('.wmv','/')

# makedirs also creates out_directory_path itself when it is missing (os.mkdir
# would raise FileNotFoundError) and does nothing when the folder exists.
os.makedirs(out_directory_path + sub_dir, exist_ok=True)

vid_raw = cv2.VideoCapture(in_directory_path + f_name)
try:
  df_test = video_to_frames(vid_raw, out_directory_path, sub_dir, f_name)
finally:
  # free the capture handle once the frames are written
  vid_raw.release()


drive/MyDrive/CCTV_Frames_2014_Downstream/1_DE_107_346/
True
1155


In [None]:
# save frames of every video in its own unique subdirectory
# create a pickle file w/ info of each frame and save in df pickle directory
in_directory_path = 'drive/MyDrive/CCTV/CCTV_2014_10_Downstream/'
out_directory_path = 'drive/MyDrive/CCTV_Frames_2014_Downstream/'
pickle_directory_path = 'drive/MyDrive/CCTV_df_pickle/'

# to_pickle does not create missing folders, so make sure the target exists
os.makedirs(pickle_directory_path, exist_ok=True)

for f_name in cctv_filenames:
  # '<clip>.wmv' -> '<clip>/': one output folder per clip
  sub_dir = f_name.replace('.wmv','/')
  # create sub directory (and any missing parent); no-op when it exists
  os.makedirs(out_directory_path + sub_dir, exist_ok=True)
  # read video
  vid_raw = cv2.VideoCapture(in_directory_path + f_name)
  try:
    # get frames and get df to pickle
    df_frames = video_to_frames(vid_raw, out_directory_path, sub_dir, f_name)
  finally:
    # release each capture before opening the next clip
    vid_raw.release()

  pickle_fname = f_name.replace('.wmv','.pkl')
  df_frames.to_pickle(pickle_directory_path + pickle_fname)

### Export photos from folder to dedicated subfolder in google drive

In [None]:
# run a shell command to transfer photos
#!find drive/MyDrive/CCTV/CCTV_2014_10_Downstream/. -name '*.jpg' -exec cp '{}' drive/MyDrive/2014_Downstream_Pics \;

In [None]:
# Use os.listdir to collect the file names of the anomaly photos (.jpg only).
cat_list = []
pic_path = 'drive/MyDrive/CCTV_Pics'
cat_list.extend(
    entry for entry in os.listdir(pic_path) if entry.endswith(".jpg")
)

In [None]:
# Print every photo name with the literal text "[0" removed.
# The bracket has to be escaped: an unescaped '[' opens a character class, and
# the original pattern r'[0' made re.sub raise re.error (unterminated set).
# NOTE(review): the intended pattern is not evident from the notebook - confirm.
for i in cat_list:
  print(re.sub(r'\[0','',i))

### Next Step is to Annotate Photos and Frames for Training and Validation

* OpenLabeling Annotation tool [git](https://github.com/Cartucho/OpenLabeling)

### Check instances of each class category 


---
0 - root
1 - crack
2 - grease


In [None]:
# os.chdir(os.path.dirname(os.getcwd()))

# Count how many annotations of each class id appear in the label files.
label_list = glob.glob('drive/MyDrive/cctv_train_25MAY2022_labels/YOLO_darknet/*')

label_count_dict = { '0': 0, '1': 0, '2': 0 }

for label in label_list:
  with open(label) as f_reader:
    for line in f_reader:
      fields = line.split()
      if not fields:
        # blank line: lines read from a file keep their '\n', so a plain
        # truthiness check on the line never filters these out
        continue
      # the class id is the first whitespace-separated field; taking the whole
      # token (not just its first character) keeps multi-digit ids intact
      class_id = fields[0]
      label_count_dict[class_id] = label_count_dict.get(class_id, 0) + 1

label_count_dict

{'0': 329, '1': 161, '2': 216}

### Sample Other Datasets for Cracks

In [None]:
# !ls drive/MyDrive/Cracked/
# Sample up to 100 crack images and paste each one, centred, onto a black
# 640x480 canvas; the result is saved as .jpg in output_folder.
crack_f_list = glob.glob('drive/MyDrive/Cracked/*')
# random.sample raises ValueError when asked for more items than exist
sample_list = sample(crack_f_list, min(100, len(crack_f_list)))
output_folder = 'drive/MyDrive/cracked_processed/'
os.makedirs(output_folder, exist_ok=True)

canvas_h, canvas_w = 480, 640

if sample_list:
  print(sample_list[0])
for f_name in sample_list:
  img = cv2.imread(f_name)
  if img is None:
    # not an image / unreadable file
    print('skipped (unreadable): ' + f_name)
    continue
  img_h, img_w = img.shape[:2]
  if img_h > canvas_h or img_w > canvas_w:
    print('skipped (larger than canvas): ' + f_name)
    continue
  # centre the image; a 256x256 input lands on rows 112:368, cols 192:448
  top = (canvas_h - img_h) // 2
  left = (canvas_w - img_w) // 2
  # uint8 matches the dtype cv2.imread returns, so no float64 canvas is built
  canvas = np.zeros((canvas_h, canvas_w, 3), dtype=np.uint8)
  canvas[top:top + img_h, left:left + img_w, :] = img
  # basename keeps this independent of how deep the source path is
  stem = os.path.basename(f_name).split('.')[0]
  cv2.imwrite(output_folder + stem + '.jpg', canvas)



drive/MyDrive/Cracked/7041-31.jpg


# Preprocessing for Anomaly Detection Dataset (Unsupervised)

Another possible method for anomaly detection in flood control structures is a convolutional autoencoder. This unsupervised learning method saves time by minimizing the need to label video data. The CCTV_2014_12_Tran_AllCity dataset contains a report with the anomaly/repair requirement score for each video. We can train our model using footage of flood control structures that do not require maintenance.

In [None]:
# Load the Excel sheet with the CCTV pipe data into a DataFrame.
report_file_path = 'drive/MyDrive/CCTV/CCTV_2014_12_Tran_AllCity/report/CCTV_PipeData.xlsx'

df_report = pd.read_excel(report_file_path)

In [None]:
# Show the dtype pandas assigned to each column of the report.
df_report.dtypes

PIPE ID                                int64
US MH ID                              object
US MH ELEVATION                        int64
US IE                                float64
DS MH ID                              object
DS MH ELEVATION                        int64
DS IE                                float64
INSPECT FT                             int64
MAP LENGTH                             int64
SIZE                                  object
MATERIAL                              object
DIRECTION                             object
OPERATOR                              object
START FT                               int64
END FT                                 int64
INSPECT DATE                          object
INSPECT TIME                          object
QC DATE                       datetime64[ns]
STREET                                object
CROSS ST                              object
FIELD PAGE                            object
ZIP CODE                               int64
JOINT SPAC

In [None]:
# Preview the first rows of the report.
df_report.head()

Unnamed: 0,PIPE ID,US MH ID,US MH ELEVATION,US IE,DS MH ID,DS MH ELEVATION,DS IE,INSPECT FT,MAP LENGTH,SIZE,...,JOINT SPACE,VIDEO NAME,COMMENT,PRIORITIZATION CODE,RECOMMENDATION CODE,MAINTENANCE LENGTH,REPAIR/REPLACEMENT LENGTH,REHABILITATION LENGTH,TOTAL POINTS (MAINTENANCE),TOTAL POINTS (STRUCTURAL)
0,15,MH-167,0,6.08,MH-22,0,5.49,228,116,10 in,...,3,15_MH-22_20141117,"FACILITY DATABASE SHOWS PIPE LENGTH IS 116 FT,...",3,B & D,218,10,0,75,250
1,49,MH-28,0,7.554,WW-1A,0,4.84,347,341,8 in,...,3,49_MH-28_20141106,0,2,"B, C & D",337,10,0,740,1050
2,50,MH-29,0,10.285,MH-28,0,7.584,314,312,8 in,...,3,50_MH-29_20141106,0,1,B & C,314,0,0,175,875
3,51,MH-30,0,12.996,MH-29,0,10.315,333,332,6 in,...,3,51_MH-29_20141106,0,1,B & C,333,0,0,125,675
4,52,MH-31,0,15.238,MH-30,0,13.026,335,333,6 in,...,3,52_MH-31_20141031,0,3,B & D,315,20,0,600,1125


In [None]:
# Rows for VCP pipes with zero maintenance points, zero structural points and
# a VIDEO NAME that is not 0.
df_report.loc[
    (df_report['MATERIAL'] == 'VCP')
    & (df_report['TOTAL POINTS (MAINTENANCE)'] == 0)
    & (df_report['TOTAL POINTS (STRUCTURAL)'] == 0)
    & (df_report['VIDEO NAME'] != 0),
    :,
]

Unnamed: 0,PIPE ID,US MH ID,US MH ELEVATION,US IE,DS MH ID,DS MH ELEVATION,DS IE,INSPECT FT,MAP LENGTH,SIZE,...,JOINT SPACE,VIDEO NAME,COMMENT,PRIORITIZATION CODE,RECOMMENDATION CODE,MAINTENANCE LENGTH,REPAIR/REPLACEMENT LENGTH,REHABILITATION LENGTH,TOTAL POINTS (MAINTENANCE),TOTAL POINTS (STRUCTURAL)
96,230,MH-167,0,6.08,MH-168,0,6.059,6,12,10 in,...,3,230_MH-167_20141117,0,1,A,0,0,0,0,0
103,240,MH-175,0,2.49,WW-3,0,2.3,4,16,10 in,...,3,240_MH-175_20141117,0,1,A,0,0,0,0,0
104,275,MH-735,0,-2.531,WW-5,0,-4.96,3,25,10 in,...,3,275_MH-735_20141117,0,1,A,0,0,0,0,0
163,386,DE-80,0,10.04,MH-258,0,9.467,138,134,8 in,...,3,386_MH-258_20141119,0,3,D,0,10,0,0,0
200,431,DE-87,0,18.28,MH-285,0,17.768,86,90,8 in,...,3,431_MH-285_20141107,0,1,A,0,0,0,0,0
209,449,MH-681,0,15.365,MH-241,0,12.136,379,375,15 in,...,5,449_MH-681_20141105,0,1,A,0,0,0,0,0
240,627,MH-679,0,14.024,MH-680,0,12.44,11,12,8 in,...,5,627_MH-679_20141106,0,1,A,0,0,0,0,0
496,606,MH-427,0,4.87,MH-426,0,4.628,19,19,8 in,...,3,606_MH-426_20141218,0,1,A,0,0,0,0,0
524,671,MH-585,0,14.78,MH-584,0,13.823,355,350,8 in,...,3,671_MH-584_20141112,0,1,A,0,0,0,0,0
557,705,MH-554,0,18.888,MH-636,0,16.26,46,47,8 in,...,3,705_MH-636_20141124,0,1,A,0,0,0,0,0


In [None]:
# Video file names (VIDEO NAME + '.MPG') of VCP pipes that have 0 maintenance
# points and 0 structural points.
cctv_filenames_ad = (
    df_report.loc[
        (df_report['MATERIAL'] == 'VCP')
        & (df_report['TOTAL POINTS (MAINTENANCE)'] == 0)
        & (df_report['TOTAL POINTS (STRUCTURAL)'] == 0)
        & (df_report['VIDEO NAME'] != 0),
        'VIDEO NAME',
    ]
    + '.MPG'
)

In [None]:
# Display the selected video file names.
cctv_filenames_ad

96     230_MH-167_20141117.MPG
103    240_MH-175_20141117.MPG
104    275_MH-735_20141117.MPG
163    386_MH-258_20141119.MPG
200    431_MH-285_20141107.MPG
209    449_MH-681_20141105.MPG
240    627_MH-679_20141106.MPG
496    606_MH-426_20141218.MPG
524    671_MH-584_20141112.MPG
557    705_MH-636_20141124.MPG
562    712_MH-546_20141202.MPG
590    753_MH-481_20141201.MPG
591    754_MH-479_20141217.MPG
Name: VIDEO NAME, dtype: object

In [None]:
# # create folder for CCTV Frames
# !rm -rf drive/MyDrive/CCTV_Frames_2014_Tran_AllCity
# !mkdir drive/MyDrive/CCTV_Frames_2014_Tran_AllCity

In [None]:
# Export one frame per second for the selected videos, each into its own
# 'ad_<video file name>/' folder under out_directory_path.
in_directory_path = 'drive/MyDrive/CCTV/CCTV_2014_12_Tran_AllCity/'
out_directory_path = 'drive/MyDrive/CCTV_Frames_2014_Tran_AllCity/'
sub_dir = 'ad_'

# NOTE(review): [5:] skips the first five selected videos - presumably they
# were already exported in an earlier run; confirm before re-running.
for f_name in cctv_filenames_ad[5:]:
  print(in_directory_path + f_name)

  vid_raw = cv2.VideoCapture(in_directory_path + f_name)
  if not vid_raw.isOpened():
    # a missing or unreadable file would otherwise fail inside the export
    print('Could not open video, skipping: ' + f_name)
    vid_raw.release()
    continue
  try:
    video_to_frames(vid_raw, out_directory_path, sub_dir, f_name, subdirectory=True)
  finally:
    # release each capture before opening the next video
    vid_raw.release()


In [None]:
 !ls -l drive/MyDrive/CCTV_Frames_2014_Tran_AllCity | wc -l

7


In [None]:
# create a function to capture frames from a time segment of a video
# NOTE: this redefines the earlier five-argument video_to_frames; cells that
# use the old signature have to run before this definition.
def video_to_frames(start, end, vid_cap, output_folder, f_name, sampling_rate=5):
  """
  Save ``sampling_rate`` frames for every second of the segment [start, end].

  For each second ``s`` from ``start`` to ``end`` (inclusive) and each
  ``j`` in ``1..sampling_rate`` the capture is positioned at
  ``(s + j / sampling_rate) * 1000`` ms and the frame read there is written
  to ``output_folder + f_name + '_' + str(s) + '_' + str(j) + '.jpg'``.
  Positions where no frame can be decoded are skipped and reported.

  :param start: int first second of the segment
  :param end: int last second of the segment (inclusive)
  :param vid_cap: instance of cv2 video capture
  :param output_folder: str of output folder path (concatenated as-is, so it
      should end with '/')
  :param f_name: str cctv filename, used as prefix of every frame name
  :param sampling_rate: int number of frames saved per second
  :return: None
  """
  if end < start:
    # A reversed range would otherwise do nothing and still report success.
    print('skipped: end ({}) is before start ({}) for {}'.format(end, start, f_name))
    return

  missing = 0
  for second in range(start, end + 1):
    for j in range(1, sampling_rate + 1):

      vid_cap.set(cv2.CAP_PROP_POS_MSEC, ((second + (j/sampling_rate)) * 1000))
      frame_exists, frame = vid_cap.read()
      if not frame_exists:
        # nothing decoded here; writing an empty frame would fail
        missing += 1
        continue

      # save in drive
      frame_name = f_name + '_' + str(second) + '_' + str(j) + '.jpg'
      cv2.imwrite(output_folder + frame_name, frame)

  if missing:
    print('warning: {} frame(s) could not be read from {}'.format(missing, f_name))
  else:
    print('success')

In [None]:
!ls drive/MyDrive/CCTV/CCTV_2014_10_Downstream/*wmv

drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_336_337.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_341_302_1_2.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_341_302_1.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_341_302.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_342_341.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_346_332.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_348_360.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_350_362.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_352_363.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_354_364.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_355_342.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_523_425.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_524_523.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_525_524.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_528_526.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_529_528.wmv
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_530_367.wmv
drive/MyDrive/CCTV/CCTV_2

In [None]:
# Folder holding the 2014 Downstream clips and folder that receives the
# extracted frames.
input_path = 'drive/MyDrive/CCTV/CCTV_2014_10_Downstream/'
output_path = 'drive/MyDrive/cae_processed_dataset5/'

In [None]:
# Maps a video file name to a list of (start, end) tuples.
# NOTE(review): presumably seconds, as in splice_dict2; no visible cell reads
# this dict - confirm whether it is still needed.
splice_dict = {
    '1_336_337.wmv': [(398,401), (500,513)],
    '1_341_302.wmv': [(384,432), (514,524)],
    '1_348_360.wmv': [(561,581)],
    '1_350_362.wmv': [(667,673), (853,855)],
    '1_354_364.wmv': [(477,488), (536,592)],
    '1_523_425.wmv': [(126,139), (165,188), (208,252), (388,395)],
    '1_524_523.wmv': [(73,83), (117,126), (164,169), (624,653)],
    '1_525_524.wmv': [(52,116), (260,304)],
    '1_528_526.wmv': [(56,58), (72,76), (297,307), (323,331), (389,411)],
    '1_552_531.wmv': [(540,548), (588,613), (483,515)], # segments are not in chronological order
    '1_DE_101_336.wmv': [(181,213),(408,413)],
    '1_DE_107_346.wmv': [(247,254), (380,397), (488,499), (597,619), (668,703), (729,750),(945,980)],
    '1_DE_165_552.wmv': [(242,256)]
}

In [None]:
# Maps a video file name to the (start, end) segments, in seconds, that the
# extraction loop passes to video_to_frames. end must not be smaller than start.
splice_dict2 = {
    '1_336_337.wmv': [(62,80), (499,507), (748,755)],
    '1_341_302.wmv': [(361,377),(406,427),(514,519)],
    '1_348_360.wmv': [(561,577)],
    '1_341_302_1.wmv': [(48,57), (251,307)],
    '1_342_341.wmv': [(248,270), (402,417)],
    # The second segment was (853,84): end < start, so no frames were extracted.
    # NOTE(review): 854 assumes a dropped digit (splice_dict uses (853,855)) - confirm.
    '1_350_362.wmv': [(667,673), (853,854)],
    '1_354_364.wmv': [(477,488), (551,581)],
    '1_355_342.wmv': [(661,683), (733,771)],
    '1_524_523.wmv': [(73,81), (174,177)],
    '1_525_524.wmv': [(66,116), (260,269)],
    '1_528_526.wmv': [(72,76), (300,307), (323,331), (389,411)],
    '1_529_528.wmv': [(261,284)],
    '1_552_531.wmv': [(542,548), (588,604), (483,515)],
    '1_DE_101_336.wmv': [(181,213), (408,413)],
    '1_DE_107_346.wmv': [(248,254), (382,409), (497,509), (597,619), (668,697), (730,765), (951,979)],
    '1_DE_111_597.wmv': [(250,258)],
    '1_DE_165_552.wmv': [(242,254)]
}



In [None]:
# Smoke-test the extraction on one segment before running the whole batch.
# signature: video_to_frames(start, end, vid_cap, output_folder, f_name)
f_name = '1_336_337.wmv'
start, end = 398, 401
vid_raw = cv2.VideoCapture(f"{input_path}{f_name}")
video_to_frames(start, end, vid_raw, output_path, f_name)

success


In [None]:
# Destructive: deletes everything inside drive/MyDrive/cae_folder_segmented.
# This is a different folder from output_path ('drive/MyDrive/cae_processed_dataset5/').
!rm -rf drive/MyDrive/cae_folder_segmented/*

In [None]:
# Show the folder the extracted frames are written to.
output_path

'drive/MyDrive/cae_processed_dataset5/'

In [None]:
# Extract 10 frames per second for every (start, end) segment listed in
# splice_dict2, one video at a time.
for name, segments in splice_dict2.items():
  file_path = input_path + name
  print(file_path)
  vid_raw = cv2.VideoCapture(file_path)
  if not vid_raw.isOpened():
    # a missing or unreadable file yields no frames; report it and move on
    print('Could not open video, skipping: ' + name)
    vid_raw.release()
    continue
  try:
    for start, end in segments:
      print(start, end)
      video_to_frames(start, end, vid_raw, output_path, name, sampling_rate=10)
  finally:
    # release each capture before opening the next video
    vid_raw.release()

drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_336_337.wmv
62 80
success
499 507
success
748 755
success
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_341_302.wmv
361 377
success
406 427
success
514 519
success
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_348_360.wmv
561 577
success
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_341_302_1.wmv
48 57
success
251 307
success
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_342_341.wmv
248 270
success
402 417
success
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_350_362.wmv
667 673
success
853 84
success
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_354_364.wmv
477 488
success
551 581
success
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_355_342.wmv
661 683
success
733 771
success
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_524_523.wmv
73 81
success
174 177
success
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_525_524.wmv
66 116
success
260 269
success
drive/MyDrive/CCTV/CCTV_2014_10_Downstream/1_528_526.wmv
72 76
success
300 307
success
323 331
succe

In [None]:
!ls drive/MyDrive/cae_processed_dataset5 | wc -l

7430


# Preprocess Video for YOLO Test

In [None]:
!ls drive/MyDrive/cae_vid_test/

In [None]:
# Source clip for the YOLO test, the name used as prefix for the exported
# frames, and the output folder.
f_path = 'drive/MyDrive/CCTV/CCTV_2013_05_Various/MH308-MH315-08090513.m4v'
# NOTE(review): f_name ends in '.wmv' although the source file is '.m4v' - confirm this is intended.
f_name = 'MH308-MH315-08090513.wmv'
out_path = 'drive/MyDrive/CCTV_Frames_2013_05_Various/'

In [None]:
# Open the clip and export frames for seconds 270 to 360 with a sampling rate of 2.
vid_cap = cv2.VideoCapture(f_path)
# clip duration in whole seconds; not used by the call below
length = int(vid_cap.get(cv2.CAP_PROP_FRAME_COUNT) / vid_cap.get(cv2.CAP_PROP_FPS))

video_to_frames(270, 360, vid_cap, out_path, f_name, 2)

success
