In [1]:
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract
import numpy as np
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import math
import os
from tqdm import tqdm
from pytesseract.pytesseract import TesseractError

In [2]:
'''
This code cell contains format checking functions
'''
def checkformat(s):
    '''
    Rebuild an OCR'd angle string (direction, height or apparent size) in the
    layout: prefix, degree sign, two-digit minutes, apostrophe, two-digit
    seconds, dot, two-digit fraction, double quote.

    Everything up to and including the first degree sign is copied unchanged
    (the whole string is copied when it contains no degree sign).  After that,
    up to two digits are collected for each of the three remaining fields.
    Non-digit characters between them are ignored and the three separators
    are always re-inserted, even when a field turns out to be empty.

    Parameters
    ----------
    s : str
        Raw text for one angle; may be empty or contain OCR noise.

    Returns
    -------
    str
        The re-assembled string.  An empty input returns only the three
        separators instead of raising, so callers can reject it with their
        normal parsing checks.
    '''
    def take_digits(start, limit=2):
        # Collect up to `limit` ASCII digits from s[start:], skipping any other
        # character.  The returned index is where the next field must begin:
        # the first surplus digit, or len(s) once the text is exhausted, so a
        # trailing digit is never handed to the following field a second time.
        digits = ''
        for pos in range(start, len(s)):
            ch = s[pos]
            if '0' <= ch <= '9':
                if len(digits) == limit:
                    return digits, pos
                digits += ch
        return digits, len(s)

    degree_at = s.find('°')
    # Without a degree sign the whole input is treated as the prefix.
    head_end = len(s) if degree_at == -1 else degree_at + 1

    minutes, pos = take_digits(head_end)
    seconds, pos = take_digits(pos)
    fraction, pos = take_digits(pos)

    return s[:head_end] + minutes + "'" + seconds + '.' + fraction + '"'
    
def checkformat_phase(s):
    '''
    Rebuild the OCR'd moon-phase value as one leading digit, a dot, and at
    most three fractional digits.

    Only ASCII digits are used; every other character (including the dot that
    OCR produced) is ignored and the dot is re-inserted after the first digit.

    Parameters
    ----------
    s : str
        Raw text for the phase; may be empty or contain OCR noise.

    Returns
    -------
    str
        The re-assembled value.  An empty input returns a lone dot instead of
        raising.
    '''
    def take_digits(start, limit):
        # Collect up to `limit` ASCII digits from s[start:], skipping any other
        # character.  The returned index is the first surplus digit, or len(s)
        # once the text is exhausted, so the last character is never scanned
        # twice.
        digits = ''
        for pos in range(start, len(s)):
            ch = s[pos]
            if '0' <= ch <= '9':
                if len(digits) == limit:
                    return digits, pos
                digits += ch
        return digits, len(s)

    whole, pos = take_digits(0, 1)
    # The cap of three applies to the fractional digits themselves.
    fraction, pos = take_digits(pos, 3)
    return whole + '.' + fraction
    
def checkdate(s):
    '''
    Rebuild the OCR'd date as three dot-separated digit groups
    (year, month, day).

    Only ASCII digits are used; every other character is ignored and the two
    dots are always re-inserted.  The first group keeps a single digit and
    the other two keep at most two digits each.
    NOTE(review): the one-digit first group is kept exactly as the code had
    it - confirm that the source images really show a one-digit year.

    Parameters
    ----------
    s : str
        Raw text for the date; may be empty or contain OCR noise.

    Returns
    -------
    str
        The re-assembled date.  An empty input returns just the two dots
        instead of raising.
    '''
    def take_digits(start, limit):
        # Collect up to `limit` ASCII digits from s[start:], skipping any other
        # character.  The returned index is the first surplus digit, or len(s)
        # once the text is exhausted, so a trailing digit is not copied into
        # the next group as well.
        digits = ''
        for pos in range(start, len(s)):
            ch = s[pos]
            if '0' <= ch <= '9':
                if len(digits) == limit:
                    return digits, pos
                digits += ch
        return digits, len(s)

    year, pos = take_digits(0, 1)
    month, pos = take_digits(pos, 2)
    day, pos = take_digits(pos, 2)
    return year + '.' + month + '.' + day
    
def checktime(s):
    '''
    Re-label the OCR'd time of day with h / m / s unit letters.

    The hour is the first two characters, the minute is the two characters
    just before the last double quote, and the seconds are the two characters
    on either side of the first dot.  The seconds part is left out when the
    text has no dot.

    Returns the string 'error' when the text contains no double quote.
    '''
    quote_at = s.rfind('"')
    if quote_at == -1:
        return 'error'

    pieces = [s[0:2], 'h', s[quote_at-2:quote_at], 'm']

    dot_at = s.find('.')
    if dot_at != -1:
        pieces.extend([s[dot_at-2:dot_at], '.', s[dot_at+1:dot_at+3], 's'])

    return ''.join(pieces)

In [3]:
'''
This code cell contains the functions which extract corresponding information from the raw images
'''
def direction_and_height(path, file_name):
    '''
    extract the direction and height information from the raw images

    The region [298:312, 130:376] of the image at `path` is saved to
    `file_name` and OCR'd.  The text is expected to hold two angles one after
    the other; each one starts at the first digit or minus sign and runs up
    to the next double quote.

    Parameters
    ----------
    path : str
        Path of the raw screenshot.
    file_name : str
        Where the cropped region is written before being passed to Tesseract.

    Returns
    -------
    (str, str)
        Direction and height, each passed through checkformat.  A field that
        is empty after removing spaces (including when OCR returns no text at
        all) is returned as an empty string instead of raising, so the caller
        can discard the image through its normal parsing checks.
    '''
    def starts_angle(ch):
        return ('0' <= ch <= '9') or ch == '-'

    def ends_angle(ch):
        return ch == '"'

    def scan(text, start, accept):
        # Index of the first character at or after `start` that satisfies
        # `accept`; falls back to the last index of `text` when none does.
        for pos in range(start, len(text)):
            if accept(text[pos]):
                return pos
        return len(text) - 1

    lena = mpimg.imread(path)
    crop = lena[298:312, 130:376, :]
    plt.imsave(file_name, crop)

    pytesseract.pytesseract.tesseract_cmd ='E:\\Tesseract-OCR\\tesseract'
    text = pytesseract.image_to_string(file_name)

    # Undo common OCR confusions before looking for the two angles
    text = text.replace(',', '.').replace('..', '.').replace('”', '"')

    if not text:
        # Nothing recognised: there is no position to scan from.
        return '', ''

    fields = []
    cursor = 0
    for _field in range(2):
        start = scan(text, cursor, starts_angle)
        end = scan(text, start, ends_angle)
        field = text[start:end + 1].replace(' ', '')
        fields.append(checkformat(field) if field else '')
        # The second angle is searched from the closing quote of the first.
        cursor = end

    return fields[0], fields[1]

def phase(path, file_name):
    '''
    Read the moon's phase from a raw image.

    The region [338:353, 155:196] of the image at `path` is written to
    `file_name`, OCR'd, cleaned of commas, doubled dots and spaces, and then
    normalised by checkformat_phase, whose result is returned.
    '''
    crop = mpimg.imread(path)[338:353,155:196,:]
    plt.imsave(file_name, crop)

    pytesseract.pytesseract.tesseract_cmd ='E:\\Tesseract-OCR\\tesseract'
    text = pytesseract.image_to_string(Image.open(file_name))

    # Same substitutions, in the same order: comma -> dot, double dot -> dot, drop spaces
    for old, new in ((',', '.'), ('..', '.'), (' ', '')):
        text = text.replace(old, new)

    return checkformat_phase(text)
    
def date(path, file_name):
    '''
    Read the date and the time of day from a raw image.

    The region [1414:1435, 50:182] of the image at `path` is written to
    `file_name` and OCR'd.  The text must split on single spaces into exactly
    two parts: the first goes through checkdate, the second through checktime.

    Returns the pair (day, time); the pair (0, 0) is returned when the text
    does not split into exactly two parts.
    '''
    crop = mpimg.imread(path)[1414:1435,50:182,:]
    plt.imsave(file_name, crop)

    pytesseract.pytesseract.tesseract_cmd ='E:\\Tesseract-OCR\\tesseract'
    text = pytesseract.image_to_string(Image.open(file_name))

    # Same substitutions, in the same order, as the other extractors
    for old, new in ((',', '.'), ('..', '.'), ('”', '"')):
        text = text.replace(old, new)

    pieces = text.split(' ')
    try:
        raw_day, raw_time = pieces
        stamp = (checkdate(raw_day), checktime(raw_time))
    except ValueError:
        # Wrong number of space-separated parts: signal failure with integers
        stamp = (0, 0)
    return stamp
    
def apparent_moon(path, file_name):
    '''
    Read the apparent size of the moon from a raw image.

    The region [358:374, 0:240] of the image at `path` is written to
    `file_name` and OCR'd.  Only the text after the last space is kept, and
    it is returned after normalisation by checkformat.
    '''
    crop = mpimg.imread(path)[358:374,0:240, :]
    plt.imsave(file_name, crop)

    pytesseract.pytesseract.tesseract_cmd ='E:\\Tesseract-OCR\\tesseract'
    text = pytesseract.image_to_string(file_name)

    # Same substitutions, in the same order, as the other extractors
    for old, new in ((',', '.'), ('..', '.'), ('”', '"')):
        text = text.replace(old, new)

    # Keep only the last space-separated token (the value itself)
    last_token = text.rsplit(' ', 1)[-1]
    return checkformat(last_token)
    
def apparent_sun(path, file_name):
    '''
    Read the apparent size of the sun from a raw image.

    The region [418:434, 0:240] of the image at `path` is written to
    `file_name` and OCR'd.  Only the text after the last space is kept, and
    it is returned after normalisation by checkformat.
    '''
    crop = mpimg.imread(path)[418:434,0:240, :]
    plt.imsave(file_name, crop)

    pytesseract.pytesseract.tesseract_cmd ='E:\\Tesseract-OCR\\tesseract'
    text = pytesseract.image_to_string(file_name)

    # Same substitutions, in the same order, as the other extractors
    for old, new in ((',', '.'), ('..', '.'), ('”', '"')):
        text = text.replace(old, new)

    # Keep only the last space-separated token (the value itself)
    last_token = text.rsplit(' ', 1)[-1]
    return checkformat(last_token)

In [4]:
'''
This code cell runs over all raw images and extracts the data for the Moon
'''
def _split_dms(angle):
    '''
    Split a string laid out as  degree ° minute ' second "  into its three
    parts (returned as strings).
    Raises IndexError when the degree sign or the apostrophe is missing.
    '''
    after_degree = angle.split('°')
    minute_and_second = after_degree[1].split('\'')
    return after_degree[0], minute_and_second[0], minute_and_second[1].split('"')[0]


path = 'Moon\\'
count = 0  # kept from the original cell; not used below

# One tab-separated line is written per successfully parsed image.
# The context manager closes Moon.dat even if an unexpected error aborts the loop.
with open('Moon.dat', 'w+') as f:
    for img in tqdm(os.listdir(path)):
        # OCR every field of this image; skip images Tesseract cannot process
        try:
            day, time = date(path + img, 'date_moon.png')
            result = phase(path + img, 'phase_moon.png')
            direction, height = direction_and_height(path + img, 'direction_moon.png')
            size = apparent_moon(path + img, 'apparent_moon.png')
        except (UnicodeDecodeError, TesseractError):
            continue

        # extract year, month, and day
        # (date() returns the integer 0 on failure, hence AttributeError)
        try:
            date_parts = day.split('.')
            year, month, d = date_parts[0], date_parts[1], date_parts[2]
        except (AttributeError, IndexError):
            continue

        # extract hour, minute, and second (the first two characters of each)
        try:
            hour_and_rest = time.split('h')
            hour = hour_and_rest[0]
            minute_and_rest = hour_and_rest[1].split('m')
            m = minute_and_rest[0][0] + minute_and_rest[0][1]
            whole_seconds = minute_and_rest[1].split('.')[0]
            s = whole_seconds[0] + whole_seconds[1]
        except (AttributeError, IndexError):
            continue

        try:
            # degree, minute, and second of the azimuthal angle phi
            degree_1, minute_1, second_1 = _split_dms(direction)
            # degree, minute, and second of the polar angle theta
            degree_2, minute_2, second_2 = _split_dms(height)
            # minute and second of the moon's apparent size (degree part unused)
            size_degree, size_minute, size_second = _split_dms(size)
        except IndexError:
            continue

        # The image index is the number between the first 'f' of the file name
        # and the following dot; skip files that do not follow that pattern.
        try:
            fig_index = int(img.split('f')[1].split('.')[0])
        except (IndexError, ValueError):
            continue

        # write the data to the file
        row = (fig_index, year, month, d, hour, m, s,
               degree_1, minute_1, second_1, degree_2, minute_2, second_2,
               size_minute, size_second, result)
        try:
            f.write('\t'.join(str(value) for value in row) + '\n')
        except UnicodeEncodeError:
            # OCR produced a character the file's encoding cannot represent
            continue

100%|██████████████████████████████████████████████| 3010/3010 [1:14:33<00:00,  1.49s/it]
