## Setup
Set `month` below using the format "mmmYYYY": the three-letter month abbreviation followed by the four-digit year, e.g. "Feb2022".

In [1]:
# Raid month in "mmmYYYY" form.
month = 'Feb2022'
# Raid identifier: the month with any spaces removed, plus a "_guilds" suffix.
guild_raid = f"{month.replace(' ', '')}_guilds"
print(month, guild_raid, sep='\n')

Feb2022
Feb2022_guilds


In [2]:
# Folder paths, all relative to the notebook's working directory.
# Fixed locations:
video_loc = 'source/videos'
portrait_loc = 'source/portraits'
# Locations that depend on the month / raid chosen in the setup cell:
output_loc = f'source/output/{month}'
directory = f'source/screenshots/{month}/{guild_raid}'  # screenshot folder for this raid

## Other Scripts

In [3]:
%run utils/imports.ipynb
%run utils/screenshots.ipynb

In [4]:
# Show the month's video location and the screenshot folder, one per line.
print(f'{video_loc}/{month}', directory, sep='\n')

source/videos/Feb2022
source/screenshots/Feb2022/Feb2022_guilds


## Extractor

In [5]:
def _format_duration(elapsed):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``.

    The duration is rounded up to a whole second *before* it is split into
    minutes and seconds, so the seconds part is always 0-59 (never ``60``).
    """
    minutes, seconds = divmod(math.ceil(elapsed), 60)
    return f'{minutes}m {seconds}s'


def _parse_damage(value):
    """Best-effort conversion of one OCR token to an integer.

    Non-word characters (e.g. separators or stray punctuation) are stripped
    before converting. Tokens that cannot be converted are returned unchanged
    so the caller can filter them out.
    """
    # A column holding only numeric tokens may arrive as floats such as
    # 123456789.0 (presumably via pandas dtype inference -- confirm); accept
    # those instead of silently dropping the row.
    if isinstance(value, float) and value.is_integer():
        return int(value)
    try:
        return int(re.sub(r'\W+', '', value))
    except (TypeError, ValueError):
        return value


def _is_damage_value(value):
    """Return True for integers whose decimal representation has 9 to 11 characters."""
    return isinstance(value, (int, np.integer)) and 9 <= len(str(value)) <= 11


def _extract_frame(file):
    """OCR a single screenshot ``{directory}/{file}.jpg``.

    Returns a DataFrame with the columns ``name`` and ``text``: ``text`` holds
    the damage numbers recognised in a fixed region of the frame and ``name``
    the text recognised in a strip to the left of each number.
    """
    # Fixed pixel region that is searched for damage numbers
    # (presumably tuned to the recording's resolution -- confirm).
    dmg_left, dmg_top = 1600, 300
    dmg_width, dmg_height = 300, 500

    # The context manager closes the image file once all crops are taken.
    with Image.open(f'{directory}/{file}.jpg') as screenshot:

        ##################
        #     Damage     #
        ##################
        dmg = np.array(screenshot.crop(
            (dmg_left, dmg_top, dmg_left + dmg_width, dmg_top + dmg_height)
        ))
        # NOTE(review): arrays built from PIL images are presumably RGB-ordered,
        # so COLOR_RGB2GRAY may be what is intended here and below. Left as-is
        # so the OCR output does not shift -- confirm.
        gray_img = cv2.cvtColor(dmg, cv2.COLOR_BGR2GRAY)
        threshold_img = cv2.threshold(gray_img, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
        details = pytesseract.image_to_data(threshold_img, lang='eng', output_type=Output.DATAFRAME)

        # .copy() so the column assignment below edits an independent frame
        # rather than a slice of `details`.
        coord = details[['text', 'left', 'top']].dropna().copy()
        coord['text'] = coord['text'].apply(_parse_damage)
        # astype(bool) keeps the mask usable even when no tokens were found.
        keep = coord['text'].apply(_is_damage_value).astype(bool)
        coord = coord[keep].drop_duplicates(subset=['text']).reset_index(drop=True)

        #################
        #     Names     #
        #################
        names = []
        for row in coord.itertuples(index=False):
            # The left edge depends on the number of digits in the damage
            # value, hence the extra 30 px correction per digit away from 10.
            name_left = dmg_left + row.left - 860 - 30 * (10 - len(str(row.text)))
            name_top = dmg_top + row.top - 20
            cropped = screenshot.crop(
                (name_left, name_top, name_left + 600, name_top + 60)
            ).convert('RGB')
            gray_img = cv2.cvtColor(np.array(cropped), cv2.COLOR_BGR2GRAY)
            threshold_img = cv2.threshold(gray_img, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
            details = pytesseract.image_to_data(threshold_img, lang='chi_sim', output_type=Output.DATAFRAME)
            tokens = [t for t in details['text'].tolist() if str(t) != 'nan']
            try:
                name = ''.join(tokens)
            except TypeError:
                # Non-string tokens cannot be joined; fall back to the first one.
                name = tokens[0] if tokens else None
            names.append(name)

    return coord.assign(name=names)[['name', 'text']]


def ultExtract(date, freq):
    """Grab frames from the raid video, OCR every frame and return the results.

    Parameters
    ----------
    date
        Currently unused; kept so existing calls keep working. The video and
        screenshot locations are taken from the notebook-level ``video_loc``,
        ``guild_raid`` and ``directory`` variables.
    freq
        Forwarded unchanged to ``get_frames`` as its third argument.

    Returns
    -------
    list of DataFrame
        One frame-level DataFrame (columns ``name`` and ``text``) per
        screenshot, in frame order starting at ``0.jpg``.

    Raises
    ------
    FileNotFoundError
        If the screenshot folder contains no ``.jpg`` files.
    """
    start_time = time.time()
    get_frames(f'{video_loc}/{guild_raid}.mp4', directory, freq, 300)
    print(f'Finished taking screenshots in: {_format_duration(time.time() - start_time)}')

    ###################
    #     Running     #
    ###################

    # Count only screenshots so stray files in the folder do not inflate the
    # frame range (frames are opened as 0.jpg, 1.jpg, ...).
    no_of_files = sum(1 for entry in os.listdir(directory) if entry.endswith('.jpg'))
    if no_of_files == 0:
        raise FileNotFoundError(f'No .jpg screenshots found in {directory}')

    # Time the first frame and scale it up to estimate the total run time.
    start_time = time.time()
    df = [_extract_frame('0')]
    total_est_time = (time.time() - start_time) * no_of_files

    print('Starting the extraction')
    print(f'Estimated time to completion: {_format_duration(total_est_time)}')

    remaining = no_of_files - 1
    bar = IntProgress(min=0, max=remaining)  # instantiate the bar
    # Give the counter the same upper bound as the bar; without an explicit
    # max the bounded widget would stop counting at its default limit.
    count = widgets.BoundedIntText(value=0, min=0, max=remaining, description=f'Out of {remaining}:')
    display(HBox([bar, count]))
    for i in range(1, no_of_files):
        df.append(_extract_frame(f'{i}'))
        bar.value += 1
        count.value += 1

    return df

In [6]:
%%time
df = ultExtract(guild_raid, 0.1)

Finished taking screenshots in: 0m 14s
Starting the extraction
Estimated time to completion: 17m 7s


HBox(children=(IntProgress(value=0, max=165), BoundedIntText(value=0, description='Out of 165:')))

Wall time: 13min 7s


In [7]:
import pandas as pd

# Tag every per-frame result with its frame number and stack them in one pass.
# assign() returns a copy, so the frames in `df` are left untouched and the
# cell can be re-run safely; a single concat replaces the row-by-row
# DataFrame.append loop (append was removed in pandas 2.0).
combined = pd.concat(
    [frame_result.assign(frame=frame_no) for frame_no, frame_result in enumerate(df)],
    ignore_index=True,
)
combined.shape

(593, 3)

In [8]:
# Keep the first occurrence of each damage value, then of each name,
# and order the remaining rows from highest to lowest damage.
export = (
    combined
    .drop_duplicates(subset=['text'], keep='first')
    .drop_duplicates(subset=['name'], keep='first')
    .sort_values(by=['text'], ascending=False)
    .reset_index(drop=True)
)
print(export.shape)
export.head()

(102, 3)


Unnamed: 0,name,text,frame
0,[潭拉斯圆家地理,11444130863,0
1,Vinter,11095306840,0
2,Vegemites,9590903718,0
3,MewMewCafe,8645840990,0
4,Dollars,7496756222,26


In [10]:
outputFolder = f'source/output/{month}'
# exist_ok=True creates the folder only when it is missing, without a
# separate exists() check that another process could invalidate in between.
try:
    os.makedirs(outputFolder, exist_ok=True)
except OSError:
    print ('Error! Could not create a directory')

# Build the workbook path once so the printed path and the written path cannot drift apart.
output_path = f'{outputFolder}/{guild_raid}.xlsx'
print(output_path)
export.to_excel(output_path, index=True)

source/output/Feb2022/Feb2022_guilds.xlsx
