In [1]:
## AUTHOR : Narayanan Parthasarathy (pnarayanan@rocketmail.com)
## Evaluator program written as part of a capstone project on Image Captioning. The program is used to calculate the BLEU (n-gram), METEOR,
## ROUGE (n-gram) and SPICE metrics.


# Input : (1) Actuals / ground truth. In this case, it will be a text file / object containing all real captions with the image
# name as key. These will be used as reference values for evaluation. They come as a separate dataset / file, only for those
# items in the predictions list. (2) Predictions / candidates with 1 generated caption along with the image name as key. This
# will be a separate dataset / file.

# Output : For each prediction (i.e. for each image ID), the following scores will be calculated with the 'predicted caption' vs. 'all
# reference captions' for that image. Finally, these individual scores can be averaged over the entire dataset.
# Scores to be calculated :
# 1. Sentence BLEU - 1-gram, 2-gram, 3-gram, 4-gram [4 items]
# 2. METEOR - 1 METEOR score [1 item]
# 3. ROUGE  - rouge-1, rouge-2, rouge-3, rouge-4, rouge-l, rouge-w ; each containing P, R, F1 scores. The average across all
#    references will be taken.
# ROUGE-N: Overlap of N-grams between the system and reference summaries
# ROUGE-L: Longest Common Subsequence (LCS) based statistics. The longest common subsequence naturally takes sentence-level structure similarity into account and automatically identifies the longest in-sequence co-occurring n-grams.
# ROUGE-W: Weighted LCS-based statistics that favor consecutive LCSes.

In [None]:
## NOTE : This program works best on a local computer and is not ready for the cloud yet - mainly because the SPICE
## evaluation depends on the Stanford NLP package & a local SPICE-1.0 jar being present on the local machine.

## Installation needed before you proceed with the lines below: all are commented out; uncomment them or run them from your command line / shell.
# pip install nltk
# import nltk
# nltk.download() --> This downloads necessary files for nltk. 

# pip install py-rouge --> For ROUGE score

# Once all done, download the zip file below for nltk from http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip
# and extract stanford-corenlp-3.6.0.jar & stanford-corenlp-3.6.0-models.jar into the lib folder.
# Without these dependencies of the SPICE jar, the SPICE metrics will not work properly.

In [2]:
import pandas as pd
import numpy as np

In [3]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
# Shared lemmatizer / stemmer instances, created once when this cell runs.
# NOTE(review): no active code in the visible cells uses either object — confirm whether they are still needed.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

In [4]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score

import rouge

In [5]:
from __future__ import division
import os
import sys
import subprocess
import threading
import json
import ast
import tempfile

In [6]:
# Paths of the actual-captions file and the predictions file. These are hard-coded for now and can be replaced / parameterized.
ACTUALS_PATH = 'key_caption_master_final.txt' # Path of the actual (key) captions file. This is a text
                                              # file where each line holds one caption in the format "file_name | caption_text", separated by \n
PREDICTIONS_PATH ='XCEPTION_3GRU256_3E.csv'  # Path of the generated captions in csv format. Each image has 1 caption, in the same format as above

In [7]:
def preprocess(sentence):
    """Normalise a caption for comparison.

    The value is converted to ``str``, lower-cased, stripped of ``<...>``
    markup and of digit runs, tokenised on ``\\w+`` and finally filtered:
    single-character tokens and English stop words are dropped.
    No stemming or lemmatisation is applied.

    Parameters
    ----------
    sentence : object
        Any value; it is passed through ``str()`` first.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (may be empty).
    """
    text = str(sentence).lower()
    text = re.sub(r'<.*?>', '', text)      # drop anything that looks like an HTML/XML tag
    text = re.sub('[0-9]+', '', text)      # drop digit runs
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # Read the stop-word list once per call and keep it in a set: the original
    # re-read the corpus list for every single token and scanned it linearly.
    english_stopwords = set(stopwords.words('english'))
    filtered_words = [w for w in tokens if len(w) > 1 and w not in english_stopwords]
    return " ".join(filtered_words)

In [8]:
def get_all_eval_scores_1pred(img_name,pred_caption,actual_captions,rouge_ngram_count):
    """Score one predicted caption against all ground-truth captions of an image.

    Computes sentence BLEU (1- to 4-gram), METEOR and ROUGE (rouge-1 ..
    rouge-<rouge_ngram_count>, rouge-l, rouge-w) for a single image.

    Parameters
    ----------
    img_name : str
        Image identifier, stored in the ``image_name`` column.
    pred_caption : list of str
        One-element list; only ``pred_caption[0]`` is scored.
    actual_captions : iterable of str
        All reference captions for the image.
    rouge_ngram_count : int
        Highest n for ROUGE-N (passed to ``rouge.Rouge`` as ``max_n``).

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns ``image_name``, ``caption_count``,
        ``predict_caption``, ``actual_1`` .. ``actual_k``, ``S-BLEU-1`` ..
        ``S-BLEU-4``, ``METEOR`` and ``<rouge-metric>-p/-r/-f`` for every
        ROUGE metric returned by the evaluator.
    """
    candidate = pred_caption[0]
    candidate_tokens = candidate.split()
    references = list(actual_captions)
    reference_tokens = [ref.split() for ref in references]

    # Collect the row in an (insertion-ordered) dict and build the frame once.
    record = {
        'image_name': img_name,
        'caption_count': len(references),
        'predict_caption': candidate,   # kept for debugging / manual inspection
    }
    for position, ref in enumerate(references, start=1):
        record[f'actual_{position}'] = ref

    # SENTENCE BLEU - cumulative n-gram scores with uniform weights.
    # BLEU-3 uses exactly 1/3 per order: the previous 0.3/0.3/0.3 summed to 0.9,
    # which is not a proper geometric mean and inflates the score.
    bleu_weights = {
        'S-BLEU-1': (1.0, 0, 0, 0),
        'S-BLEU-2': (0.5, 0.5, 0, 0),
        'S-BLEU-3': (1 / 3, 1 / 3, 1 / 3, 0),
        'S-BLEU-4': (0.25, 0.25, 0.25, 0.25),
    }
    for column, weights in bleu_weights.items():
        record[column] = sentence_bleu(reference_tokens, candidate_tokens, weights=weights)

    # METEOR
    # NOTE(review): raw strings are passed, as in the original code; some NLTK
    # releases expect pre-tokenised input here - confirm against the installed version.
    record['METEOR'] = meteor_score(references, candidate)

    # ROUGE, averaged over all references (apply_avg=True).
    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                           max_n=rouge_ngram_count,
                           limit_length=True,
                           length_limit=100,
                           length_limit_type='words',
                           apply_avg=True,
                           apply_best=False,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)
    # py-rouge expects (hypotheses, references). The previous call passed
    # (reference[0], candidate[0]), which swapped precision and recall and
    # ignored every reference except the first one.
    rouge_scores = evaluator.get_scores([candidate], [references])
    for metric in sorted(rouge_scores):
        for stat in ('p', 'r', 'f'):
            record[f'{metric}-{stat}'] = rouge_scores[metric][stat]

    return pd.DataFrame([record])

In [9]:
def float_convert(obj):
    """Convert ``obj`` to ``float``, returning ``np.nan`` when that is not possible.

    Parameters
    ----------
    obj : object
        Value to convert (e.g. a number, a numeric string or ``None``).

    Returns
    -------
    float
        ``float(obj)``, or ``np.nan`` if the conversion fails.
    """
    try:
        return float(obj)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are mapped to NaN; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        return np.nan

In [10]:
def get_spice_score_all_pred(input_data, work_dir=None):
    """Compute SPICE scores for all predictions with one call to the SPICE jar.

    The external Java program is slow to start, so all candidate captions and
    their references are written to a single JSON file, scored in one run, and
    the JSON result file is read back.

    Parameters
    ----------
    input_data : list of dict
        JSON-serialisable records; the caller in this notebook passes dicts
        with the keys ``image_id``, ``test`` and ``refs``.
    work_dir : str, optional
        Directory under which the ``tmp`` and ``cache`` sub-directories are
        created. Defaults to the current working directory (previously a
        hard-coded absolute path on the author's machine).

    Returns
    -------
    pandas.DataFrame
        One row per result item with the columns ``image_name``, ``spice_pr``,
        ``spice_re`` and ``spice_f``; values that cannot be converted to float
        become NaN.
    """
    # The jar path is relative, so it is resolved against the process working directory.
    SPICE_JAR = 'spice-1.0.jar'  # Location of SPICE-1.0.jar
    TEMP_DIR = 'tmp'             # location of temp dir
    CACHE_DIR = 'cache'          # location of cache dir

    base_dir = os.getcwd() if work_dir is None else work_dir
    temp_dir = os.path.join(base_dir, TEMP_DIR)
    os.makedirs(temp_dir, exist_ok=True)
    # The cache directory is created but not passed to SPICE (no '-cache' flag
    # in the command below); it is kept so the on-disk layout is unchanged.
    cache_dir = os.path.join(base_dir, CACHE_DIR)
    os.makedirs(cache_dir, exist_ok=True)

    # Input file for SPICE; delete=False so it survives for later analysis.
    with tempfile.NamedTemporaryFile(delete=False, dir=temp_dir, mode='w') as in_file:
        json.dump(input_data, in_file, indent=2)

    # Reserve an output file name for SPICE to write into.
    out_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
    out_file.close()

    spice_cmd = ['java', '-jar', '-Xmx8G', SPICE_JAR, in_file.name,
      '-out', out_file.name,
      '-subset', '-silent'
    ]

    try:
        subprocess.check_call(spice_cmd)
    except subprocess.CalledProcessError as error:
        # Reported but not re-raised (original behaviour); if SPICE wrote no
        # output, the json.load below fails on the empty file.
        print(error)

    # Read and process results
    with open(out_file.name) as data_file:
        results = json.load(data_file)
    # Uncomment the next two lines if you want the temp files to be deleted. Commented here so that the temp files can be used for analysis
    #os.remove(in_file.name)
    #os.remove(out_file.name)

    dfcols = ['image_name', 'spice_pr', 'spice_re', 'spice_f']
    # Build all rows first and create the frame once: DataFrame.append in a loop
    # is quadratic and no longer exists in pandas >= 2.0.
    records = []
    for item in results:
        overall = item['scores']['All']
        records.append({
            'image_name': item['image_id'],
            'spice_pr': float_convert(overall['pr']),
            'spice_re': float_convert(overall['re']),
            'spice_f': float_convert(overall['f']),
        })

    return pd.DataFrame(records, columns=dfcols)

In [11]:
# Load both '|'-separated, header-less files, name the first two columns,
# order the rows by file name, then lower-case the captions and remove every
# character that is not a letter, a digit or a space.
df_actuals = (
    pd.read_csv(ACTUALS_PATH, sep='|', header=None)
    .rename(columns={0: 'file_name', 1: 'Caption'})
    .sort_values(by=['file_name'], ascending=True)
    .assign(Caption=lambda frame: frame['Caption'].str.lower()
            .replace('[^a-zA-Z0-9 ]', '', regex=True))
)
df_predictions = (
    pd.read_csv(PREDICTIONS_PATH, sep='|', header=None)
    .rename(columns={0: 'file_name', 1: 'Caption'})
    .sort_values(by=['file_name'], ascending=True)
    .assign(Caption=lambda frame: frame['Caption'].str.lower()
            .replace('[^a-zA-Z0-9 ]', '', regex=True))
)

In [12]:
## Temporary code for matching predictions to the key captions: keeps only the
## actual captions whose file name also occurs in the predictions.
df_actuals['file_name'] = df_actuals['file_name'].str.strip()
df_predictions['file_name'] = df_predictions['file_name'].str.strip()

# The first column of the predictions frame is the matching key.
keys = df_predictions.columns.values.tolist()[0]
i2 = df_predictions.set_index(keys).index
i1 = df_actuals.set_index(keys).index
df_actuals = df_actuals.loc[i1.isin(i2)]

In [13]:
# Re-order the filtered reference captions by file name.
df_actuals = df_actuals.sort_values(by='file_name', ascending=True)

In [14]:
# Score every prediction against its reference captions (BLEU / METEOR / ROUGE)
# and collect the SPICE input so the Java program is started only once.
rouge_ngram_count = 4
spice_input_data = []
score_frames = []

# Group the reference captions by (whitespace-stripped) file name once, instead
# of re-stripping and re-scanning the whole actuals column for every prediction.
refs_by_image = (
    df_actuals.groupby(df_actuals['file_name'].str.strip())['Caption']
    .apply(list)
    .to_dict()
)

# Columns are addressed by name: positional j[0] / j[1] on a label-indexed row
# is deprecated in recent pandas releases.
for file_name, caption in zip(df_predictions['file_name'], df_predictions['Caption']):
    img_name = file_name.strip()
    actual_caption = [ref.strip() for ref in refs_by_image.get(img_name, [])]
    if not actual_caption:
        # SKIP IF THE GENERATED CAPTION FILE NAME IS INCORRECT / NO MATCHES FOUND IN ACTUALS
        continue
    pred_caption = [caption.strip()]

    score_frames.append(
        get_all_eval_scores_1pred(img_name, pred_caption, actual_caption, rouge_ngram_count)
    )
    spice_input_data.append({
        "image_id": img_name,
        "test": pred_caption[0],
        "refs": actual_caption,
    })

# Concatenate once at the end; growing a frame inside the loop is quadratic.
df_all_scores = pd.concat(score_frames, ignore_index=True) if score_frames else pd.DataFrame()

df_spice_scores = get_spice_score_all_pred(spice_input_data)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  app.launch_new_instance()


In [15]:
# Build the report from a subset of the df_all_scores columns (renamed for the
# output file) and attach the SPICE scores per image. Averages are computed later.
df_scores_report = df_all_scores[[
    'image_name',
    'caption_count',
    'S-BLEU-1',
    'S-BLEU-2',
    'S-BLEU-3',
    'S-BLEU-4',
    'METEOR',
    'rouge-l-p',
    'rouge-l-r',
    'rouge-l-f',
]].rename(columns={
    'S-BLEU-1': 'BLEU-1',
    'S-BLEU-2': 'BLEU-2',
    'S-BLEU-3': 'BLEU-3',
    'S-BLEU-4': 'BLEU-4',
    'rouge-l-p': 'ROUGE_precision',
    'rouge-l-r': 'ROUGE_recall',
    'rouge-l-f': 'ROUGE_FScore',
})

df_scores_report = df_scores_report.merge(df_spice_scores, on='image_name')

In [16]:
# Create the output file names based on the input predictions file.
# os.path.splitext removes only the final extension, so paths that contain
# other dots (e.g. './results/run.v2.csv') keep their directory and full stem;
# split('.')[0] would cut such a path at the first dot.
base_name = os.path.splitext(PREDICTIONS_PATH)[0]
Output_Scores = base_name+ '_SCORES_ALL.csv'
Output_Summary = base_name+ '_SCORES_SUMMARY.csv'

In [17]:
# Write the per-image scores (row index included) to the scores CSV.
df_scores_report.to_csv(Output_Scores, index=True)

In [18]:
# Average every score column over all reported images. The text column
# 'image_name' is dropped first: taking mean() over it raises a TypeError on
# recent pandas versions. Remaining columns are coerced to numbers so that a
# score column stored with object dtype is still averaged.
score_columns = df_scores_report.drop(columns=['image_name'], errors='ignore')
df_scores_summary = score_columns.apply(pd.to_numeric, errors='coerce').mean(axis=0)
# 'caption_count' in the summary holds the number of predictions, not an average.
df_scores_summary['caption_count'] = df_predictions.shape[0]
df_scores_summary.to_csv(Output_Summary)

  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
print("DONE") # Completion marker. Added because the SPICE calculation takes more than 20 minutes on a Core i3 laptop.

DONE
