In [29]:
from statistics import correlation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

## Metrics to use
Separate the metrics into top-1 and top-5 categories.
Calculate accuracy, precision, and recall for both categories. Also add the F1 score.

In [30]:
# Ground-truth topic/content assignments; `content_ids` holds a space-separated list per topic row.
correlations = pd.read_csv('data/correlations.csv')
# Predicted (topic_id, content_id) pairs that the rest of the notebook evaluates.
pred_df = pd.read_csv('data/pred_results.csv')

In [31]:
# Flatten the ground truth into one row per (content_id, topic_id) pair.
# `content_ids` is a space-separated string per topic; split it into a list and
# explode the list so each content id gets its own row (vectorized replacement for
# a Python-level iterrows() loop).
# NOTE(review): a missing `content_ids` value now yields a NaN content_id row, which the
# filter below drops as long as pred_df has no NaN content_id; the previous loop would
# have raised on it instead.
content_to_topic = (
    correlations
    .assign(content_id=correlations['content_ids'].str.split(' '))
    .explode('content_id')
    .loc[:, ['content_id', 'topic_id']]
    .reset_index(drop=True)  # explode repeats the source row index; restore a clean 0..n-1 index
)
# Keep only the ground-truth pairs whose content item has at least one prediction.
content_to_topic = content_to_topic[content_to_topic['content_id'].isin(pred_df['content_id'])]

In [32]:
# Key both frames by the (topic_id, content_id) pair so predictions and ground truth
# can later be compared pair-by-pair through their indexes.
# NOTE: this cell is not idempotent - running it a second time raises KeyError because
# the key columns have already been moved into the index.
pred_df = pred_df.set_index(['topic_id', 'content_id'])
# Every ground-truth pair is a positive example (label = 1). `assign` builds a new frame
# instead of writing a column onto the boolean-filtered frame, which can trigger
# pandas' SettingWithCopyWarning.
content_to_topic = content_to_topic.assign(label=1).set_index(['topic_id', 'content_id'])

## TOP 1

In [33]:
# Number of ground-truth topics each content item is assigned to, most frequent first.
content_to_topic.index.get_level_values('content_id').value_counts()

content_id
c_8ad113a7a801    24
c_15a634632521    14
c_04fbee306e50    13
c_b4429618cba1    13
c_76a17cfb9d87    12
                  ..
c_a662335f86a4     1
c_e28009ea492f     1
c_c733b0afd4f8     1
c_9022950ceacc     1
c_10157e74fc3a     1
Name: count, Length: 1000, dtype: int64

In [34]:
# Top-1 prediction per content item: the first row of each content_id group, in pred_df's row order.
# NOTE(review): this presumes rows are already ordered best-first within each content_id - confirm.
top_1 = pred_df.reset_index().groupby('content_id')['topic_id'].first()

# Attach the top-1 predicted topic to every ground-truth (topic_id, content_id) pair,
# then overwrite `label` with 1.0 where the true topic equals the prediction, else 0.0.
joined = (
    content_to_topic
    .join(top_1.rename('pred_topic'), on='content_id', how='left')
    .reset_index()
    .assign(label=lambda pairs: (pairs['topic_id'] == pairs['pred_topic']).astype(float))
)

# A content item counts as a hit when its predicted topic matches any of its true topics.
# Recall is not reported: only one topic is predicted per content item.
top1_prec = joined.groupby('content_id')['label'].mean().gt(0).mean()
print('Top 1 Precision', top1_prec)

# Baseline: chance that one uniformly drawn topic (out of len(correlations) topics) is a
# true topic of the content item, averaged over content items.
true_topics_per_content = content_to_topic.groupby('content_id')['label'].count()
random_prec = true_topics_per_content.div(len(correlations)).mean()
print('Random precision (baseline)', random_prec)

Top 1 Precision 0.232
Random precision (baseline) 2.9617829217939747e-05


## TOP 5 metrics against all topic content

In [35]:
# Inspect the predictions frame; at this point it is indexed by (topic_id, content_id).
pred_df

Unnamed: 0_level_0,Unnamed: 1_level_0,title,description,channel,category,level,language,parent,has_content,score
topic_id,content_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
t_107d6bc83f12,c_fae000af5570,Work-energy theorem,,0ec697,source,4,en,t_48cea89450d0,True,5
t_1379f1d82041,c_fae000af5570,Work-energy theorem,,2ee29d,aligned,4,en,t_532cdfa2085c,True,4
t_5de671c07375,c_fae000af5570,4.2 Work-energy theorem,,e77b55,aligned,4,en,t_45dbcf35a31b,True,3
t_a57df0b7f393,c_fae000af5570,Work-energy theorem (Hindi),,2ee29d,aligned,4,en,t_38dd668b2752,True,2
t_c899a3e24e42,c_fae000af5570,5.3 kinetic energy and the work Energy theorem,,e77b55,aligned,4,en,t_2f20cf4e8f05,True,1
...,...,...,...,...,...,...,...,...,...,...
t_dc8fe1fdb261,c_e97b2c4973ed,Cognitive Biases,,8ca895,source,1,en,t_37f833079be5,True,5
t_6b4b78e932ca,c_e97b2c4973ed,Reducing Bias in Sampling,,fef095,source,4,en,t_17b5a2cc59de,True,4
t_b7f1fbdec838,c_e97b2c4973ed,8: Optimization,8: Optimization,88c9d6,supplemental,4,en,t_5308c71c9239,True,3
t_4213f7fa5331,c_e97b2c4973ed,Positive Attitudes: Rights and Obligations,,bae253,supplemental,2,en,t_2a7d947c6b95,True,2


In [36]:
# Inspect the ground-truth pairs; indexed by (topic_id, content_id) with a constant label of 1.
content_to_topic

Unnamed: 0_level_0,Unnamed: 1_level_0,label
topic_id,content_id,Unnamed: 2_level_1
t_000d1fb3f2f5,c_76a17cfb9d87,1
t_002ff3ee9cb1,c_f87ca039ff13,1
t_00910415ee1a,c_b4429618cba1,1
t_00ce57bb6240,c_42575fcc21f2,1
t_00f9a52a69aa,c_10157e74fc3a,1
...,...,...
t_ff8213583bd0,c_668a9ffa7fad,1
t_ff86e0122954,c_ad2699e9a45a,1
t_ff87307291b6,c_1cf57eff18f1,1
t_ff9237c2cbaa,c_c6a0dea8557e,1


In [37]:
# Pair-level metrics over all predictions: a predicted (topic_id, content_id) pair is
# correct when the same pair exists in the ground truth.
true = content_to_topic.index
pred = pred_df.index
# Recall: share of ground-truth pairs that appear among the predictions.
recall = true.isin(pred).mean()
# Precision: share of predicted pairs that appear in the ground truth.
precision = pred.isin(true).mean()
# Harmonic mean of the two; defined as 0.0 when there is no overlap at all, which
# would otherwise divide by zero and produce NaN.
denominator = precision + recall
f1 = 2 * precision * recall / denominator if denominator > 0 else 0.0
print('Global metrics')
print('Recall:', recall)
print('Precision:', precision)
print('F1:', f1)

Global metrics
Recall: 0.2650933040614709
Precision: 0.0966
F1: 0.14160070360598065
