In [1]:
%load_ext autoreload
%autoreload 2
import json
from gensim.summarization.bm25 import BM25
from gensim.models.phrases import Phrases, Phraser
import pandas as pd
# Notebook display settings: no column limit, no wrapping of wide frames,
# and up to 1000 characters of text per cell so abstracts stay readable.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
    ('max_colwidth', 1000),
):
    pd.set_option(_option_name, _option_value)
import numpy as np
from multiprocessing import Pool
from functools import partial
import spacy


from bm25_retrieval import BM25Retrieval
from evaluation import average_precision

In [2]:
# Load the paper table; the preview below shows one row per paper with an
# `id` column and a whitespace-separated `abstract` column.
papers = pd.read_csv("../data/kit_expert_2017_papers.csv")
papers.head(3)

Unnamed: 0,id,abstract
0,2150066425,are we ready for autonomous driving the kitti vision benchmark suite today visual recognition systems are still rarely employed in robotics applications perhaps one of the main reasons for this is the lack of demanding benchmarks that mimic such scenarios in this paper we take advantage of our autonomous driving platform to develop novel challenging benchmarks for the tasks of stereo optical flow visual odometry slam and 3d object detection our recording platform is equipped with four high resolution video cameras a velodyne laser scanner and a state of the art localization system our benchmarks comprise stereo and optical flow image pairs stereo visual odometry sequences of km length and more than 200k 3d object annotations captured in cluttered scenarios up to cars and pedestrians are visible per image results from state of the art algorithms reveal that methods ranking high on established datasets such as middlebury perform below average when being moved outside the laboratory t...
1,2133151341,the hitran molecular spectroscopic database this paper describes the contents of the edition of the hitran molecular spectroscopic compilation the new edition replaces the previous hitran edition of and its updates during the intervening years the hitran molecular absorption compilation is composed of five major components the traditional line by line spectroscopic parameters required for high resolution radiative transfer codes infrared absorption cross sections for molecules not yet amenable to representation in a line by line form collision induced absorption data aerosol indices of refraction and general tables such as partition sums that apply globally to the data the new hitran is greatly extended in terms of accuracy spectral coverage additional absorption phenomena added line shape formalisms and validity moreover molecules isotopologues and perturbing gases have been added that address the issues of atmospheres beyond the earth of considerable note experimental ir cross se...
2,2115579991,vision meets robotics the kitti dataset we present a novel dataset captured from a vw station wagon for use in mobile robotics and autonomous driving research in total we recorded hours of traffic scenarios at hz using a variety of sensor modalities such as high resolution color and grayscale stereo cameras a velodyne 3d laser scanner and a high precision gps imu inertial navigation system the scenarios are diverse capturing real world traffic situations and range from freeways over rural areas to inner city scenes with many static and dynamic objects our data is calibrated synchronized and timestamped and we provide the rectified and raw image sequences our dataset also contains object labels in the form of 3d tracklets and we provide online benchmarks for stereo optical flow object detection and other tasks this paper describes our recording platform the data format and the utilities that we provide


In [6]:
# Load spaCy's 'en' pipeline with the 'parser' and 'ner' components disabled;
# the visible cells only read `token.lemma_` from it.
spacy_parser = spacy.load('en', disable=['parser', 'ner'])

In [11]:
# Lemmatisation smoke test: run one sentence through spaCy and show its lemmas.
sent = "neural networks are the greatest"
tokens = spacy_parser(sent)
lemmas = (token.lemma_ for token in tokens)
" ".join(lemmas)

'neural network be the great'

In [129]:
# Read the keyword ground truth: a JSON list of dicts carrying the keyword text,
# the ids of its relevant papers, a level and a keyword id (see the preview below).
with open("../data/kit_expert_2017_keywords.json", mode="r") as file:
    keywords = json.loads(file.read())
keywords[:2]

[{'keyword': 'liquid fluoride thorium reactor',
  'paper_ids': ['1527740619'],
  'level': 4,
  'keyword_id': '30972296'},
 {'keyword': 'energy engineering',
  'paper_ids': ['2056946625',
   '2070521924',
   '2254121180',
   '2280790071',
   '2593688536',
   '2626060881',
   '2759963175',
   '612980819'],
  'level': 3,
  'keyword_id': '520343842'}]

In [151]:
def handle_chunk(model, keywords):
    """Return the average precision of ``model`` for every keyword in a chunk.

    Args:
        model: object exposing ``get_ranked_documents(query)``; the ``"id"``
            entry of its result is used as the ranking.
        keywords: iterable of dicts with a ``"keyword"`` string and a
            ``"paper_ids"`` collection of relevant ids.

    Returns:
        list: one average-precision value per keyword, in input order.
    """
    def _score(entry):
        # The query is the lower-cased keyword split on single spaces.
        terms = entry["keyword"].lower().split(" ")
        expected = entry["paper_ids"]
        ranking = model.get_ranked_documents(terms)["id"]
        return average_precision(ranking, expected)

    return [_score(entry) for entry in keywords]
def mean_average_percision(model, keywords, n_jobs):
    """Compute the mean average precision (mAP) of ``model`` over ``keywords``.

    The keywords are split into ``n_jobs`` chunks which are scored in parallel
    worker processes via ``handle_chunk``. The (misspelled) function name is
    kept so existing callers keep working.

    Args:
        model: retrieval model passed on to ``handle_chunk``; it is sent to the
            worker processes, so it has to be picklable.
        keywords: sequence of keyword dicts (``"keyword"``, ``"paper_ids"``).
        n_jobs: number of worker processes and of work chunks.

    Returns:
        float: mean of the per-keyword average precisions, or NaN when
        ``keywords`` is empty.
    """
    if len(keywords) == 0:
        # Nothing to score: skip spawning workers and avoid numpy's
        # "mean of empty slice" warning; the result is NaN either way.
        return float("nan")

    worker_function = partial(handle_chunk, model)
    work_chunks = np.array_split(keywords, n_jobs)
    # The context manager shuts the pool down once map() has returned;
    # previously the worker processes were never released.
    with Pool(n_jobs) as pool:
        chunk_results = pool.map(worker_function, work_chunks)
    return np.mean(np.concatenate(chunk_results))

In [152]:
# Paper ids and tokenised abstracts (split on single spaces), in table order.
ids = [*papers["id"]]
corpus = [*papers["abstract"].str.split(" ")]

# Model factories to compare; each is later called as factory(ids, corpus).
models = []
models.append(("BM25 unigrams", partial(BM25Retrieval, use_bigrams=False)))
models.append(("BM25 bigrams", partial(BM25Retrieval, use_bigrams=True)))

In [153]:
# Split the ground truth by keyword level: levels <= 1 are treated as general,
# levels >= 2 as specific.
general_keywords = list(filter(lambda kw: kw["level"] <= 1, keywords))
specific_keywords = list(filter(lambda kw: kw["level"] >= 2, keywords))
test_sets = [
    ("general keywords", general_keywords),
    ("specific_keywords", specific_keywords),
]

In [154]:
# Evaluate every model on every test set, using the first 1000 keywords of
# each set and 2 worker processes.
for model_name, model_factory in models:
    model_instance = model_factory(ids, corpus)
    for test_set_name, test_set in test_sets:
        mAP = mean_average_percision(model_instance, test_set[:1000], 2)
        print(f"{model_name} on {test_set_name} :{mAP!s} mAP")

BM25 unigrams on general keywords :0.056651136157939215 mAP
BM25 unigrams on specific_keywords :0.5145989750461202 mAP
BM25 bigrams on general keywords :0.03751044645370631 mAP
BM25 bigrams on specific_keywords :0.48493162987748234 mAP


In [113]:
# Fit a gensim Phrases model on the tokenised abstracts; applying it merges
# detected token pairs into one token (e.g. "neural_network" in the next cell).
# NOTE(review): min_count=1 / threshold=10 look hand-picked — confirm they were tuned.
phrases = Phrases(corpus, min_count=1, threshold=10)

In [82]:
# Sanity check: apply the phrase model to two hand-written token lists.
list(phrases[[["neural", "network"], ["kitti", "dataset"]]])

[['neural_network'], ['kitti_dataset']]

In [96]:
# Build a BM25 index over the phrase-transformed corpus. Later cells index
# `papers` positionally with its scores, so `corpus` must keep the row order
# of `papers`.
bm25_model = BM25(phrases[corpus])

In [200]:
# Rank all papers for a sample one-word query with the phrase-aware BM25 model.
query = phrases["kitti".split(" ")]
scores = bm25_model.get_scores(query)
# argsort is ascending, so reverse it to put the best-scoring papers first.
sorted_score_indices = np.argsort(scores)[::-1]
# Show the ids of the ten best matches (the original `.iloc[]` was a syntax error).
papers["id"].iloc[sorted_score_indices[:10]]

In [283]:
def get_ranked_papers(papers, query):
    """Rank papers for ``query`` with the notebook-level BM25 model.

    Args:
        papers: DataFrame with an ``"id"`` column. Its rows must be in the same
            order as the corpus that ``bm25_model`` was built from, because the
            scores are attached positionally.
        query: query string; it is split on single spaces and passed through
            the notebook-level ``phrases`` model before scoring.

    Returns:
        DataFrame with columns ``"id"`` and ``"score"``, sorted by descending
        score and restricted to papers with a score greater than zero.
    """
    query_tokens = phrases[query.split(" ")]
    # Work on an explicit copy: adding a column to the `papers[["id"]]`
    # selection directly raises pandas' SettingWithCopyWarning.
    ranked = papers[["id"]].copy()
    ranked["score"] = bm25_model.get_scores(query_tokens)
    ranked = ranked.sort_values(by="score", ascending=False)
    return ranked[ranked["score"] > 0]
    
    

In [130]:
def get_top_n(query, n):
    """Print the ``n`` best BM25 scores for ``query`` and return the matching papers.

    Args:
        query: query string; split on single spaces and passed through the
            notebook-level ``phrases`` model.
        n: number of papers to return (expected to be >= 0).

    Returns:
        DataFrame: the rows of ``papers`` with the highest scores, best first.
    """
    scores = np.array(bm25_model.get_scores(phrases[query.split(" ")]))
    # Reverse first, then truncate: the previous `[-n:][::-1]` form returned
    # every paper for n == 0 because `[-0:]` selects the whole array.
    # Computing the order once also avoids a second argsort.
    top_indices = np.argsort(scores)[::-1][:n]
    print(scores[top_indices])
    return papers.iloc[top_indices]


get_top_n("kitti", 3)

[11.81999861 10.16513453  9.46965468]


Unnamed: 0,id,abstract
0,2150066425,are we ready for autonomous driving the kitti vision benchmark suite today visual recognition systems are still rarely employed in robotics applications perhaps one of the main reasons for this is the lack of demanding benchmarks that mimic such scenarios in this paper we take advantage of our autonomous driving platform to develop novel challenging benchmarks for the tasks of stereo optical flow visual odometry slam and 3d object detection our recording platform is equipped with four high resolution video cameras a velodyne laser scanner and a state of the art localization system our benchmarks comprise stereo and optical flow image pairs stereo visual odometry sequences of km length and more than 200k 3d object annotations captured in cluttered scenarios up to cars and pedestrians are visible per image results from state of the art algorithms reveal that methods ranking high on established datasets such as middlebury perform below average when being moved outside the laboratory t...
26609,2972567833,estimating object shape and movement using local occupancy grid maps abstract estimating motion and shape of surrounding objects reliably and accurately is a fundamental challenge in the study of interactions between cooperative traffic participants this paper proposes a new approach that utilizes free space information obtained from a lidar sensor and object local grid maps in order to simultaneously estimate the shape and movement state of objects with arbitrary shape we evaluated our approach in several simulated scenarios and found that the movement and shape estimation results are very close to the ground truth finally we did a qualitative evaluation on real data extracted from the kitti benchmark
1775,2100887497,road terrain detection avoiding common obstacle detection assumptions using sensor fusion obstacle detection is a fundamental task for advanced driver assistance systems adas and self driving cars several commercial systems like adaptive cruise controls and collision warning systems depend on them to notify the driver about a risky situation several approaches have been presented in the literature in the last years however most of them are limited to specific scenarios and restricted conditions in this paper we propose a robust sensor fusion based method capable of detecting obstacles in a wide variety of scenarios using a minimum number of parameters our approach is based on the spatial relationship on perspective images provided by a single camera and a 3d lidar experimental tests have been carried out in different conditions using the standard road kitti benchmark obtaining positive results


In [84]:
# Load the keyword table from CSV.
# NOTE(review): this rebinds `keywords`, which an earlier cell loads from JSON
# as a list of dicts; the cells that build `general_keywords` /
# `specific_keywords` expect that list form, not this DataFrame.
keywords = pd.read_csv("../data/kit_expert_2017_keywords.csv")
keywords.head()

Unnamed: 0,_id,keyword,paper_ids,level,keyword_id
0,124961601,conical surface,"['1970806100', '1984018685', '1996418911', '2006496599', '2032348195', '2038303896', '2040654611', '2059591544', '2071437655', '2228644465', '2330571978', '2414912487', '2791412389']",2,124961601
1,15122004,open learning,['2557711330'],4,15122004
2,2780556066,stereoelectronic effect,['2108143480'],2,2780556066
3,95942069,evolutionary arms race,['1980707088'],3,95942069
4,2777714038,simultaneous editing,['2756171889'],2,2777714038


In [262]:
# Draw a reproducible sample of 1000 level-4 keywords as the evaluation set.
level_four_keywords = keywords.loc[keywords["level"].eq(4)]
test_data = level_four_keywords.sample(n=1000, random_state=1)
test_data.head()

Unnamed: 0,_id,keyword,paper_ids,level,keyword_id
273,2781259782,uridine,['2323265909'],4,2781259782
17448,38087914,chain propagation,"['174634246', '195630349']",4,38087914
19408,138111711,double hashing,['2253629710'],4,138111711
19539,31447003,spline interpolation,"['2000239458', '2265976587', '591084388']",4,31447003
9450,125189844,inherent viscosity,['2065409933'],4,125189844


In [274]:
def _parse_paper_ids(raw):
    """Turn a stringified id list such as "['1', '2']" into a real list.

    Values that are not strings (e.g. lists produced by an earlier run of this
    cell) are returned unchanged, so the cell can be re-run safely.
    """
    if isinstance(raw, str):
        # The CSV stores the list with single quotes; JSON needs double quotes.
        return json.loads(raw.replace("'", '"'))
    return raw


# Rebind instead of assigning into the sampled frame in place.
test_data = test_data.assign(paper_ids=test_data["paper_ids"].apply(_parse_paper_ids))

In [282]:
%%time
aps = []
for i, row in test_data.iterrows():
    query = row["keyword"]
    relevant_ids = row["paper_ids"]
    ranked_ids = get_ranked_papers(papers, query)["id"]
    aps.append(average_percision(ranked_ids, relevant_ids))
print(np.mean(aps))

CPU times: user 886 ms, sys: 128 ms, total: 1.01 s
Wall time: 995 ms


In [233]:
# Spot check for a single query: ids of the top-10 BM25 hits for "east asia".
ranked_ids = get_top_n("east asia", 10)["id"]
# NOTE(review): the `id` Series displayed below is int64 while this id is a
# string — confirm the comparison in later cells is not silently always False.
relevant_ids=["289032619"]
ranked_ids

[9.67137199 8.84735745 7.56296978 7.56296978 6.91187491 6.71370959
 6.42217745 5.82328248 5.69349656 5.65909837]


16230    1966508755
1115     2803010032
4763     2145824827
7663     2145530725
10082    2607788411
8039     2102458273
23740    2972841205
22450    2044871596
7534     2787439179
13203    2102969650
Name: id, dtype: int64

0.0

In [156]:
# Turn the ranked id Series into a frame with a 1-based rank in column "index".
# Guarded so that re-running the cell is a no-op: once `ranked_ids` is already
# the frame, a second reset_index() would fail because "index" already exists.
if isinstance(ranked_ids, pd.Series):
    ranked_ids = ranked_ids.reset_index(drop=True).reset_index()
    ranked_ids["index"] += 1
ranked_ids

Unnamed: 0,index,id
0,1,2120361619
1,2,2010318474
2,3,2099375819
3,4,2171395629
4,5,2089032619


In [163]:
# Keep only the ranked rows whose id is among the relevant ids; reset_index()
# preserves the previous row labels as an extra column.
is_relevant = ranked_ids["id"].isin(relevant_ids)
r = ranked_ids.loc[is_relevant].reset_index()
r

Unnamed: 0,level_0,index,id
0,0,1,2120361619
1,2,3,2099375819
