# Austin tutorial.
In this tutorial we will use a community survey from Austin, Texas, and analyze its comments with Key Point Analysis (KPA).

## 1. KPA on a random sample of 1000 sentences from 2016

### 1.A Read a random sample of 1000 sentences from the 2016 comments

In [None]:
import csv
import random


# Load every survey sentence; csv.DictReader turns each row into a dict keyed
# by the header names. newline='' is what the csv module asks for so that line
# breaks embedded in quoted fields (likely in free-text comments) are parsed
# correctly instead of being translated by the file object.
with open('./dataset_austin_sentences.csv', newline='') as csv_file:
    reader = csv.DictReader(csv_file)
    sentences = list(reader)

# Keep only the 2016 rows; DictReader yields strings, hence the '2016' literal.
sentences_2016 = [sentence for sentence in sentences if sentence['Year'] == '2016']

# Fixed seed so the same 1000 sentences are drawn on every run.
# Note: random.sample raises ValueError if fewer than 1000 rows are available.
random.seed(0)
random_sample_sentences_2016 = random.sample(sentences_2016, 1000)

### 1.B Run KPA on the random sample

In [None]:
import os

from debater_python_api.api.debater_api import DebaterApi
from debater_python_api.api.clients.keypoints_client import KpAnalysisUtils


# Initialise logging through the helper shipped with the keypoints client.
KpAnalysisUtils.init_logger()

# Read the key from the environment so a real credential never has to be pasted
# into (and saved with) the notebook. When DEBATER_API_KEY is not set, this
# falls back to the original placeholder, which must then be replaced by hand.
api_key = os.environ.get('DEBATER_API_KEY', '<api-key>')
debater_api = DebaterApi(apikey=api_key)
keypoints_client = debater_api.get_keypoints_client()

# Domain name passed to every upload and analysis call made by run_kpa below.
domain = 'kp_based_survey_example'

In [None]:
def run_kpa(sentences):
    """Upload the given sentences and run a key point analysis job on them.

    Each item of ``sentences`` is read through its 'texts' and 'ids' keys.
    The comments are uploaded to the notebook-level ``domain`` through
    ``keypoints_client``; after ``wait_till_all_comments_are_processed``
    returns, a job requesting 20 top key points is started on exactly the
    uploaded ids.

    Returns whatever the job future's ``get_result`` call returns.
    """
    comment_texts = [entry['texts'] for entry in sentences]
    comment_ids = [entry['ids'] for entry in sentences]

    # dont_split=True: presumably because the input is already one sentence
    # per comment -- confirm against the keypoints client documentation.
    keypoints_client.upload_comments(
        domain=domain,
        comments_ids=comment_ids,
        comments_texts=comment_texts,
        dont_split=True,
    )
    keypoints_client.wait_till_all_comments_are_processed(domain)

    job_params = {'n_top_kps': 20}
    job_future = keypoints_client.start_kp_analysis_job(
        domain=domain,
        comments_ids=comment_ids,
        run_params=job_params,
    )
    return job_future.get_result(high_verbosity=True, polling_timout_secs=5)

In [None]:
from austin_utils import print_results

# Run KPA on the random 2016 sample and print the result.
# At this point run_kpa is the first definition, which returns only the result;
# later cells redefine it to return (result, job_id), so re-running this cell
# after those cells would bind a tuple here instead.
kpa_result_random_1000_2016 = run_kpa(random_sample_sentences_2016)
print_results(kpa_result_random_1000_2016, n_sentences_per_kp=2, title='Random sample 2016')

### Improve coverage by taking highest quality sentences
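Instead of a random sample, we score every 2016 sentence with the argument quality service and run KPA on the 1000 highest-scoring sentences.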

In [None]:
arg_quality_client = debater_api.get_argument_quality_client()

# One request per 2016 sentence, all scored against the same topic string.
arg_quality_scores = arg_quality_client.run([
    {'sentence': row['texts'], 'topic': 'Austin'}
    for row in sentences_2016
])

# Pair each score with its sentence and order the pairs best-first. The key
# looks at the score only, so the sentence dicts themselves are never compared.
sorted_aq_sentences_2016 = [
    scored_row[1]
    for scored_row in sorted(
        zip(arg_quality_scores, sentences_2016),
        key=lambda scored_row: scored_row[0],
        reverse=True,
    )
]
top_aq_sentences_2016 = sorted_aq_sentences_2016[:1000]

Show the top and bottom argument-quality (AQ) sentences.

In [None]:
from austin_utils import split_sentences_to_lines

# sorted_aq_sentences_2016 is ordered best-first, so the head of the list holds
# the highest-scoring sentences and the tail the lowest-scoring ones.
top_10_sentences = [entry['texts'] for entry in sorted_aq_sentences_2016[:10]]
print('top_10_sentences: ')
print('\n'.join(split_sentences_to_lines(top_10_sentences, 1)))

bottom_10_sentences = [entry['texts'] for entry in sorted_aq_sentences_2016[-10:]]
print('\n\nbottom_10_sentences: ')
print('\n'.join(split_sentences_to_lines(bottom_10_sentences, 1)))

In [None]:
# Same analysis as for the random sample, but on the 1000 top-AQ sentences.
# run_kpa is still the first definition here, so a bare result is returned.
kpa_result_top_aq_1000_2016 = run_kpa(top_aq_sentences_2016)
print_results(kpa_result_top_aq_1000_2016, n_sentences_per_kp=2, title='Top aq 2016')

In [None]:
def run_kpa(sentences):
    """Upload sentences, run key point analysis, return ``(result, job_id)``.

    Replaces the earlier ``run_kpa`` in this notebook: the job is started with
    'clustering_threshold' and 'mapping_threshold' both set to 0.95 in addition
    to 'n_top_kps': 20, and the job id is returned alongside the result.

    Each item of ``sentences`` is read through its 'texts' and 'ids' keys.
    """
    def pluck(key):
        # One full pass over the input per requested key.
        return [entry[key] for entry in sentences]

    comment_texts = pluck('texts')
    comment_ids = pluck('ids')

    upload_kwargs = dict(
        domain=domain,
        comments_ids=comment_ids,
        comments_texts=comment_texts,
        dont_split=True,
    )
    keypoints_client.upload_comments(**upload_kwargs)
    keypoints_client.wait_till_all_comments_are_processed(domain)

    analysis_params = {
        'n_top_kps': 20,
        'clustering_threshold': 0.95,
        'mapping_threshold': 0.95,
    }
    job = keypoints_client.start_kp_analysis_job(
        domain=domain,
        comments_ids=comment_ids,
        run_params=analysis_params,
    )

    outcome = job.get_result(high_verbosity=True, polling_timout_secs=5)
    job_id = job.get_job_id()
    return outcome, job_id

In [None]:
# Re-run on the top-AQ 2016 sentences with the redefined run_kpa, which also
# returns the job id; the id is kept so a later run can reuse this job's key points.
kpa_result_top_aq_1000_2016, kpa_top_aq_1000_2016_job_id = run_kpa(top_aq_sentences_2016)
print_results(kpa_result_top_aq_1000_2016, n_sentences_per_kp=2, title='Top aq 2016')

Show one key point together with its top 5 and bottom 5 matches.

In [None]:
from austin_utils import print_top_and_bottom_matches_for_kp


# Print matches for a single key point from the top-AQ 2016 result; the two
# trailing 5s are presumably the number of top and bottom matches to show.
# The key point text must be one that appears in the result -- verify after re-running.
print_top_and_bottom_matches_for_kp(kpa_result_top_aq_1000_2016, 'Traffic congestion needs major improvement', 5, 5)

In [None]:
# Same selection as for 2016: keep the 2017 rows, score them for argument
# quality against the topic 'Austin', and take the 1000 best.
sentences_2017 = [row for row in sentences if row['Year'] == '2017']

arg_quality_scores = arg_quality_client.run([
    {'sentence': row['texts'], 'topic': 'Austin'}
    for row in sentences_2017
])

# Sort on the score only (descending); the sentence dicts are never compared.
sorted_aq_sentences_2017 = [
    row
    for _score, row in sorted(
        zip(arg_quality_scores, sentences_2017),
        key=lambda scored_row: scored_row[0],
        reverse=True,
    )
]
top_aq_sentences_2017 = sorted_aq_sentences_2017[:1000]

In [None]:
def run_kpa(sentences, key_points_by_job_id=None, run_params=None):
    """Upload sentences, run key point analysis, return ``(result, job_id)``.

    This is the final ``run_kpa`` of the notebook and supersedes the earlier
    definitions.

    Args:
        sentences: items read through their 'texts' and 'ids' keys.
        key_points_by_job_id: forwarded unchanged to ``start_kp_analysis_job``;
            the notebook passes the id of a previous job here so that job's
            key points are reused. ``None`` (default) forwards ``None``.
        run_params: optional dict of overrides merged on top of the defaults
            ``{'n_top_kps': 20, 'clustering_threshold': 0.95,
            'mapping_threshold': 0.95}``. ``None`` (default) uses the defaults
            as-is, which is the original behaviour.

    Returns:
        A tuple of the future's ``get_result`` value and its ``get_job_id`` value.
    """
    sentences_texts = [sentence['texts'] for sentence in sentences]
    sentences_ids = [sentence['ids'] for sentence in sentences]

    # Build a fresh dict on every call so caller-supplied overrides never leak
    # into later calls and the caller's dict is never mutated.
    effective_run_params = {'n_top_kps': 20,
                            'clustering_threshold': 0.95,
                            'mapping_threshold': 0.95}
    if run_params:
        effective_run_params.update(run_params)

    keypoints_client.upload_comments(domain=domain,
                                     comments_ids=sentences_ids,
                                     comments_texts=sentences_texts,
                                     dont_split=True)

    keypoints_client.wait_till_all_comments_are_processed(domain)

    future = keypoints_client.start_kp_analysis_job(domain=domain, comments_ids=sentences_ids,
                                                    run_params=effective_run_params,
                                                    key_points_by_job_id=key_points_by_job_id)

    kpa_result = future.get_result(high_verbosity=True, polling_timout_secs=5)
    return kpa_result, future.get_job_id()

In [None]:
# Analyse the top-AQ 2017 sentences, passing the 2016 job id as
# key_points_by_job_id so the 2016 key points are used; the 2017 job id is discarded.
kpa_result_top_aq_1000_2017, _ = run_kpa(top_aq_sentences_2017, kpa_top_aq_1000_2016_job_id)
print_results(kpa_result_top_aq_1000_2017, n_sentences_per_kp=2, title='Top aq 2017, using 2016 key points')

In [None]:
from austin_utils import compare_results

# Compare the 2016 and 2017 results, which share the 2016 key points; the
# string arguments are presumably display labels for each result -- confirm in austin_utils.
compare_results(kpa_result_top_aq_1000_2016, '2016', kpa_result_top_aq_1000_2017, '2017')

Drill down into the traffic topic.

In [None]:
def get_sentences_to_annotation(sentences):
    """Map each sentence to the set of concept titles annotated in it.

    All sentences are sent to the term wikifier client in a single ``run``
    call; the i-th returned annotation list is paired with the i-th sentence,
    and only each annotation's ``['concept']['title']`` is kept.

    Returns a dict from sentence text to a set of titles. If the same text
    occurs more than once, the annotations of its last occurrence are kept.
    """
    wikifier = debater_api.get_term_wikifier_client()
    annotations_per_sentence = wikifier.run(sentences)
    return {
        text: {mention['concept']['title'] for mention in mentions}
        for text, mentions in zip(sentences, annotations_per_sentence)
    }

In [None]:
sentences_2016_texts = [row['texts'] for row in sentences_2016]
sentence_to_annotations = get_sentences_to_annotation(sentences_2016_texts)

# Union of every sentence's concept titles: deduplicated, then sorted.
all_annotations = sorted(set().union(*sentence_to_annotations.values()))

In [None]:
# Concept to drill into, and the relatedness score an annotation has to exceed.
concept = 'traffic'
threshold = 0.5

term_relater_client = debater_api.get_term_relater_client()

# One [concept, annotation] pair per known annotation; the scores come back in
# the same order, so every pair can be zipped with its own score.
concept_annotation_pairs = [[concept, candidate] for candidate in all_annotations]
scores = term_relater_client.run(concept_annotation_pairs)
matched_annotations = [
    candidate
    for candidate, relatedness in zip(all_annotations, scores)
    if relatedness > threshold
]

In [None]:
# Show every annotation whose relatedness to `concept` exceeded `threshold`.
# TODO: optionally print only the top matched annotations (the earlier
# commented-out draft here was copied from the AQ cell and printed top_10_sentences).
print(matched_annotations)

In [None]:
# Keep a sentence when at least one of its annotations is a matched annotation.
matched_sentences_texts = [
    text
    for text in sentences_2016_texts
    if not sentence_to_annotations[text].isdisjoint(matched_annotations)
]
print('Running over %d sentences' % len(matched_sentences_texts))

In [None]:
# Hash the matched texts once: testing membership against the list would scan
# it for every 2016 sentence (quadratic), while a set lookup is constant-time.
matched_texts_lookup = set(matched_sentences_texts)
matched_sentences = [sentence for sentence in sentences_2016 if sentence['texts'] in matched_texts_lookup]

In [None]:
# Run KPA only on the traffic-related 2016 sentences. No previous job id is
# passed (None), and the job id of this run is discarded.
kpa_result_traffic_2016, _ = run_kpa(matched_sentences, None)
print_results(kpa_result_traffic_2016, n_sentences_per_kp=2, title='Traffic KPA 2016')