# Geosure: Experience reports

### Import required libraries

In [11]:
import json
import math
import os
import pandas as pd
import pywraps2 as s2

from glob import glob

### Get list of files with its path.

In [12]:
directory = '/code/geosure/reports/'
files_list = []
for path, folders, files in os.walk(directory):
    for file in files:
        files_list.append(os.path.join(path, file))

### Process data as JSON objects

This is useful for parsing necessary fields and future analysis

In [13]:
def include_brackets(data_str):
    if not data_str.startswith('{'):
        data_str = '{' + data_str
    if not data_str.endswith('}'):
        data_str = data_str + '}'
    return data_str


def process_file(file_path):
    # read file as string.
    with open(file_path, 'rb') as f:
        data = f.read().decode('utf-8')

    # There are some files which contains multiple reports.
    split_data = [include_brackets(report) for report in data.split('}{')]

    # Return as json objects.    
    json_data = [json.loads(rep) for rep in split_data]

    return json_data

json_objects = [process_file(file) for file in files_list]
# Flatten list of list.
json_objects = [item for sublist in json_objects for item in sublist]

print('Total number of reports: ', len(json_objects))
# print(json_objects[1])

Total number of reports:  553


Now, we will create a table using only location and sentiment values for each report

In [14]:
def create_dict(obj):
    return {
        'latitude': obj['geometry']['coordinates'][1],
        'longitude': obj['geometry']['coordinates'][0],
        'sentiment': obj['properties']['sentiment']
    }

reports = [create_dict(obj) for obj in json_objects]

 ### Load geosure latest master data file.

In [15]:
def check_row(row):
    if row['latitude'] > 200 or row['longitude'] > 200:
        return False
    if row['latitude'] < -200 or row['longitude'] < -200:
        return False
        
    return True


def load_df(df_path):
    fields = ['geosure_id', 'latitude', 'longitude']
    df = pd.read_csv(df_path, encoding='utf-16', sep='\t')
    df = df[fields]
    
    return list(df.T.to_dict().values())

dfs = [load_df(f) for f in glob('/code/geosure/mdf/*.txt')]
mdf = [item for sublist in dfs for item in sublist]

print('Length of Master Data File: ', len(mdf))

Length of Master Data File:  35660


### Associate reports with Geosure ids.

For each Geosure id, we create a search window whose center is the latitude and longitude coordinates. the length of the bounding box will be 5 km

In [16]:
earth_radius = 6371.01
distance = 5 #km
distdeg = (distance / earth_radius) * (180 / math.pi)
size = s2.S2LatLng_FromDegrees(distdeg, distdeg)

### Average weights

In [17]:
def average_weights(reports_array):
    total_weights = sum([i['weight'] for i in reports_array])

    return total_weights / len(reports_array)

def compute_weight(center, latlng):
    # Compute distance in km. Considering the earth as a sphere, corresponds to the arc length.
    dist = center.GetDistance(latlng).degrees() * earth_radius
    if dist == 0:
        weight = 1
    else:
        weight = 1 / distance
    
    return dist, weight

### Check  if report is within the created rectangle with center in the geosure_id coordinates

In [18]:
def check_report(report_latlng, rect):
    if rect.Contains(report_latlng):
        return True
    
    return False

### Count sentiment for positive and negative values

In [19]:
def count_sentiment(reports_array):
    pos = [r['sentiment'] for r in reports_array if r['sentiment'] == 'positive']
    neg = [r['sentiment'] for r in reports_array if r['sentiment'] == 'negative']

    return len(pos), len(neg)

In [20]:
def get_reports_array(report, center):
    reports_array = []
    report_latlng = s2.S2LatLng_FromDegrees(report['latitude'], report['longitude'])

    if rect.Contains(report_latlng):
        dist, weight = compute_weight(rect.GetCenter(), report_latlng)
        
        reports_array.append({'latitude': report['latitude'],
                              'longitude': report['longitude'],
                              'distance': dist,
                              'weight': weight,
                              'sentiment': report['sentiment']
        })
    return reports_array
        
        
results = dict()
for i, row in enumerate(mdf):
    if i % 10000 == 0:
        print("row: ", i)
    if not check_row(row):
        continue
    center = s2.S2LatLng_FromDegrees(row['latitude'], row['longitude'])
    rect = s2.S2LatLngRect_FromCenterSize(center, size)

    for report in reports:
        reports_array = get_reports_array(report, center)

        if len(reports_array) == 0:
            continue        

        # Count sentiment values.
        pos, neg = count_sentiment(reports_array)
        avg = average_weights(reports_array)

        results[row['geosure_id']] = {'reports': reports_array,
                                      'total_positive': pos,
                                      'total_negative': neg,
                                      'average': avg,
                                      'latitude': row['latitude'],
                                      'longitude': row['longitude']
                                     }

row:  0
row:  10000
row:  20000
row:  30000


In [24]:
results

{'Q930500026': {'reports': [{'latitude': 4.90143131700653,
    'longitude': 114.92685018045569,
    'distance': 120.70806479998866,
    'weight': 0.2,
    'sentiment': 'positive'}],
  'total_positive': 1,
  'total_negative': 0,
  'average': 0.2,
  'latitude': 4.8902779999999995,
  'longitude': 114.942222},
 'Q930500028': {'reports': [{'latitude': 41.71191816532232,
    'longitude': 44.783964000005085,
    'distance': 44.70337802941088,
    'weight': 0.2,
    'sentiment': 'positive'}],
  'total_positive': 1,
  'total_negative': 0,
  'average': 0.2,
  'latitude': 41.709981,
  'longitude': 44.792998},
 'Q930500210': {'reports': [{'latitude': 11.928501612504089,
    'longitude': 79.83405280694213,
    'distance': 153.62260908734612,
    'weight': 0.2,
    'sentiment': 'positive'}],
  'total_positive': 1,
  'total_negative': 0,
  'average': 0.2,
  'latitude': 11.913860000000001,
  'longitude': 79.814472},
 'Q930500544': {'reports': [{'latitude': 31.230405114364046,
    'longitude': 121.4737