In [46]:
import pandas as pd
import csv, requests, os, re, itertools, googlemaps, codecs
from tqdm import tqdm
from time import sleep
from fuzzywuzzy import fuzz
import simplejson as json

In [47]:
airports = pd.read_csv("airports.csv", header=None)
airrights = pd.read_csv("flight_rights.tsv", sep="\t")
treaties = pd.read_csv("treaties.tsv", sep="\t")
cities = pd.read_csv('cities_en.tsv', sep='\t')
countries = pd.read_csv('countries_en.tsv', sep='\t')
schedules = pd.read_csv('allschedules.tsv', sep='\t')

In [48]:
airports = airports.loc[:, [2, 3, 4]]
airports.columns = ['encity', 'encountry', 'icao_airport']

In [49]:
repl_city = dict(cities.loc[:, ["ukcity", "encity"]].values.tolist()[0::1])
repl_country = dict(countries.loc[:, ["ukcountry", "encountry"]].values.tolist()[0::1])

### replace cities

In [50]:
treaties['enpoints_ua'] = treaties['points_ua']
treaties['enpoints_foreign'] = treaties['points_foreign']
airrights['from_encity'] = airrights['from_city']
airrights['to_encity'] = airrights['to_city']

for uk, en in repl_city.items():
    treaties.enpoints_ua = treaties.enpoints_ua.str.replace(uk, en)
    treaties.enpoints_foreign = treaties.enpoints_foreign.str.replace(uk, en)
    airrights.from_encity = airrights.from_encity.str.replace(uk, en)
    airrights.to_encity = airrights.to_encity.str.replace(uk, en)
    
treaties.enpoints_ua = treaties.enpoints_ua.str.replace('всі', 'all')
treaties.enpoints_foreign = treaties.enpoints_foreign.str.replace('всі', 'all')

### replace countries

In [51]:
treaties['encountry'] = treaties['country']
airrights['encountry'] = airrights['country']

for uk, en in repl_country.items():
    treaties.encountry = treaties.encountry.str.replace(uk, en)
    airrights.encountry = airrights.encountry.str.replace(uk, en)

treaties.encountry = treaties.encountry.str.replace('[^A-Za-z -]', '').str.strip()
airrights.encountry = airrights.encountry.str.replace('[^A-Za-z -]', '').str.strip()
treaties.encountry = treaties.encountry.str.replace('\s+', ' ').str.strip()
airrights.encountry = airrights.encountry.str.replace('\s+', ' ').str.strip()

In [52]:
# add urls to treaties

treaty_urls = pd.read_csv('urls.csv', encoding='cp1251', sep=';')
treaties = treaties.merge(treaty_urls, on='country', how='left')

## replace icao codes of airports

In [53]:
repl_aircity = dict(airports.loc[:, ["icao_airport", "encity"]].values.tolist()[0::1])
repl_aircountry = dict(airports.loc[:, ["icao_airport", "encountry"]].values.tolist()[0::1])

In [54]:
schedules['encity_from'] = schedules['route_from']
schedules['encity_to'] = schedules['route_to']
schedules['encountry_from'] = schedules['route_from']
schedules['encountry_to'] = schedules['route_to']

for icao, en in repl_aircity.items():
    if isinstance(icao, float) or isinstance(en, float):
        continue
    schedules.encity_from = schedules.encity_from.str.replace(icao, en)
    schedules.encity_to = schedules.encity_to.str.replace(icao, en)
    
for icao, en in repl_aircountry.items():
    if isinstance(icao, float) or isinstance(en, float):
        continue
    schedules.encountry_from = schedules.encountry_from.str.replace(icao, en)
    schedules.encountry_to = schedules.encountry_to.str.replace(icao, en)

In [55]:
schedules = schedules.loc[(pd.notnull(schedules.encity_from) &
                           pd.notnull(schedules.encity_to) &
                           pd.notnull(schedules.encountry_from) &
                           pd.notnull(schedules.encountry_to)), ]

## check if there are duplicated city names

In [56]:
cities = (treaties.encities_ua.values.tolist() +
          treaties.encities_foreign.values.tolist() +
          airrights.from_encity.values.tolist() +
          airrights.to_encity.values.tolist() +
          schedules.encity_from.values.tolist() +
          schedules.encity_to.values.tolist())

cities = [city.split("|") for city in cities
          if city != "all" and not isinstance(city, float)]

cities = list(set([city.strip() for city in itertools.chain.from_iterable(cities)]))

AttributeError: 'DataFrame' object has no attribute 'encities_ua'

In [None]:
# for c1, c2 in itertools.combinations(cities, 2):
#     match = fuzz.token_set_ratio(c1, c2)
#     if match > 60:
#         print(c1, c2, match)

In [None]:
countries = (treaties.encountry.values.tolist() +
             airrights.encountry.values.tolist() +
             schedules.encountry_from.values.tolist() +
             schedules.encountry_to.values.tolist())

countries = list(set(countries))

In [None]:
# for c1, c2 in itertools.combinations(countries, 2):
#     match = fuzz.token_set_ratio(c1, c2)
#     if match > 60:
#         print(c1, c2, match)

In [None]:
airrights.to_csv('flight_rights_unicities.tsv', sep='\t', index=False, encoding='utf-8')
treaties.to_csv('treaties_unicities.tsv', sep='\t', index=False, encoding='utf-8')
schedules.to_csv('allschedules_unicities.tsv', sep='\t', index=False, encoding='utf-8')

In [34]:
print(treaties.columns, '\n',
      airrights.columns, '\n',
      schedules.columns, '\n')

Index(['country', 'stage', 'date', 'fw_gen', 'fw_route', 'fw_airline', 'text',
       'cities_ua', 'cities_foreign', 'notfull', 'comments', 'airlines_total',
       'airlines_route', 'encities_ua', 'encities_foreign', 'encountry',
       'url'],
      dtype='object') 
 Index(['date', 'right_id', 'type_of_right', 'airline_name', 'icao_airline',
       'iata_airline', 'country', 'from_city', 'to_city', 'min_freq',
       'max_freq', 'valid_from', 'valid_till', 'annul_date', 'given_by',
       'given_date', 'annul_reason', 'from_encity', 'to_encity', 'encountry'],
      dtype='object') 
 Index(['airline', 'days_week', 'flight', 'route_from', 'route_to',
       'valid_from', 'valid_till', 'encity_from', 'encity_to',
       'encountry_from', 'encountry_to'],
      dtype='object') 



In [17]:
treaties.columns = ['country', 'stage', 'date', 'iata_season', 'fw_gen', 'fw_route',
                    'fw_airline', 'text', 'cities_ua', 'cities_foreign', 'nua_cities',
                    'nforeign_cities', 'notfull', 'comments', 'airlines_total',
                    'airlines_route', 'encities_ua', 'encities_foreign', 'encountry']

airrights.columns = ['date', 'right_id', 'type_of_right', 'airline_name', 'icao_airline',
                     'iata_airline', 'country', 'from_city', 'to_city', 'min_freq',
                     'max_freq', 'valid_from', 'valid_till', 'annul_date', 'given_by',
                     'given_date', 'annul_reason', 'from_encity', 'to_encity', 'encountry']

# Huge json

In [19]:
airrights = pd.read_csv('flight_rights_unicities.tsv', sep='\t')
treaties = pd.read_csv('treaties_unicities.tsv', sep='\t')
schedules = pd.read_csv('allschedules_unicities.tsv', sep='\t')
treaties.replace({'|': '--'}, inplace=True)


print(treaties.columns, '\n\n',
      airrights.columns, '\n\n',
      schedules.columns, '\n\n')

countries = (treaties.encountry.values.tolist() +
             airrights.encountry.values.tolist() +
             schedules.encountry_from.values.tolist() +
             schedules.encountry_to.values.tolist())

countries = sorted(list(set(countries)), key=str.lower)

In [20]:
country_dicts = []

for country in countries:
    if country == 'Ukraine':
        continue
    
    treaties_c = treaties.loc[treaties.encountry == country, :].fillna('')
    airrights_c = airrights.loc[airrights.encountry == country, :].fillna('')
    schedules_c = schedules.loc[((schedules.encountry_from == country) |
                                 (schedules.encountry_to == country)), :].fillna('')

    # make the most restricted conditions in treaty appear first, and condition for all - last.
    # otherwise all given rights will belong to the last condition for all rest points
    treaties_c['sort'] = (treaties_c.encities_ua == 'all').astype(int)
    treaties_c['sort'] += (treaties_c.encities_foreign == 'all').astype(int)
    treaties_c.sort_values(by='sort', inplace=True)
    
    # initialize country dictionary
    country_dict = {
        'encoutry': country,
        'treaty':{
            'conditions': []
        }
    }
    
    country_dict['treaty']['uacountry'] = treaties_c.country.values.tolist()[0]
    country_dict['treaty']['date'] = treaties_c.date.values.tolist()[0]
    #country_dict['treaty']['url'] = condition['url'] # 
    country_dict['treaty']['stage'] = treaties_c.stage.values.tolist()[0]

    chosen_ids = [] # to prevent more specified conditions from being selected to general ones

    # Find flight permissions for every treaty
    for i, condition in enumerate(treaties_c.to_dict(orient='records')):
        
#     new condition dict
#         scopes: 
#             - condition: general limit of flights within condition
#             - point: limit of flights to each point
#             - airline: limit of flights for every airline
            
        new_condition = {
            'ua_from': condition['cities_ua'],
            'en_from': condition['encities_ua'],
            'ua_to': condition['cities_foreign'],
            'en_to': condition['encities_foreign'],
            'comment': condition['comments'],
            'flight_limits': [],
            'airline_limits': [],
            'permissions': []
        }
        
        # Add points as other or all
        
        if condition['airlines_total'] != 999:
            new_condition['airline_limits'].append({'limit': condition['airlines_total'], 'scope': 'condition'})
            
        if (condition['airlines_route'] != 999
            and condition['airlines_route'] != condition['airlines_total']):
            new_condition['airline_limits'].append({'limit': condition['airlines_route'], 'scope': 'point'})
            
        if condition['fw_gen'] != 999:
            new_condition['flight_limits'].append({'limit': condition['fw_gen'], 'scope': 'condition'})
            
        if (condition['fw_route'] != 999
            and condition['fw_route'] != condition['fw_gen']):
            new_condition['flight_limits'].append({'limit': condition['fw_route'], 'scope': 'point'})
            
        if (condition['fw_airline'] != 999
            and condition['fw_airline'] != condition['fw_gen']
            and condition['fw_airline'] != condition['fw_route']):
            new_condition['flight_limits'].append({'limit': condition['fw_airline'], 'scope': 'airline'})
        
        
        encities_ua = condition['encities_ua'] if condition['encities_ua'] != 'all' else '.'
        encities_foreign = condition['encities_foreign'] if condition['encities_foreign'] != 'all' else '.'

        condition_rights = airrights_c.loc[ airrights_c.from_encity.str.contains(encities_ua) &
                                            airrights_c.to_encity.str.contains(encities_foreign) &
                                            ~ airrights_c.right_id.isin(chosen_ids), ] 
        
        # add chosen rights to list of all chosen  fly permissions
        chosen_ids += condition_rights.right_id.values.tolist()

        condition_rights = condition_rights.to_dict(orient='records')

        # from schedules select flights, that are made according to this rule and fill to dict
        for i, right in enumerate(condition_rights):
            right_schedule = schedules_c.loc[((schedules_c.airline.str.contains(right['iata_airline']) |
                                                schedules_c.airline.str.contains(right['icao_airline'])) &
                                               (schedules_c.encity_from.str.contains(right['from_encity']) |
                                                schedules_c.encity_from.str.contains(right['to_encity'])) &
                                               (schedules_c.encity_to.str.contains(right['from_encity']) |
                                                schedules_c.encity_to.str.contains(right['to_encity']))), ]

            # add info about total # of route flights and flight numbers within this right
            if len(list(set(right_schedule.flight.values.tolist()))) > 0:
                condition_rights[i]['schedules'] = {
                    'freq': right_schedule.days_week.apply(lambda x: len(str(x))).drop_duplicates().sum(),
                    'flights': list(set(right_schedule.flight.values.tolist()))
                }

        new_condition['permissions'] = condition_rights
                
        #! add conditions to countrydict
        country_dict['treaty']['conditions'].append(new_condition)

    # add info about treaty    
    

    
    country_dicts.append(country_dict)

{'encountry': 'China',
 'flight_rights': [{'airline_name': 'МАУ',
   'annul_date': '',
   'annul_reason': '',
   'country': 'Китайська Народна Республіка',
   'date': '01.09.2016',
   'encountry': 'China',
   'from_city': 'Київ',
   'from_encity': 'Kiev',
   'given_by': 'Протокол засідання комісії ДАС № 16 від 30.08.2016',
   'given_date': '01.09.2016',
   'iata_airline': 'PS',
   'icao_airline': 'AUI',
   'max_freq': '3',
   'min_freq': 0,
   'right_id': 1524,
   'to_city': 'Шанхай',
   'to_encity': 'Shanghai',
   'type_of_right': 'Регулярні міжнародні',
   'valid_from': '01.09.2016',
   'valid_till': '01.09.2066'},
  {'airline_name': 'ЗЕТАВІА',
   'annul_date': '',
   'annul_reason': '',
   'country': 'Китайська Народна Республіка',
   'date': '09.12.2015',
   'encountry': 'China',
   'from_city': 'Одеса',
   'from_encity': 'Odessa',
   'given_by': 'Протокол засідання комісії ДАС №7 від 09.12.2015',
   'given_date': '11.12.2015',
   'iata_airline': '',
   'icao_airline': '',
   'max_

In [None]:
#with codecs.open('C:/Users/Nadya/Desktop/aviaroutes/avia_all.json', 'w', encoding='utf-8') as f:
#    f.write(json.dumps(huge_json))

# Unify city names, everything in English

In [38]:
gmaps = googlemaps.Client(key="")

In [None]:
cities = (treaties.cities_ua.values.tolist() +
          treaties.cities_foreign.values.tolist() +
          airrights.from_city.values.tolist() +
          airrights.to_city.values.tolist())

cities = [city.split("|") for city in cities
          if city != "всі" and not isinstance(city, float)]

cities = [city for city in itertools.chain.from_iterable(cities)]

cities = pd.DataFrame(list(set(cities)), columns=['ukcity'])
cities['encity'] = None
cities['country'] = None

In [None]:
for i, city in tqdm(cities.iterrows()):
    result = gmaps.geocode(city['ukcity'])
    if len(result) > 0:
        result = [d for d in result[0]['address_components']
                  if 'political' in d['types']]
        cities.loc[i, 'encity'] = result[0]['short_name']
        cities.loc[i, 'country'] = result[-1]['long_name']
    sleep(0.5)

In [None]:
#cities.to_csv('cities_en.tsv', sep='\t', encoding='utf-8', index=False)

# Get eng names of countries

In [None]:
countries = (treaties.country.values.tolist() +
             airrights.country.values.tolist())

countries = pd.DataFrame(list(set(countries)), columns=['ukcountry'])
countries['encountry'] = None

In [None]:
for i, country in tqdm(countries.iterrows()):
    result = gmaps.geocode(country['ukcountry'])
    if len(result) > 0:
        countries.loc[i, 'encountry'] = result[0]['address_components'][0]['long_name']
    sleep(0.5)

In [None]:
#countries.to_csv('countries_en.tsv', sep='\t', encoding='utf-8', index=False)

manually check cities