In [1]:
import pandas as pd
%matplotlib inline
import pylab as plt
import numpy as np

# Distance from high school to college campuses

**Use: `easy_install googlemaps` to prepare required Python dependencies.**

**Please note, this notebook is not finished yet. We are still trying to fix the issues we encountered while using the Google Maps API. This notebook is meant to show our work and progress**.

While exploring our dataset we realized we need additional data to perform meaningful yield prediction. One of the features we think might be meaningful in predicting admission yield is the distance from a student's high school to the college campus.

We will user the google maps API to find the locations of high schools and the distance from the schools to each UC campus. 

In [2]:
import googlemaps

# Key left as a courtesy to the instructor.
key = 'AIzaSyAfMIzlBeHc_rJo1n1OgnRVGhvgWxY_MiE'
gmaps = googlemaps.Client(key=key)

## Finding location and distance using Google Maps API

Next, we will define functions to find the following for each high school/UC campus combo:
 - Location of the high school
 - Distance between the high school and the campus

In [3]:
import time
def get_distance(campus_abbr, school_strings):
    if isinstance(school_strings, str):
        school_strings = [school_strings]
    if campus_abbr == 'Universitywide':
        return [np.nan] * len(school_strings)
    campus_str = 'University of California, {}'.format(campus_abbr)
    
    #theres a max of 25 destinations per request so split them up
    N = 25
    chunks = [school_strings[i:i+N] for i in range(0, len(school_strings), N)]
    results = []
    for c in chunks:
        time.sleep(.5) #ensure we dont go over 100 elements/sec limit
#         print()
#         print('getting distance from ' + campus_str + ' to ' + ';\n'.join(c))
        try:
            response = gmaps.distance_matrix(origins=campus_str, destinations=c)
#             print(response)
#             print(response['status'])
            by_hs = response['rows'][0]['elements']
        except Exception as e:
            print("timed out")
            by_hs = [{} for school in c]
#         print('received', by_hs)
        for entry in by_hs:
            if 'distance' in entry:
                results.append(entry['distance']['value'])
            else:
#                 google maps couldnt look up that distance
                results.append(np.nan)
    return results

def get_school_loc_str(df):
    loc = df['school'].values.copy()
    loc += np.where(df['city'].notnull(),  ', '+df['city'], '' )
    loc += np.where(df['state'].notnull(), ', '+df['state'], '' )
    loc += np.where(df['country'].notnull(),  ', '+df['country'], '' )
    return loc

Next, we will use the above functions on our main dataset.

In [4]:
data = pd.read_csv('data/processed.csv')
data

Unnamed: 0,campus,year,school,school_num,city,county,state,country,region,ethnicity,app_num,adm_num,enr_num,app_gpa,adm_gpa,enr_gpa
0,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,51520,Los Angeles,Los Angeles,California,USA,Los Angeles,All,14.0,,,3.620000,,
1,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,51520,Los Angeles,Los Angeles,California,USA,Los Angeles,Asian,8.0,,,3.620000,,
2,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,51520,Los Angeles,Los Angeles,California,USA,Los Angeles,Hispanic/ Latino,5.0,,,3.620000,,
3,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,52910,San Francisco,San Francisco,California,USA,San Francisco,All,58.0,8.0,7.0,3.682931,4.121250,4.088571
4,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,52910,San Francisco,San Francisco,California,USA,San Francisco,Asian,50.0,8.0,7.0,3.682931,4.121250,4.088571
5,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,53075,San Jose,Santa Clara,California,USA,Santa Clara,All,14.0,,,3.640714,,
6,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,53075,San Jose,Santa Clara,California,USA,Santa Clara,Hispanic/ Latino,6.0,,,3.640714,,
7,Berkeley,1994,ACADEMY OUR LADY OF PEACE,52820,San Diego,San Diego,California,USA,San Diego,All,5.0,,,3.786000,,
8,Berkeley,1994,ACALANES HIGH SCHOOL,51315,Lafayette,Contra Costa,California,USA,Contra Costa,All,61.0,30.0,13.0,3.557869,3.828333,3.563846
9,Berkeley,1994,ACALANES HIGH SCHOOL,51315,Lafayette,Contra Costa,California,USA,Contra Costa,Asian,16.0,4.0,,3.557869,3.828333,


## Issues

We have encountered multiple issues while trying to collec our location data. The main problems were:

 - Some high schools returned multiple very similar results
 - The Google Maps API only allows for a small number of API calls per day
 - The API crashed repeatedly
 
Below, one can find different attempts we made to query the API and deduplicate the location results.

In [5]:
get_distance("Berkeley", ["ABRAHAM LINCOLN HIGH SCHOOL, Los Angeles", "LAWRENCEVILLE SCHOOL, Lawrenceville, New jersey"])

[601648, 4656150]

In [6]:
# no_dups = data.drop_duplicates(subset=['school_num'])
# no_dups

In [7]:
# data[data['campus']=='Riverside']

In [8]:
# no_dups[no_dups['campus']=='Riverside']

In [9]:
data['school_loc_str'] = get_school_loc_str(data)
data

Unnamed: 0,campus,year,school,school_num,city,county,state,country,region,ethnicity,app_num,adm_num,enr_num,app_gpa,adm_gpa,enr_gpa,school_loc_str
0,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,51520,Los Angeles,Los Angeles,California,USA,Los Angeles,All,14.0,,,3.620000,,,"ABRAHAM LINCOLN HIGH SCHOOL, Los Angeles, Cali..."
1,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,51520,Los Angeles,Los Angeles,California,USA,Los Angeles,Asian,8.0,,,3.620000,,,"ABRAHAM LINCOLN HIGH SCHOOL, Los Angeles, Cali..."
2,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,51520,Los Angeles,Los Angeles,California,USA,Los Angeles,Hispanic/ Latino,5.0,,,3.620000,,,"ABRAHAM LINCOLN HIGH SCHOOL, Los Angeles, Cali..."
3,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,52910,San Francisco,San Francisco,California,USA,San Francisco,All,58.0,8.0,7.0,3.682931,4.121250,4.088571,"ABRAHAM LINCOLN HIGH SCHOOL, San Francisco, Ca..."
4,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,52910,San Francisco,San Francisco,California,USA,San Francisco,Asian,50.0,8.0,7.0,3.682931,4.121250,4.088571,"ABRAHAM LINCOLN HIGH SCHOOL, San Francisco, Ca..."
5,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,53075,San Jose,Santa Clara,California,USA,Santa Clara,All,14.0,,,3.640714,,,"ABRAHAM LINCOLN HIGH SCHOOL, San Jose, Califor..."
6,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,53075,San Jose,Santa Clara,California,USA,Santa Clara,Hispanic/ Latino,6.0,,,3.640714,,,"ABRAHAM LINCOLN HIGH SCHOOL, San Jose, Califor..."
7,Berkeley,1994,ACADEMY OUR LADY OF PEACE,52820,San Diego,San Diego,California,USA,San Diego,All,5.0,,,3.786000,,,"ACADEMY OUR LADY OF PEACE, San Diego, Californ..."
8,Berkeley,1994,ACALANES HIGH SCHOOL,51315,Lafayette,Contra Costa,California,USA,Contra Costa,All,61.0,30.0,13.0,3.557869,3.828333,3.563846,"ACALANES HIGH SCHOOL, Lafayette, California, USA"
9,Berkeley,1994,ACALANES HIGH SCHOOL,51315,Lafayette,Contra Costa,California,USA,Contra Costa,Asian,16.0,4.0,,3.557869,3.828333,,"ACALANES HIGH SCHOOL, Lafayette, California, USA"


In [10]:
no_dups = data.drop_duplicates(['school_loc_str'])
no_dups.head()

Unnamed: 0,campus,year,school,school_num,city,county,state,country,region,ethnicity,app_num,adm_num,enr_num,app_gpa,adm_gpa,enr_gpa,school_loc_str
0,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,51520,Los Angeles,Los Angeles,California,USA,Los Angeles,All,14.0,,,3.62,,,"ABRAHAM LINCOLN HIGH SCHOOL, Los Angeles, Cali..."
3,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,52910,San Francisco,San Francisco,California,USA,San Francisco,All,58.0,8.0,7.0,3.682931,4.12125,4.088571,"ABRAHAM LINCOLN HIGH SCHOOL, San Francisco, Ca..."
5,Berkeley,1994,ABRAHAM LINCOLN HIGH SCHOOL,53075,San Jose,Santa Clara,California,USA,Santa Clara,All,14.0,,,3.640714,,,"ABRAHAM LINCOLN HIGH SCHOOL, San Jose, Califor..."
7,Berkeley,1994,ACADEMY OUR LADY OF PEACE,52820,San Diego,San Diego,California,USA,San Diego,All,5.0,,,3.786,,,"ACADEMY OUR LADY OF PEACE, San Diego, Californ..."
8,Berkeley,1994,ACALANES HIGH SCHOOL,51315,Lafayette,Contra Costa,California,USA,Contra Costa,All,61.0,30.0,13.0,3.557869,3.828333,3.563846,"ACALANES HIGH SCHOOL, Lafayette, California, USA"


In [11]:
data[data['campus']=="Riverside"]['school_loc_str'].unique()

array(['A B MILLER HIGH SCHOOL, Fontana, California, USA',
       'ABRAHAM LINCOLN HIGH SCHOOL, Los Angeles, California, USA',
       'ABRAHAM LINCOLN HIGH SCHOOL, San Francisco, California, USA',
       'ACALANES HIGH SCHOOL, Lafayette, California, USA',
       'ADOLFO CAMARILLO HIGH SCHOOL, Camarillo, California, USA',
       'AGOURA HIGH SCHOOL, Agoura Hills, California, USA',
       'ALAMEDA HIGH SCHOOL, Alameda, California, USA',
       'ALBANY HIGH SCHOOL, Albany, California, USA',
       'ALEXANDER HAMILTON HIGH SCHOOL, Los Angeles, California, USA',
       'ALHAMBRA HIGH SCHOOL, Alhambra, California, USA',
       'ALTA LOMA HIGH SCHOOL, Alta Loma, California, USA',
       'ALVERNO HIGH SCHOOL, Sierra Madre, California, USA',
       'ANAHEIM HIGH SCHOOL, Anaheim, California, USA',
       'APPLE VALLEY HIGH SCHOOL, Apple Valley, California, USA',
       'AQUINAS HIGH SCHOOL, San Bernardino, California, USA',
       'ARAGON HIGH SCHOOL, San Mateo, California, USA',
       'ARCADIA

In [12]:
no_dups[no_dups['campus']=="Riverside"]['school_loc_str'].unique()

array(['HESPERIA CHRISTIAN SCHOOL, Hesperia, California, USA',
       'KING-DREW MEDICAL MAGNET HIGH, Los Angeles, California, USA',
       'CANYONVILLE CHRISTIAN ACADEMY, Canyonville, Oregon, USA',
       'CALVARY CHAPEL CHRISTIAN SCH, Moreno Valley, California, USA',
       'EXECUTIVE PREPARATORY ACDMY, Gardena, California, USA'],
      dtype=object)

## Saving results

Unfortunately, the googlemaps API crashed repeatedly. We decided, to save the results in a persistent dictionary. This way we were able to save results even if our API calls crashed.

In [13]:
gb = no_dups.groupby('campus')
to_be_done = gb.groups.keys()
campus_distances = {campus:{} for campus in to_be_done}

In [14]:
for campus in to_be_done:
    group = gb.get_group(campus)
    found_distances = campus_distances[campus].keys()
    not_done_group = group[  ~group['school_num'].isin(found_distances)   ]
    
    schools = not_done_group['school_loc_str'].values[:25]
    print("getting the distance from UC" + campus + " to " + str(len(schools)) + " schools")
    distances = get_distance(campus, schools)
    print("distances:", distances)
    new_distances = {id_num:distance for id_num, distance in zip(not_done_group['school_num'], distances)}
    campus_distances[campus].update(new_distances)

getting the distance from UCBerkeley to 25 schools
distances: [601648, 33037, 76043, 790444, 21980, 592504, 79299, 607116, 15660, 5348, 1746371, 600454, 609390, 38774, 168561, 655263, 612043, 51515, 46036, nan, 124045, 641611, 89754, 84186, 56408]
getting the distance from UCDavis to 25 schools
distances: [336147, 264388, 323402, 439923, 97146, 329684, 928956, 361311, 812423, 73399, 100832, 862, 120892, 610869, nan, 434859, 137408, 648412, 653398, 87228, 109851, 648412, 605610, 165189, 316770]
getting the distance from UCIrvine to 25 schools
distances: [58474, nan, nan, 75352, 92414, 81621, 154931, 70254, 24000, 148626, 133374, 22293, 5736676, 400190, 199732, 248320, nan, 52347, 1886946, nan, 4804142, 1918140, 1734174, nan, 77828]
getting the distance from UCLos Angeles to 25 schools
distances: [4573783, 11004, 3561951, 4426183, 4358482, 496111, 2355823, 31898, 20753, 1780754, 4907316, 3245232, 2165886, 1115181, 3157088, 1758190, 4280545, 215077, 4479446, 3317605, 4598571, 5425443, 426

## Results

Below we can find a list of distances from each UC campus to each high school. Unfortunately due to errors while working with the Google Maps API, we miss a large number of high schools. We are planning to fix this issue in the next iteration of our project.

In [15]:
campus_distances

{'Berkeley': {50003: 607116,
  50005: 15660,
  50035: 5348,
  50050: 609390,
  50077: 655263,
  50090: 641611,
  50115: 56408,
  50438: 592504,
  50974: 46036,
  51315: 21980,
  51520: 601648,
  51525: 600454,
  51915: 38774,
  52495: 51515,
  52742: 168561,
  52820: 790444,
  52910: 33037,
  53075: 76043,
  53077: 84186,
  53276: 79299,
  53345: 89754,
  53378: 612043,
  53442: 124045,
  320003: 1746371,
  680400: nan},
 'Davis': {50132: 648412,
  50173: 97146,
  50419: 329684,
  50535: 264388,
  50743: 862,
  50745: 165189,
  50876: 812423,
  50890: 439923,
  50896: 87228,
  50926: 361311,
  50930: 434859,
  51148: 323402,
  51317: 100832,
  51392: 605610,
  52021: 137408,
  52196: 109851,
  52613: 336147,
  52921: 120892,
  53063: 316770,
  53811: 73399,
  54024: 610869,
  54252: 653398,
  290096: 928956,
  525080: nan},
 'Irvine': {50122: 148626,
  50131: 75352,
  50334: 154931,
  50347: 81621,
  50363: 199732,
  51049: 52347,
  51363: 400190,
  51623: 77828,
  52134: 92414,
  5241

In [16]:
data[data['campus']=='Riverside']

Unnamed: 0,campus,year,school,school_num,city,county,state,country,region,ethnicity,app_num,adm_num,enr_num,app_gpa,adm_gpa,enr_gpa,school_loc_str
158727,Riverside,1994,A B MILLER HIGH SCHOOL,50944,Fontana,San Bernardino,California,USA,San Bernardino,All,31.0,27.0,11.0,3.764839,3.836667,3.810000,"A B MILLER HIGH SCHOOL, Fontana, California, USA"
158728,Riverside,1994,A B MILLER HIGH SCHOOL,50944,Fontana,San Bernardino,California,USA,San Bernardino,Hispanic/ Latino,19.0,17.0,6.0,3.764839,3.836667,3.810000,"A B MILLER HIGH SCHOOL, Fontana, California, USA"
158729,Riverside,1994,A B MILLER HIGH SCHOOL,50944,Fontana,San Bernardino,California,USA,San Bernardino,White,5.0,4.0,,3.764839,3.836667,,"A B MILLER HIGH SCHOOL, Fontana, California, USA"
158730,Riverside,1994,ABRAHAM LINCOLN HIGH SCHOOL,51520,Los Angeles,Los Angeles,California,USA,Los Angeles,All,22.0,14.0,6.0,3.279091,3.363571,3.436667,"ABRAHAM LINCOLN HIGH SCHOOL, Los Angeles, Cali..."
158731,Riverside,1994,ABRAHAM LINCOLN HIGH SCHOOL,51520,Los Angeles,Los Angeles,California,USA,Los Angeles,Asian,13.0,10.0,5.0,3.279091,3.363571,3.436667,"ABRAHAM LINCOLN HIGH SCHOOL, Los Angeles, Cali..."
158732,Riverside,1994,ABRAHAM LINCOLN HIGH SCHOOL,51520,Los Angeles,Los Angeles,California,USA,Los Angeles,Hispanic/ Latino,8.0,4.0,,3.279091,3.363571,,"ABRAHAM LINCOLN HIGH SCHOOL, Los Angeles, Cali..."
158733,Riverside,1994,ABRAHAM LINCOLN HIGH SCHOOL,52910,San Francisco,San Francisco,California,USA,San Francisco,All,13.0,,,3.300000,,,"ABRAHAM LINCOLN HIGH SCHOOL, San Francisco, Ca..."
158734,Riverside,1994,ACALANES HIGH SCHOOL,51315,Lafayette,Contra Costa,California,USA,Contra Costa,All,6.0,5.0,,3.311667,3.452000,,"ACALANES HIGH SCHOOL, Lafayette, California, USA"
158735,Riverside,1994,ADOLFO CAMARILLO HIGH SCHOOL,50438,Camarillo,Ventura,California,USA,Ventura,All,12.0,10.0,,3.685000,3.717000,,"ADOLFO CAMARILLO HIGH SCHOOL, Camarillo, Calif..."
158736,Riverside,1994,ADOLFO CAMARILLO HIGH SCHOOL,50438,Camarillo,Ventura,California,USA,Ventura,Asian,6.0,4.0,,3.685000,3.717000,,"ADOLFO CAMARILLO HIGH SCHOOL, Camarillo, Calif..."
