In [20]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

In [24]:
## Function to map input major names with output column names in dataframe
def degree_dicts(df, columns):
    """Build a lookup from bachelor's-degree major name to its CIP column name.

    Parameters
    ----------
    df : any
        Unused. Kept so existing ``degree_dicts(data, columns)`` calls still work.
    columns : pandas.DataFrame
        Column dictionary with a 'VARIABLE NAME' column and a
        'NAME OF DATA ELEMENT' column.

    Returns
    -------
    dict
        Maps the text found between "Bachelor's degree in " and the first '.'
        of 'NAME OF DATA ELEMENT' to the matching 'VARIABLE NAME', for every
        row whose variable name contains 'CIP' and whose description contains
        the bachelor's prefix. Empty when no row qualifies.
    """
    prefix = "Bachelor's degree in "

    # na=False: blank cells in the sheet would otherwise yield NaN in the mask,
    # and pandas refuses to filter with a mask that contains NaN.
    is_cip = columns['VARIABLE NAME'].str.contains('CIP', na=False)
    # Only keep rows that really carry the prefix; other CIP rows would end up
    # as a missing-value key in the returned dict.
    is_bachelor = columns['NAME OF DATA ELEMENT'].str.contains(prefix, regex=False, na=False)
    temp_col = columns[(is_cip & is_bachelor).astype(bool)]

    # With nothing to split, expand=True yields no columns and .iloc[:, 1] fails.
    if temp_col.empty:
        return {}

    var_name = temp_col['VARIABLE NAME'].astype('string')

    # Text after the prefix, then everything before the first period.
    degs = temp_col['NAME OF DATA ELEMENT'].str.split(prefix, expand=True).iloc[:, 1]
    degs = degs.str.split('.', expand=True).iloc[:, 0]
    degs = degs.astype('string')
    return dict(zip(degs, var_name))

In [25]:
## Load the working dataset and the column dictionary
## (the column dictionary is what degree_dicts uses to find the right major column)
data = pd.read_csv("../data/final.csv")
columns = pd.read_excel("../data/scorecard/columns-simplified.xlsx")

In [26]:
## Holds, per feature column, the query value and (later) its importance
school_dict = dict()

In [27]:
## Inputs that will eventually come from the front-end.
## Variables that cannot be handled yet are simply left out:
## no weather, cost, or gpa in any of this code.

major = 'Engineering'
zipcode = 92804
sat, act = 1400, 30
states = ['CA', 'NY']
size = 2500
## Environment codes: 1=city, 2=suburb, 3=town, 4=rural
environment = 1

In [28]:
## Seed school_dict with the query value for every feature column.
## Each entry starts as a one-element list; the importance is appended later.

## Selected major -> flag its CIP column
school_dict[degree_dicts(data, columns)[major]] = [1]

## Test scores
school_dict['SAT_AVG'] = [sat]
school_dict['ACTCMMID'] = [act]

## One flag per preferred state
for state in states:
    school_dict[state] = [1]

## Undergraduate enrollment
school_dict['UGDS'] = [size]

## Flag the dummy column of the preferred locale
school_dict['LOCALE_' + str(environment)] = [1]

In [29]:
## Selecting Variable Importance
size_imp = 4
environment_imp = 2
states_imp = 3

## For any variable w/o a corresponding importance value, I set importance to avg (3)
default_imp = 3

## Look the major column up once instead of re-parsing the column dictionary.
major_col = degree_dicts(data, columns)[major]

importance_pairs = [
    (major_col, default_imp),
    ('SAT_AVG', default_imp),
    ('ACTCMMID', default_imp),
    *[(state, states_imp) for state in states],
    ('UGDS', size_imp),
    ('LOCALE_' + str(environment), environment_imp),
]

## Write the importance into slot 1 instead of appending: re-running this cell
## (or listing a state twice) must not grow some lists beyond [value, importance],
## because pd.DataFrame(school_dict) needs every list to have the same length.
## Indexing [0] also works if the entry has already been turned into an array.
for key, imp in importance_pairs:
    school_dict[key] = [school_dict[key][0], imp]

In [30]:
## Pre-process final dataset

## Change locale to 4 categories (integer part of LOCALE / 10)
## NOTE: this overwrites data['LOCALE'], so re-running the cell without
## reloading `data` divides the codes by 10 a second time.
data['LOCALE'] = (data['LOCALE'] / 10).astype(int)

## Fill non-religious schools with value
## Assign the result back: fillna(inplace=True) on a column selection is
## chained assignment and does not reliably update `data`.
data['RELAFFIL'] = data['RELAFFIL'].fillna(-1)

## Change state to dummy variable
## dtype=float: newer pandas emits bool dummies, and bool / bool division
## (used for the normalization below) is not supported.
temp = pd.get_dummies(data, prefix='', prefix_sep='', columns=['STABBR'], dtype=float)

## Change locale to dummy variable
temp = pd.get_dummies(temp, columns=['LOCALE'], dtype=float)

## Get the important columns
## list(...) gives pandas a plain list of labels; .copy() makes the column
## assignments below act on an independent frame.
temp = temp[list(school_dict.keys())].copy()

## Normalize columns between 0 and 1 for both data and query point
## Need to handle highly skewed columns (like size)
## Only the query value (slot 0) is scaled. The importance (and anything after
## it) is carried over untouched; dividing the whole list would shrink the
## importance of large-valued columns such as SAT_AVG or UGDS.
## NOTE: school_dict is updated in place, so re-running this cell scales the
## query values again.
for col in temp.columns:
    col_max = temp[col].max()
    school_dict[col] = [school_dict[col][0] / col_max, *school_dict[col][1:]]
    temp[col] = temp[col] / col_max

In [32]:
## Row 0 of school_dict holds the query values, row 1 the importances
school_frame = pd.DataFrame(school_dict)

## Single-row frame with one column per feature
query = pd.DataFrame(school_frame.iloc[0]).T
## Series of importances, indexed by feature name
importance = school_frame.iloc[1]

In [33]:
## Custom distance function with importance
def custom_dist(x, y, col_imp):
    """Importance-weighted Euclidean distance between two feature vectors.

    Computes ``sqrt(sum_i col_imp[i] * (x[i] - y[i]) ** 2)``.

    Parameters
    ----------
    x, y : array-like of shape (n_features,)
        The two points to compare.
    col_imp : array-like of shape (n_features,)
        Per-feature weights, matched to ``x`` and ``y`` by position. A pandas
        Series is accepted; its index labels are ignored.

    Returns
    -------
    numpy.float64
        The weighted distance (0.0 for empty inputs).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # Drop any pandas index: integer lookups on a label-indexed Series are not
    # positional, so col_imp[i] is unsafe when the labels are column names.
    weights = np.asarray(col_imp, dtype=float)
    return np.sqrt(np.sum(weights * (x - y) ** 2))

In [None]:
## Nearest-neighbour search over the normalized school features, using the
## importance-weighted distance defined above
neigh = NearestNeighbors(
    metric=custom_dist,
    metric_params=dict(col_imp=importance),
)
neigh.fit(temp)