In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

In [594]:
## Function to map input major names with output column names in dataframe
def degree_dicts(df, columns):
    temp_col = columns[columns['VARIABLE NAME'].str.contains('CIP')]
    var_name = temp_col['VARIABLE NAME']
    var_name = var_name.astype('string')
    
    degs = temp_col['NAME OF DATA ELEMENT'].str.split('Bachelor\'s degree in ', expand=True).iloc[:,1]
    degs = degs.str.split('.', expand=True).iloc[:,0]
    degs = degs.astype('string')
    return dict(zip(degs, var_name))

In [595]:
## Function to normalize query data
def normalize_cols(df, query, imp_only_cols):
    for col in query.columns:
        if col in imp_only_cols:
            continue
        mean = np.mean(df[col])
        std = np.std(df[col])
        query[col] = (query[col] - mean) / std

In [5]:
## Converts user zip to state
def user_zip_to_state(zip_to_state, zipcode):
    return zip_to_state[(zip_to_state['Zip Min'] <= zipcode) & (zip_to_state['Zip Max'] >= zipcode)].iloc[0, 0]

In [597]:
## Creates custom cost column based on user state for in-state and out-of-state tuition
def create_cost_column(data, user_state):
    cost_col = np.zeros(len(data))
    for i in range(len(data)):
        temp = data.iloc[i,:][['STABBR', 'TUITIONFEE_IN', 'TUITIONFEE_OUT']]
        if user_state == temp['STABBR']:
            cost_col[i] += temp['TUITIONFEE_IN']
        else:
            cost_col[i] += temp['TUITIONFEE_OUT']
    return cost_col

In [598]:
## Custom distance function with importance
def custom_dist(x, y, col_imp):
    sum_dist = 0
    for i in range(len(x)):
        sum_dist += col_imp[i]*(x[i] - y[i])**2
    return np.sqrt(sum_dist)

In [599]:
## Data we're currently using along with column names (to retrieve the right major column)
data = pd.read_csv('../data/final.csv')
norm = pd.read_csv('../data/final_standardized.csv')
zip_to_state = pd.read_csv('../data/zip_to_state.csv')
columns = pd.read_excel('../data/scorecard/columns-simplified.xlsx')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [600]:
## Dictionary to create query and importance
school_dict = {}

In [601]:
## All variables we'll be getting from front-end
## Anything commented out is a variable I don't know how to handle yet
## No weather, cost, or gpa in any of this code

## Variable Values
major = "Engineering"
zipcode = 92804
user_state = user_zip_to_state(zip_to_state, zipcode)
sat = 1400
act = 30
states = ["CA", "NY"]
size = 3
environment = 1 ## 1=city, 2=suburb, 3=town, 4=rural
climate = "BWk"
cost = 3

## Importance Variables
size_imp = 4
environment_imp = 2
states_imp = 3
weather_imp = 1
cost_imp = 3
teach_qual_imp = 5
select_imp = 5

In [604]:
## Adding variables to dictionary
school_dict[degree_dicts(data, columns)[major]] = [1]

school_dict['SAT_AVG'] = [sat] 

school_dict['ACTCMMID'] = [act]

for state in states:
    school_dict['STABBR.'+state] = [1]
    
school_dict['UGDS'] = [size]

school_dict['LOCALE.' + str(environment)] = [1]

school_dict['CLIMATE_ZONE.' + climate] = [1]

school_dict['TUITION'] = [cost]

school_dict['TEACH_QUAL'] = [0]

school_dict['SELECT'] = [0]

In [602]:
## Creates custom tuition column based on state
cost_col = create_cost_column(data, user_state)

## Adds in tuition column
data['TUITION'] = cost_col
cost_col = (cost_col - np.mean(cost_col)) / np.std(cost_col)
norm['TUITION'] = cost_col

In [36]:
## Dictionaries to convert categorical user input
size_ref = {1: 2500, 2: 5000, 3: 7500, 4: 10000, 5: 15000}
cost_ref = {1: 2500, 2: 10000, 3: 20000, 4: 35000, 5: 50000}
teach_qual_ref = {1: -1.25, 2: 0.25, 3: 1.75, 4: 3.25, 5: 4.75}

In [605]:
## Selecting Variable Importance

## For any variable w/o a corresponding importance value, I set importance to avg (3)
school_dict[degree_dicts(data, columns)[major]].append(3)
school_dict['SAT_AVG'].append(3)
school_dict['ACTCMMID'].append(3)

for state in states:
    school_dict['STABBR.' + state].append(states_imp)
    
school_dict['UGDS'].append(size_imp)

school_dict['LOCALE.' + str(environment)].append(environment_imp)

school_dict['CLIMATE_ZONE.' + climate].append(weather_imp * 0.2)

school_dict['TUITION'].append(cost_imp)

school_dict['TEACH_QUAL'].append(teach_qual_imp)

school_dict['SELECT'].append(select_imp)

In [606]:
## Convert categorical user input to actual values
school_dict['UGDS'][0] = size_ref[school_dict['UGDS'][0]]
school_dict['TUITION'][0] = cost_ref[school_dict['TUITION'][0]]

In [607]:
select_col = np.zeros(len(norm))
temp = norm[['ADM_RATE_ALL', 'GPA_BOTTOM_TEN_PERCENT']]
for i in range(len(norm)):
    select_col[i] += 0.5*-1*temp.iloc[i, 0] + 0.5*temp.iloc[i, 1]
norm['SELECT'] = select_col

In [608]:
teach_qual_col = np.zeros(len(norm))
temp = norm[['INEXPFTE', 'AVGFACSAL', 'PFTFAC']]
for i in range(len(norm)):
    teach_qual_col[i] += 0.25*temp.iloc[i, 0] + 0.60*temp.iloc[i, 1] + 0.15*temp.iloc[i, 2]
norm['TEACH_QUAL'] = teach_qual_col

In [609]:
query = pd.DataFrame(pd.DataFrame(school_dict).iloc[0, :]).T

## Normalizes query data
normalize_cols(data, query, ['TEACH_QUAL', 'SELECT'])
importance = pd.DataFrame(school_dict).iloc[1, :]

In [610]:
## Adds in normalized tuition column
norm = norm[school_dict.keys()]

In [611]:
neigh = NearestNeighbors(metric=custom_dist, metric_params = {'col_imp': importance})
neigh.fit(norm)

NearestNeighbors(metric=<function custom_dist at 0x7fa5845c04c0>,
                 metric_params={'col_imp': CIP14BACHL          3.0
SAT_AVG             3.0
ACTCMMID            3.0
STABBR.CA           3.0
STABBR.NY           3.0
UGDS                4.0
LOCALE.1            2.0
CLIMATE_ZONE.BWk    0.2
TUITION             3.0
TEACH_QUAL          5.0
SELECT              5.0
Name: 1, dtype: float64})

In [615]:
data.loc[neigh.kneighbors(query, len(data), return_distance=False)[0], ]

Unnamed: 0,UNITID,INSTNM,CITY,STABBR,ZIP,INSTURL,MAIN,NUMBRANCH,CONTROL,REGION,...,CLIMATE_ZONE.BWk,CLIMATE_ZONE.Cfa,CLIMATE_ZONE.Cfb,CLIMATE_ZONE.Csa,CLIMATE_ZONE.Csb,CLIMATE_ZONE.Dfa,CLIMATE_ZONE.Dfb,CLIMATE_ZONE.Dfc,CLIMATE_ZONE.Dwa,TUITION
163,121309,Point Loma Nazarene University,San Diego,CA,92106-2899,www.pointloma.edu/,1,1,2,8,...,0,0,0,0,0,0,0,0,0,36950.0
1139,196103,SUNY College of Environmental Science and Fore...,Syracuse,NY,13210,www.esf.edu/,1,1,1,2,...,0,0,0,0,0,0,1,0,0,19040.0
1155,196291,SUNY Maritime College,Throggs Neck,NY,10465-4198,www.sunymaritime.edu/,1,1,1,2,...,0,1,0,0,0,0,0,0,0,18418.0
1039,190512,CUNY Bernard M Baruch College,New York,NY,10010,www.baruch.cuny.edu/,1,1,1,2,...,0,1,0,0,0,0,0,0,0,15412.0
1180,197708,Yeshiva University,New York,NY,10033-3299,www.yu.edu/,1,1,2,2,...,0,1,0,0,0,0,0,0,0,44900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1933,243744,Stanford University,Stanford,CA,94305,www.stanford.edu/,1,1,2,8,...,0,0,0,0,1,0,0,0,0,53529.0
2245,484613,University of Phoenix-Arizona,Phoenix,AZ,85040,https://www.phoenix.edu/,1,16,3,6,...,0,0,0,0,0,0,0,0,0,9552.0
1478,214777,Pennsylvania State University-Main Campus,University Park,PA,16802-1503,www.psu.edu/,1,23,1,2,...,0,0,0,0,0,0,1,0,0,35514.0
961,183026,Southern New Hampshire University,Manchester,NH,03106-1045,www.snhu.edu/,1,1,2,1,...,0,0,0,0,0,0,1,0,0,31136.0


## Jordan's Functions

In [42]:
def similar_colleges(college_id):
    cols = ['ADM_RATE', 'ACTCMMID', 'SAT_AVG', 'UGDS', 'DIVERSITY', 'TEACH_QUAL', 'SELECT', 'TUITIONFEE_IN', 
            'TUITIONFEE_OUT']
    query = norm.loc[data[data['UNITID'] == college_id].index, cols]
    
    weights = {}
    for col in cols:
        weights[col] = [1]
    weights = pd.DataFrame(weights).iloc[0,:]
    neigh = NearestNeighbors(metric=custom_dist, metric_params = {'weights': weights})
    neigh.fit(norm[cols])
    return data.iloc[neigh.kneighbors(query, 4, return_distance=False)[0][1:], ]

In [44]:
data[data['INSTNM'].str.contains("Fullerton")]

Unnamed: 0,UNITID,INSTNM,CITY,ZIP,INSTURL,IMAGE,LONG_DESCRIPTION,CIP01BACHL,CIP03BACHL,CIP04BACHL,...,FINAID1,FINAID2,FINAID3,FINAID4,FINAID5,EXP_EARNINGS,DIVERSITY,TEACH_QUAL,SELECT,SELECT_CAT
92,110565,California State University-Fullerton,Fullerton,92831-3599,https://www.fullerton.edu/,110565.png,,0,0,0,...,13277,11468,8141,4652,948,43116.565917,0.200758,0.575124,0.413824,5


In [35]:
cols = ['ADM_RATE', 'ACTCMMID', 'SAT_AVG', 'UGDS', 'DIVERSITY', 'TEACH_QUAL', 'SELECT', 'TUITIONFEE_IN', 
        'TUITIONFEE_OUT']
weights = {}
for col in cols:
    weights[col] = [1]
weights = pd.DataFrame(weights).iloc[0, :]

In [45]:
similar_colleges(110565)

Unnamed: 0,UNITID,INSTNM,CITY,ZIP,INSTURL,IMAGE,LONG_DESCRIPTION,CIP01BACHL,CIP03BACHL,CIP04BACHL,...,FINAID1,FINAID2,FINAID3,FINAID4,FINAID5,EXP_EARNINGS,DIVERSITY,TEACH_QUAL,SELECT,SELECT_CAT
1679,228459,Texas State University,San Marcos,78666,https://www.txstate.edu/,default.png,Texas State University is a public research un...,1,1,1,...,11666,10499,8541,3108,1433,42147.632351,0.200758,0.371936,0.112938,6
1664,227216,University of North Texas,Denton,76203-1277,https://www.unt.edu/,default.png,"The University of North Texas (UNT), is a publ...",0,0,1,...,14941,14380,8085,1849,3096,41178.981486,0.200758,0.802028,0.112938,6
177,122755,San Jose State University,San Jose,95192-0001,https://www.sjsu.edu/,default.png,San José State University (commonly referred t...,0,1,0,...,13709,12897,8252,4844,1291,55095.542069,0.200758,0.649747,0.413824,5


In [8]:
def startup():
    global data
    global norm
    global zip_to_state
    global columns
    global degree_dicts
    global custom_dist
    
    ## Maps degree name to column
    def degree_dicts(df, columns):
        temp_col = columns[columns['VARIABLE NAME'].str.contains('CIP')]
        var_name = temp_col['VARIABLE NAME']
        var_name = var_name.astype('string')

        degs = temp_col['NAME OF DATA ELEMENT'].str.split('Bachelor\'s degree in ', expand=True).iloc[:,1]
        degs = degs.str.split('.', expand=True).iloc[:,0]
        degs = degs.astype('string')
        return dict(zip(degs, var_name))
    
    ## Custom distance metric
    def custom_dist(x, y, weights):
        sum_dist = 0
        for i in range(len(x)):
            sum_dist += weights[i]*(x[i] - y[i])**2
        return np.sqrt(sum_dist)
    
    ## Read in files
    data = pd.read_csv('../data/final.csv')
    norm = pd.read_csv('../data/final_standardized.csv')
    
    zip_to_state = pd.read_csv('../data/zip_to_state.csv')
    columns = pd.read_excel('../data/scorecard/columns-simplified.xlsx')
    
#     ## Create selectivity column
#     select_col = np.zeros(len(norm))
#     temp = norm[['ADM_RATE_ALL', 'GPA_BOTTOM_TEN_PERCENT']]
#     for i in range(len(norm)):
#         select_col[i] += 0.3*-1*temp.iloc[i, 0] + 0.7*temp.iloc[i, 1]
#     norm['SELECT'] = select_col
    
#     ## Create teaching quality column
#     teach_qual_col = np.zeros(len(norm))
#     temp = norm[['INEXPFTE', 'AVGFACSAL', 'PFTFAC']]
#     for i in range(len(norm)):
#         teach_qual_col[i] += 0.25*temp.iloc[i, 0] + 0.60*temp.iloc[i, 1] + 0.15*temp.iloc[i, 2]
#     norm['TEACH_QUAL'] = teach_qual_col

In [9]:
def submit_form(school_dict):
    ## Dictionaries to map size/cost references to numerical values
    size_ref = {1: 2500, 2: 5000, 3: 7500, 4: 10000, 5: 15000}
    cost_ref = {1: 2500, 2: 10000, 3: 20000, 4: 35000, 5: 50000}
    school_dict['UGDS'][0] = size_ref[school_dict['UGDS'][0]]
    school_dict['TUITION'][0] = cost_ref[school_dict['TUITION'][0]]
    school_dict['TEACH_QUAL'][0] = max(norm['TEACH_QUAL'])
    school_dict['SELECT'][0] = max(norm['SELECT'])
    
    ## Penalize teach_qual, and select
    school_dict['TEACH_QUAL'][1] *= 0.1
    school_dict['SELECT'][1] *= 0.1
    
    user_state = zip_to_state[(zip_to_state['Zip Min'] <= zipcode) & (zip_to_state['Zip Max'] >= zipcode)].iloc[0, 0]
    
    ## Create custom tuition column based on in-state and out-of-state
    cost_col = np.zeros(len(data))
    for i in range(len(data)):
        temp = data.iloc[i,:][['STABBR', 'TUITIONFEE_IN', 'TUITIONFEE_OUT']]
        if user_state == temp['STABBR']:
            cost_col[i] += temp['TUITIONFEE_IN']
        else:
            cost_col[i] += temp['TUITIONFEE_OUT']
    data['TUITION'] = cost_col
    cost_col = (cost_col - np.mean(cost_col)) / np.std(cost_col)
    norm['TUITION'] = cost_col
    
    
    query = pd.DataFrame(pd.DataFrame(school_dict).iloc[0, :]).T
    
    non_stand_col = [col for col in query.columns if 'CLIMATE' in col or 'STABBR' in col]
    major_names = degree_dicts(data, columns).values()
    non_stand_col.append('TEACH_QUAL')
    non_stand_col.append('SELECT')
    non_stand_col.extend([col for col in query.columns if col in major_names])

    for col in query.columns:
        if col in non_stand_col:
            continue
        mean = np.mean(data[col])
        std = np.std(data[col])
        query[col] = (query[col] - mean) / std
        
    print(query)
    
    weights = pd.DataFrame(school_dict).iloc[1, :]
    
    norm_nn = norm[school_dict.keys()]
    
    neigh = NearestNeighbors(metric=custom_dist, metric_params = {'weights': weights})
    neigh.fit(norm_nn)
    
    
    return data.loc[neigh.kneighbors(query, len(data), return_distance=False)[0], ]

In [10]:
def filt(df, filter_col, filter_val, how):
    
    if df[col].dtypes == np.dtype(object):
        return np.where(data[filter_col] == filter_val)
    else:
        if how == ">":
            return np.where(data[filter_col] > filter_val)
        elif how == ">=":
            return np.where(data[filter_col] >= filter_val)
        elif how == "<=":
            return np.where(data[filter_col] <= filter_val)
        elif how == "==":
            return np.where(data[filter_col] == filter_val)
        else:
            raise ValueError("Value for how should be one of >, >=, ==, <, <=")
        

In [11]:
startup()

In [12]:
school_dict = {}

## Variable Values
major = "Engineering"
zipcode = 92804
sat = 1400
act = 30
states = ["CA", "NY"]
size = 3
environment = 1 ## 1=city, 2=suburb, 3=town, 4=rural
climate = "BWk"
cost = 3

## Importance Variables
size_imp = 4
environment_imp = 2
states_imp = 5
weather_imp = 1
cost_imp = 3
teach_qual_imp = 3
select_imp = 3

## Adding variable values
school_dict[degree_dicts(data, columns)[major]] = [1]
school_dict['SAT_AVG'] = [sat] 
school_dict['ACTCMMID'] = [act]
for state in states:
    school_dict['STABBR.'+state] = [1]
school_dict['UGDS'] = [size]
school_dict['LOCALE.' + str(environment)] = [1]
school_dict['CLIMATE_ZONE.' + climate] = [1]
school_dict['TUITION'] = [cost]
school_dict['TEACH_QUAL'] = [0]
school_dict['SELECT'] = [0]

## Adding importance values
school_dict[degree_dicts(data, columns)[major]].append(3)
school_dict['SAT_AVG'].append(3)
school_dict['ACTCMMID'].append(3)

for state in states:
    school_dict['STABBR.' + state].append(states_imp)
school_dict['UGDS'].append(size_imp)
school_dict['LOCALE.' + str(environment)].append(environment_imp)
school_dict['CLIMATE_ZONE.' + climate].append(weather_imp)
school_dict['TUITION'].append(cost_imp)
school_dict['TEACH_QUAL'].append(teach_qual_imp)
school_dict['SELECT'].append(select_imp)

In [346]:
test = submit_form(school_dict)

   CIP14BACHL   SAT_AVG  ACTCMMID  STABBR.CA  STABBR.NY      UGDS  LOCALE.1  \
0         1.0  2.662203  2.239941        1.0        1.0  0.409653  0.978318   

   CLIMATE_ZONE.BWk   TUITION  TEACH_QUAL    SELECT  
0               1.0 -0.307578    5.469866  3.950843  
