In [1]:
import sqlite3
import pandas as pd
import numpy as np
from pandasql import sqldf
from geopy.distance import great_circle

In [2]:
conn = sqlite3.connect("cs.db")

In [5]:
#Tables on db
pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'",conn)

Unnamed: 0,name
0,fields
1,ncaa
2,geo
3,inst


In [6]:
inst = pd.read_sql("SELECT * FROM inst",conn)

# Geocoding 

In [7]:
geo = pd.read_sql("SELECT * FROM geo",conn)

In [9]:
list(inst.columns)

['UNITID',
 'OPEID',
 'OPEID6',
 'INSTNM',
 'CITY',
 'STABBR',
 'ZIP',
 'INSTURL',
 'NPCURL',
 'HCM2',
 'PREDDEG',
 'HIGHDEG',
 'CONTROL',
 'LOCALE',
 'LATITUDE',
 'LONGITUDE',
 'HBCU',
 'PBI',
 'ANNHI',
 'TRIBAL',
 'AANAPII',
 'HSI',
 'NANTI',
 'MENONLY',
 'WOMENONLY',
 'RELAFFIL',
 'SATVR25',
 'SATVR75',
 'SATMT25',
 'SATMT75',
 'ACTCM25',
 'ACTCM75',
 'DISTANCEONLY',
 'UGDS',
 'UGDS_WHITE',
 'UGDS_BLACK',
 'UGDS_HISP',
 'UGDS_ASIAN',
 'UGDS_AIAN',
 'UGDS_NHPI',
 'UGDS_2MOR',
 'UGDS_NRA',
 'UGDS_UNKN',
 'PPTUG_EF',
 'CURROPER',
 'NPT4_PUB',
 'NPT4_PRIV',
 'NPT41_PUB',
 'NPT42_PUB',
 'NPT43_PUB',
 'NPT44_PUB',
 'NPT45_PUB',
 'NPT41_PRIV',
 'NPT42_PRIV',
 'NPT43_PRIV',
 'NPT44_PRIV',
 'NPT45_PRIV',
 'MD_EARN_WNE_P10',
 'C200_L4_POOLED_SUPP',
 'C200_4_POOLED_SUPP',
 'RET_FT4_POOLED_SUPP',
 'RET_FTL4_POOLED_SUPP',
 'RET_PT4_POOLED_SUPP',
 'RET_PTL4_POOLED_SUPP',
 'OMAWDP8_FTFT_POOLED_SUPP',
 'OMENRYP8_FTFT_POOLED_SUPP',
 'OMENRAP8_FTFT_POOLED_SUPP',
 'OMENRUP8_FTFT_POOLED_SUPP',
 'OMAWDP

In [8]:
input_zip = "30315"

In [56]:
lat =float(list(geo[geo['zip']==input_zip]['lat'])[0])
lon =float(list(geo[geo['zip']==input_zip]['lng'])[0])
print(f"lat, lon: ({lat}, {lon})")

lat, lon: (33.70312, -84.38272)


In [55]:
## Function to use to find the distance of each university in miles
def inst_dist (lat,lon):
    idx = inst.apply(lambda x: great_circle((x["LATITUDE"], x["LONGITUDE"]), (lat, lon)).miles, axis=1)
    return inst.loc[:, ['UNITID']].assign(DISTANCE_MI=idx)

In [57]:
inst[["UNITID","LATITUDE","LONGITUDE"]]

Unnamed: 0,UNITID,LATITUDE,LONGITUDE
0,100654,34.783368,-86.568502
1,100663,33.505697,-86.799345
2,100690,32.362609,-86.17401
3,100706,34.724557,-86.640449
4,100724,32.364317,-86.295677
...,...,...,...
6676,49576719,41.31559,-76.02235
6677,49576720,39.92142,-76.71277
6678,49576721,40.0466,-75.5302
6679,49576722,40.19437,-76.72659


In [60]:
#Distances of each university to the input zip
zip_all_dist = inst_dist(lat,lon)
zip_all_dist

Unnamed: 0,UNITID,DISTANCE_MI
0,100654,145.448313
1,100663,139.732231
2,100690,139.080645
3,100706,147.036320
4,100724,144.337436
...,...,...
6676,49576719,696.862958
6677,49576720,603.350253
6678,49576721,656.257973
6679,49576722,615.843200


In [7]:
sqldf("SELECT COUNT(*) FROM inst",locals())

Unnamed: 0,COUNT(*)
0,6681


In [150]:
basequery = """
SELECT *
FROM inst
"""


In [152]:
baseDF = sqldf(basequery,locals())

In [119]:
featuresDict = {}
featuresDict['x1']= 'OMENRUP_PELL_NFT_POOLED_SUPP'
featuresDict['x2']= 'OMENRYP_PELL_FT_POOLED_SUPP'
featuresDict['x3']= 'OMENRAP_PELL_FT_POOLED_SUPP'
featuresDict

{'x1': 'OMENRUP_PELL_NFT_POOLED_SUPP',
 'x2': 'OMENRYP_PELL_FT_POOLED_SUPP',
 'x3': 'OMENRAP_PELL_FT_POOLED_SUPP'}

In [120]:
featuresQuery = """
SELECT {}, {}, {}
FROM inst
limit 10
""".format(featuresDict['x1'],featuresDict['x2'],featuresDict['x3'])

In [121]:
featuresDF = sqldf(featuresQuery,locals())
featuresDF

Unnamed: 0,OMENRUP_PELL_NFT_POOLED_SUPP,OMENRYP_PELL_FT_POOLED_SUPP,OMENRAP_PELL_FT_POOLED_SUPP
0,0.2803,0.0116,0.3329
1,0.211,0.0149,0.2773
2,0.468,0.0,0.2798
3,0.187,0.0098,0.3154
4,0.2026,0.0097,0.3904
5,0.1646,0.0087,0.3037
6,0.4701,0.005,0.288
7,0.2607,0.0138,0.1394
8,0.2516,0.0123,0.4047
9,0.1099,0.0105,0.2098


Normalizing each feature

part 1: find each feature's length

In [122]:
maxQuery = """
SELECT max({0}) x1FeatureLength, max({1}) x2FeatureLength, max({2}) x3FeatureLength
FROM inst
where {0} not like 'PrivacySuppressed' and {1} not like 'PrivacySuppressed' and {2} not like 'PrivacySuppressed'
""".format(featuresDict['x1'],featuresDict['x2'],featuresDict['x3'])

In [123]:
featureLengthDF = sqldf(maxQuery,locals())

In [124]:
featureLengths = featureLengthDF.loc[0]

In [125]:
x1FeatureLength = float(test['x1FeatureLength'])
x2FeatureLength = float(test['x2FeatureLength'])
x3FeatureLength = float(test['x3FeatureLength'])

part 2: applying length to normalize each feature

In [148]:
normalizedQuery = """
SELECT 
{0}/{3} feature1normalized,
{1}/{4} feature2normalized,
{2}/{5} feature3normalized
FROM inst
""".format(featuresDict['x1'],featuresDict['x2'],featuresDict['x3'], x1FeatureLength, x2FeatureLength, x3FeatureLength)

In [149]:
normalizedDF = sqldf(normalizedQuery,locals())
normalizedDF 

Unnamed: 0,feature1normalized,feature2normalized,feature3normalized
0,0.298191,0.033420,0.401762
1,0.224468,0.042927,0.334661
2,0.497872,0.000000,0.337678
3,0.198936,0.028234,0.380642
4,0.215532,0.027946,0.471156
...,...,...,...
6676,,,
6677,,,
6678,,,
6679,,,


multiplying preferences to feature

In [139]:
prefVector = np.array([0.3,0.2,0.5])

In [140]:
normalizedMatrix = normalizedDF.values

In [141]:
normalizedMatrix

array([[0.29819149, 0.03341976, 0.40176201],
       [0.22446809, 0.04292711, 0.33466087],
       [0.49787234, 0.        , 0.33767801],
       ...,
       [       nan,        nan,        nan],
       [       nan,        nan,        nan],
       [       nan,        nan,        nan]])

In [142]:
ratingScore = np.dot(normalizedMatrix, prefVector)

In [153]:
baseDF['Rating'] = ratingScore

In [160]:
testQuery = """
SELECT count(*)
FROM baseDF
where rating is not NULL
"""

In [161]:
output = sqldf(testQuery,locals())
output

Unnamed: 0,count(*)
0,3513
