In [1]:
import sqlite3
import pandas as pd
import numpy as np
import hashlib
from pandasql import sqldf
from geopy.distance import great_circle

Part 1. Establishing Database

In [5]:
conn = sqlite3.connect("cs.db")

In [6]:
cur = conn.cursor()

In [7]:
#Tables on db
pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'",conn)

Unnamed: 0,name
0,fields
1,geo
2,inst
3,user
4,relig


In [50]:
#inst = pd.read_sql("SELECT * FROM inst",conn)

In [31]:
#conn.close()

Define Features and Filters

In [21]:
def features_dict(feature1, feature2, feature3):
    featuresDict = {}
    featuresDict['feature1']= feature1
    featuresDict['feature2']= feature2
    featuresDict['feature3']= feature3
    return featuresDict

In [22]:
featuresDict = features_dict('OMENRUP_PELL_NFT_POOLED_SUPP', 'OMENRYP_PELL_FT_POOLED_SUPP', 'OMENRAP_PELL_FT_POOLED_SUPP')

In [10]:
def filter_list(filter1, filter2):
    l = list()
    l.append(filter1)
    l.append(filter2)
    return l

In [11]:
filterList = filter_list('SATMT25','SATVR25')
filterList

['SATMT25', 'SATVR25']

In [12]:
def filter_values(filter1,filter1input, filter2, filter2input):
    filters_dict = {}
    filters_dict[filter1] = filter1input
    filters_dict[filter2] = filter2input
    return filters_dict

In [13]:
filterValues = filter_values('SATMT25', '586', 'SATVR25', '789')
filterValues

{'SATMT25': '586', 'SATVR25': '789'}

Part 2. User Inputs

Part 2a. Preferences:
Input provided: Importance from scale 1 to 10
Output created: Allocation Percentage

In [18]:
#a, b, c represents importance user provides for each preference
#must be in same order as they are called in normalized matrix
f1weight=4
f2weight=5
f3weight=6
raw_weights = np.array([f1weight, f2weight, f3weight])
sum_weights = np.sum(raw_weights)
perc_weights = raw_weights/sum_weights
perc_weights

array([0.26666667, 0.33333333, 0.4       ])

Part 2b. User ID Creation
Input provided: email
Output created: 1) user_id 2) inserts user_id and email into "user" table
Method: function (user_id_generator)

In [33]:
#Scenario 1 Testing
email = 'mkjones@gatech.edu'
user_id = int(hashlib.sha256(email.encode('utf-8')).hexdigest(), 16) % 10**8 # generate a unique numeric ID

In [34]:
user_id

9987952

In [35]:
#Scenario #2
#trying different email
email2 = 'michaelk.Jones@outlook.com'
user_id3 = int(hashlib.sha256(email2.encode('utf-8')).hexdigest(), 16) % 10**8 # generate a unique numeric ID
user_id3

80449489

In [36]:
#Scenario 3
#trying same email as scenario 2 but with all lowercase
email3 = 'michaelk.jones@outlook.com'
user_id4 = int(hashlib.sha256(email3.encode('utf-8')).hexdigest(), 16) % 10**8 # generate a unique numeric ID
user_id4

87012218

In [37]:
#Scenario 4
#same email as scenario 2 but added lower to have negate capitalization effects
email2 = 'michaelk.Jones@outlook.com'
user_id5 = int(hashlib.sha256(email2.lower().encode('utf-8')).hexdigest(), 16) % 10**8 # generate a unique numeric ID
user_id5

87012218

In [38]:
def user_id_generator(email):
    id = int(hashlib.sha256(email.lower().encode('utf-8')).hexdigest(), 16) % 10**8 # generate a unique numeric ID
    return id

In [39]:
#test case of function
user_id_generator(email = 'test123@outlook.com')

83932234

In [40]:
email = 'test123@outlook.com'
user_id = user_id_generator(email = 'test123@outlook.com')

if pd.read_sql(f"SELECT * FROM user WHERE user_id ={user_id}",conn).empty:
    cur.execute(f"INSERT INTO user values('{email}',{user_id})")
else:
    print("Email exists")

In [42]:
pd.read_sql(f"SELECT * FROM user",conn)

Unnamed: 0,email,user_id
0,mkjones@gatech.edu,9987952
1,mykek.jones@gmail.com,95829733
2,michaelk.jones@outlook.com,87012218
3,test1@GMAIL.COM,65768313
4,test1@gmail.com,45591513


Part 2.c: Zipcode
user provided: zipcode
output created: temp dataframe of distance in miles per institution

In [9]:
geo = pd.read_sql("SELECT * FROM geo",conn)

In [16]:
geo.printSchema()

AttributeError: 'DataFrame' object has no attribute 'printSchema'

In [45]:
input_zip = "30315"

In [46]:
lat =float(list(geo[geo['zip']==input_zip]['lat'])[0])
lon =float(list(geo[geo['zip']==input_zip]['lng'])[0])
print(f"lat, lon: ({lat}, {lon})")

lat, lon: (33.70312, -84.38272)


In [51]:
inst[["UNITID","LATITUDE","LONGITUDE"]]

Unnamed: 0,UNITID,LATITUDE,LONGITUDE
0,100654,34.783368,-86.568502
1,100663,33.505697,-86.799345
2,100690,32.362609,-86.17401
3,100706,34.724557,-86.640449
4,100724,32.364317,-86.295677
...,...,...,...
6676,49576719,41.31559,-76.02235
6677,49576720,39.92142,-76.71277
6678,49576721,40.0466,-75.5302
6679,49576722,40.19437,-76.72659


In [57]:
## Function to use to find the distance of each university in miles
def inst_dist (lat,lon):
    idx = inst.apply(lambda x: great_circle((x["LATITUDE"], x["LONGITUDE"]), (lat, lon)).miles, axis=1)
    column_name = f"{user_id}_DISTANCE_MI"  # Adding user_id to the column title
    return inst.loc[:, ['UNITID']].assign(**{column_name: idx})

In [59]:
def inst_dist (lat,lon):
    idx = inst.apply(lambda x: great_circle((x["LATITUDE"], x["LONGITUDE"]), (lat, lon)).miles, axis=1)
    column_name = f"{user_id}_DISTANCE_MI"  # Adding user_id to the column title
    return inst.loc[:, ['UNITID']].assign(dist_miles = idx)

In [61]:
#Distances of each university to the input zip
dist_temp_table = inst_dist(lat,lon)
type(dist_temp_table)
dist_temp_table
inst = pd.merge(inst, dist_temp_table, on='UNITID')

In [5]:
pd.read_sql("SELECT * FROM inst",conn)

Unnamed: 0,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,INSTURL,NPCURL,HCM2,...,OMENRUP_PELL_NFT_POOLED_SUPP,OMENRYP_PELL_FT_POOLED_SUPP,OMENRAP_PELL_FT_POOLED_SUPP,OMAWDP8_PELL_FT_POOLED_SUPP,OMENRUP_PELL_FT_POOLED_SUPP,OMENRYP_PELL_PT_POOLED_SUPP,OMENRAP_PELL_PT_POOLED_SUPP,OMAWDP8_PELL_PT_POOLED_SUPP,OMENRUP_PELL_PT_POOLED_SUPP,GT_THRESHOLD_P6_SUPP
0,100654,00100200,001002,Alabama A & M University,Normal,AL,35762,www.aamu.edu/,www.aamu.edu/admissions-aid/tuition-fees/net-p...,0,...,0.2803,0.0116,0.3329,0.2958,0.3597,0.0056,0.3889,0.1833,0.4222,0.4606
1,100663,00105200,001052,University of Alabama at Birmingham,Birmingham,AL,35294-0110,https://www.uab.edu/,https://tcc.ruffalonl.com/University of Alabam...,0,...,0.211,0.0149,0.2773,0.5243,0.1835,0.0211,0.277,0.4354,0.2665,0.659
2,100690,02503400,025034,Amridge University,Montgomery,AL,36117-3553,www.amridgeuniversity.edu/,www2.amridgeuniversity.edu:9091/,0,...,0.468,0,0.2798,0.2523,0.4679,0,0.186,0.4186,0.3953,0.5132
3,100706,00105500,001055,University of Alabama in Huntsville,Huntsville,AL,35899,www.uah.edu/,finaid.uah.edu/,0,...,0.187,0.0098,0.3154,0.4961,0.1787,0.0105,0.3632,0.3474,0.2789,0.717
4,100724,00100500,001005,Alabama State University,Montgomery,AL,36104-0271,www.alasu.edu/,www.alasu.edu/cost-aid/tuition-costs/net-price...,0,...,0.2026,0.0097,0.3904,0.3061,0.2938,0.0778,0.4444,0.1111,0.3667,0.382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6676,49576719,00332917,003329,Pennsylvania State University-Penn State Wilke...,Lehman,PA,18627-0217,wilkesbarre.psu.edu/,,0,...,,,,,,,,,,
6677,49576720,00332918,003329,Pennsylvania State University-Penn State York,York,PA,17403-3326,york.psu.edu/,,0,...,,,,,,,,,,
6678,49576721,00332919,003329,Pennsylvania State University-Penn State Great...,Malvern,PA,19355-1488,greatvalley.psu.edu/,,0,...,,,,,,,,,,
6679,49576722,00332920,003329,Pennsylvania State University-Penn State Harri...,Middletown,PA,17057-4846,harrisburg.psu.edu/,,0,...,,,,,,,,,,


In [119]:
def transform_zip_to_dist_col(zipcode):
    geo = pd.read_sql("SELECT * FROM geo",conn)
    inst = pd.read_sql("SELECT * FROM inst",conn)
    lat =float(list(geo[geo['zip']==zipcode]['lat'])[0])
    lon =float(list(geo[geo['zip']==zipcode]['lng'])[0])
    idx = inst.apply(lambda x: great_circle((x["LATITUDE"], x["LONGITUDE"]), (lat, lon)).miles, axis=1)
    distDF = inst.loc[:, ['UNITID']].assign(dist_miles = idx)
    inst = pd.merge(inst, distDF, on='UNITID')
    return inst

In [120]:
test = transform_zip_to_dist_col('30097')
test

Unnamed: 0,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,INSTURL,NPCURL,HCM2,...,OMENRYP_PELL_FT_POOLED_SUPP,OMENRAP_PELL_FT_POOLED_SUPP,OMAWDP8_PELL_FT_POOLED_SUPP,OMENRUP_PELL_FT_POOLED_SUPP,OMENRYP_PELL_PT_POOLED_SUPP,OMENRAP_PELL_PT_POOLED_SUPP,OMAWDP8_PELL_PT_POOLED_SUPP,OMENRUP_PELL_PT_POOLED_SUPP,GT_THRESHOLD_P6_SUPP,dist_miles
0,100654,00100200,001002,Alabama A & M University,Normal,AL,35762,www.aamu.edu/,www.aamu.edu/admissions-aid/tuition-fees/net-p...,0,...,0.0116,0.3329,0.2958,0.3597,0.0056,0.3889,0.1833,0.4222,0.4606,147.593425
1,100663,00105200,001052,University of Alabama at Birmingham,Birmingham,AL,35294-0110,https://www.uab.edu/,https://tcc.ruffalonl.com/University of Alabam...,0,...,0.0149,0.2773,0.5243,0.1835,0.0211,0.277,0.4354,0.2665,0.659,156.540810
2,100690,02503400,025034,Amridge University,Montgomery,AL,36117-3553,www.amridgeuniversity.edu/,www2.amridgeuniversity.edu:9091/,0,...,0,0.2798,0.2523,0.4679,0,0.186,0.4186,0.3953,0.5132,164.191256
3,100706,00105500,001055,University of Alabama in Huntsville,Huntsville,AL,35899,www.uah.edu/,finaid.uah.edu/,0,...,0.0098,0.3154,0.4961,0.1787,0.0105,0.3632,0.3474,0.2789,0.717,150.129854
4,100724,00100500,001005,Alabama State University,Montgomery,AL,36104-0271,www.alasu.edu/,www.alasu.edu/cost-aid/tuition-costs/net-price...,0,...,0.0097,0.3904,0.3061,0.2938,0.0778,0.4444,0.1111,0.3667,0.382,169.201972
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6676,49576719,00332917,003329,Pennsylvania State University-Penn State Wilke...,Lehman,PA,18627-0217,wilkesbarre.psu.edu/,,0,...,,,,,,,,,,670.943595
6677,49576720,00332918,003329,Pennsylvania State University-Penn State York,York,PA,17403-3326,york.psu.edu/,,0,...,,,,,,,,,,577.729643
6678,49576721,00332919,003329,Pennsylvania State University-Penn State Great...,Malvern,PA,19355-1488,greatvalley.psu.edu/,,0,...,,,,,,,,,,630.970517
6679,49576722,00332920,003329,Pennsylvania State University-Penn State Harri...,Middletown,PA,17057-4846,harrisburg.psu.edu/,,0,...,,,,,,,,,,590.104574


Applying filters:

In [None]:
.set_index('UNITID', inplace=True)

In [121]:
#decided to call table directly
def apply_filters_and_features(featuresDict, filterList, filterValues):
    filterquery = """
    SELECT
    UNITID,
    {0},
    {1},
    {2}
    FROM test
    WHERE {3} <= {4}
    AND {5} <= {6}
    """.format(featuresDict['feature1'],featuresDict['feature2'],featuresDict['feature3'], filterList[0], filterValues[filterList[0]],filterList[1],  filterValues[filterList[1]])
    return sqldf(filterquery,globals())

In [122]:
appliedffs = apply_filters_and_features(featuresDict = featuresDict,filterList=filterList,filterValues=filterValues)

In [71]:
#stuck on getting argument df to work in format function
def apply_filters(df, featuresDict, filterList, filterValues):
    filterquery = """
    SELECT
    {1},
    {2},
    {3}
    FROM {0}
    WHERE {4} <= {5}
    AND {6} <= {7}
    """.format(df,featuresDict['feature1'],featuresDict['feature2'],featuresDict['feature3'], filterList[0], filterValues[filterList[0]],filterList[1],  filterValues[filterList[1]])
    return sqldf(filterquery,locals())

In [85]:
#stuck on getting argument df to work in format function
def apply_filters(df, featuresDict, filterList, filterValues):
    filterquery = """
    SELECT
    {1},
    {2},
    {3}
    FROM {0}
    WHERE {4} <= {5}
    AND {6} <= {7}
    """.format(df, featuresDict['feature1'],featuresDict['feature2'],featuresDict['feature3'], filterList[0], filterValues[filterList[0]],filterList[1],  filterValues[filterList[1]])
    return sqldf(filterquery,globals())

In [65]:
#stuck on getting argument df to work in format function
def apply_filters(df, featuresDict, filterList, filterValues):
    filterquery = f"""
    SELECT
    {featuresDict['feature1']},
    {featuresDict['feature2']},
    {featuresDict['feature3']}
    FROM {df}
    WHERE {filterList[0]} <= {filterValues[filterList[0]]}
    AND {filterList[1]} <= {filterValues[filterList[1]]}
    """
    return sqldf(filterquery,globals())

In [66]:
appliedfilters = apply_filters(df = test, featuresDict = featuresDict,filterList=filterList,filterValues=filterValues)
appliedfilters

PandaSQLException: (sqlite3.OperationalError) near "OPEID6": syntax error
[SQL: 
    SELECT
    OMENRUP_PELL_NFT_POOLED_SUPP,
    OMENRYP_PELL_FT_POOLED_SUPP,
    OMENRAP_PELL_FT_POOLED_SUPP
    FROM         UNITID     OPEID  OPEID6   
0       100654  00100200  001002  \
1       100663  00105200  001052   
2       100690  02503400  025034   
3       100706  00105500  001055   
4       100724  00100500  001005   
...        ...       ...     ...   
6676  49576719  00332917  003329   
6677  49576720  00332918  003329   
6678  49576721  00332919  003329   
6679  49576722  00332920  003329   
6680  49576723  00332921  003329   

                                                 INSTNM        CITY STABBR   
0                              Alabama A & M University      Normal     AL  \
1                   University of Alabama at Birmingham  Birmingham     AL   
2                                    Amridge University  Montgomery     AL   
3                   University of Alabama in Huntsville  Huntsville     AL   
4                              Alabama State University  Montgomery     AL   
...                                                 ...         ...    ...   
6676  Pennsylvania State University-Penn State Wilke...      Lehman     PA   
6677      Pennsylvania State University-Penn State York        York     PA   
6678  Pennsylvania State University-Penn State Great...     Malvern     PA   
6679  Pennsylvania State University-Penn State Harri...  Middletown     PA   
6680  Pennsylvania State University-Penn State Brand...       Media     PA   

             ZIP                     INSTURL   
0          35762               www.aamu.edu/  \
1     35294-0110        https://www.uab.edu/   
2     36117-3553  www.amridgeuniversity.edu/   
3          35899                www.uah.edu/   
4     36104-0271              www.alasu.edu/   
...          ...                         ...   
6676  18627-0217        wilkesbarre.psu.edu/   
6677  17403-3326               york.psu.edu/   
6678  19355-1488        greatvalley.psu.edu/   
6679  17057-4846         harrisburg.psu.edu/   
6680  19063-5522         brandywine.psu.edu/   

                                                 NPCURL HCM2  ...   
0     www.aamu.edu/admissions-aid/tuition-fees/net-p...    0  ...  \
1     https://tcc.ruffalonl.com/University of Alabam...    0  ...   
2                      www2.amridgeuniversity.edu:9091/    0  ...   
3                                       finaid.uah.edu/    0  ...   
4     www.alasu.edu/cost-aid/tuition-costs/net-price...    0  ...   
...                                                 ...  ...  ...   
6676                                               None    0  ...   
6677                                               None    0  ...   
6678                                               None    0  ...   
6679                                               None    0  ...   
6680                                               None    0  ...   

     OMENRYP_PELL_FT_POOLED_SUPP OMENRAP_PELL_FT_POOLED_SUPP   
0                         0.0116                      0.3329  \
1                         0.0149                      0.2773   
2                              0                      0.2798   
3                         0.0098                      0.3154   
4                         0.0097                      0.3904   
...                          ...                         ...   
6676                        None                        None   
6677                        None                        None   
6678                        None                        None   
6679                        None                        None   
6680                        None                        None   

     OMAWDP8_PELL_FT_POOLED_SUPP OMENRUP_PELL_FT_POOLED_SUPP   
0                         0.2958                      0.3597  \
1                         0.5243                      0.1835   
2                         0.2523                      0.4679   
3                         0.4961                      0.1787   
4                         0.3061                      0.2938   
...                          ...                         ...   
6676                        None                        None   
6677                        None                        None   
6678                        None                        None   
6679                        None                        None   
6680                        None                        None   

     OMENRYP_PELL_PT_POOLED_SUPP OMENRAP_PELL_PT_POOLED_SUPP   
0                         0.0056                      0.3889  \
1                         0.0211                       0.277   
2                              0                       0.186   
3                         0.0105                      0.3632   
4                         0.0778                      0.4444   
...                          ...                         ...   
6676                        None                        None   
6677                        None                        None   
6678                        None                        None   
6679                        None                        None   
6680                        None                        None   

     OMAWDP8_PELL_PT_POOLED_SUPP OMENRUP_PELL_PT_POOLED_SUPP   
0                         0.1833                      0.4222  \
1                         0.4354                      0.2665   
2                         0.4186                      0.3953   
3                         0.3474                      0.2789   
4                         0.1111                      0.3667   
...                          ...                         ...   
6676                        None                        None   
6677                        None                        None   
6678                        None                        None   
6679                        None                        None   
6680                        None                        None   

     GT_THRESHOLD_P6_SUPP  dist_miles  
0                  0.4606  147.593425  
1                   0.659  156.540810  
2                  0.5132  164.191256  
3                   0.717  150.129854  
4                   0.382  169.201972  
...                   ...         ...  
6676                 None  670.943595  
6677                 None  577.729643  
6678                 None  630.970517  
6679                 None  590.104574  
6680                 None  630.528504  

[6681 rows x 175 columns]
    WHERE SATMT25 <= 586
    AND SATVR25 <= 789
    ]
(Background on this error at: https://sqlalche.me/e/20/e3q8)

Normalizing Features

part 1: find each feature's length

In [32]:
maxQuery = """
SELECT max({0}) x1FeatureLength, max({1}) x2FeatureLength, max({2}) x3FeatureLength
FROM appliedffs
where {0} not like 'PrivacySuppressed' and {1} not like 'PrivacySuppressed' and {2} not like 'PrivacySuppressed'
""".format(featuresDict['feature1'],featuresDict['feature2'],featuresDict['feature3'])

In [45]:
featureLengths = sqldf(maxQuery,locals()).loc[0]
featureLengths

x1FeatureLength    0.8871
x2FeatureLength     0.241
x3FeatureLength    0.5972
Name: 0, dtype: object

In [46]:
x1FeatureLength = float(featureLengths['x1FeatureLength'])
x2FeatureLength = float(featureLengths['x2FeatureLength'])
x3FeatureLength = float(featureLengths['x3FeatureLength'])

In [47]:
x1FeatureLength

0.8871

part 2: applying length to normalize each feature

In [None]:
normalizedQuery = """
SELECT
{0}/{3} feature1normalized,
{1}/{4} feature2normalized,
{2}/{5} feature3normalized
FROM inst
""".format(featuresDict['x1'],featuresDict['x2'],featuresDict['x3'], x1FeatureLength, x2FeatureLength, x3FeatureLength)

In [None]:
normalizedDF = sqldf(normalizedQuery,locals())
normalizedDF

In [141]:
def normalize_features (features_Dict):
    #find length of each feature
    maxQuery = """
    SELECT max({0}) x1FeatureLength, max({1}) x2FeatureLength, max({2}) x3FeatureLength
    FROM appliedffs
    where {0} not like 'PrivacySuppressed' and {1} not like 'PrivacySuppressed' and {2} not like 'PrivacySuppressed'
    """.format(featuresDict['feature1'],featuresDict['feature2'],featuresDict['feature3'])

    featureLengths = sqldf(maxQuery,globals()).loc[0]

    x1FeatureLength = float(featureLengths['x1FeatureLength'])
    x2FeatureLength = float(featureLengths['x2FeatureLength'])
    x3FeatureLength = float(featureLengths['x3FeatureLength'])

    #normalize each feature
    normalizedQuery = """
    SELECT
    UNITID,
    {0}/{3} feature1normalized,
    {1}/{4} feature2normalized,
    {2}/{5} feature3normalized
    FROM appliedffs
    """.format(featuresDict['feature1'],featuresDict['feature2'],featuresDict['feature3'], x1FeatureLength, x2FeatureLength, x3FeatureLength)

    return sqldf(normalizedQuery,globals())

In [142]:
normalizedByUni = normalize_features(featuresDict)
#normalizedByUni.set_index('UNITID', inplace = True)

In [143]:
normalizedByUni

Unnamed: 0,UNITID,feature1normalized,feature2normalized,feature3normalized
0,100654,0.315973,0.048133,0.557435
1,100663,0.237854,0.061826,0.464334
2,100706,0.210799,0.040664,0.528131
3,100724,0.228385,0.040249,0.653717
4,100751,0.185548,0.036100,0.508540
...,...,...,...,...
928,487524,0.357682,0.007054,0.340589
929,488305,,,
930,488785,,,
931,490805,0.277083,0.048133,0.608841


Calculating Uni Rating

multiplying preferences to feature

In [169]:
def uni_rating_calc(df):
    normalizedMatrix = df.values
    ratingScore = np.dot(normalizedMatrix[:,1:4], perc_weights)
    return pd.DataFrame(np.c_[df.values[:, :1], ratingScore]).rename(columns = {0:'UNITID', 1:'Rating'})

In [170]:
test = uni_rating_calc(normalizedByUni)
test

Unnamed: 0,UNITID,Rating
0,100654,0.323278
1,100663,0.26977
2,100706,0.28102
3,100724,0.335806
4,100751,0.264929
...,...,...
928,487524,0.233969
929,488305,
930,488785,
931,490805,0.333469


In [162]:
normalizedByUni

Unnamed: 0,UNITID,feature1normalized,feature2normalized,feature3normalized
0,100654,0.315973,0.048133,0.557435
1,100663,0.237854,0.061826,0.464334
2,100706,0.210799,0.040664,0.528131
3,100724,0.228385,0.040249,0.653717
4,100751,0.185548,0.036100,0.508540
...,...,...,...,...
928,487524,0.357682,0.007054,0.340589
929,488305,,,
930,488785,,,
931,490805,0.277083,0.048133,0.608841


In [151]:
normalizedMatrix = normalizedByUni.values
normalizedMatrix[:,1:4]

array([[0.3159733964603765, 0.04813278008298755, 0.5574346952444742],
       [0.2378536805320708, 0.06182572614107884, 0.4643335565974548],
       [0.21079923345733287, 0.04066390041493776, 0.528131279303416],
       ...,
       [nan, nan, nan],
       [0.27708262879044077, 0.04813278008298755, 0.608841259209645],
       [0.19467929207530155, 0.03941908713692946, 0.37458137977227063]],
      dtype=object)

In [165]:
uniIndex = normalizedByUni.values[:, :1]

In [166]:
uniIndex

array([['100654'],
       ['100663'],
       ['100706'],
       ['100724'],
       ['100751'],
       ['100830'],
       ['100858'],
       ['100937'],
       ['101189'],
       ['101435'],
       ['101480'],
       ['101541'],
       ['101648'],
       ['101709'],
       ['102049'],
       ['102094'],
       ['102234'],
       ['102368'],
       ['102377'],
       ['102614'],
       ['104151'],
       ['104179'],
       ['104586'],
       ['105589'],
       ['105899'],
       ['106245'],
       ['106342'],
       ['106397'],
       ['106412'],
       ['106458'],
       ['106704'],
       ['106713'],
       ['107044'],
       ['107071'],
       ['107141'],
       ['107512'],
       ['107558'],
       ['107877'],
       ['110097'],
       ['110361'],
       ['110413'],
       ['110510'],
       ['110529'],
       ['110538'],
       ['110556'],
       ['110565'],
       ['110583'],
       ['110592'],
       ['110608'],
       ['110617'],
       ['110671'],
       ['110714'],
       ['111

In [140]:
test = np.c_[uniIndex, normalizedMatrix]
test

array([['100654', 0.3159733964603765, 0.04813278008298755,
        0.5574346952444742],
       ['100663', 0.2378536805320708, 0.06182572614107884,
        0.4643335565974548],
       ['100706', 0.21079923345733287, 0.04066390041493776,
        0.528131279303416],
       ...,
       ['488785', nan, nan, nan],
       ['490805', 0.27708262879044077, 0.04813278008298755,
        0.608841259209645],
       ['495767', 0.19467929207530155, 0.03941908713692946,
        0.37458137977227063]], dtype=object)

In [None]:
baseDF['Rating'] = ratingScore

In [None]:
testQuery = """
SELECT count(*)
FROM baseDF
where rating is not NULL
"""

In [None]:
output = sqldf(testQuery,locals())
output

apply filters