In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from uszipcode import ZipcodeSearchEngine # main
import zipcode # sub

In [2]:
# Load the pre-processed article/company table (the CSV is read as ISO-8859-1).
data = pd.read_csv("article_after_processing10.csv", encoding='iso-8859-1')

# Count missing values per column ...
test = data.isnull().sum()
# ... and display only the columns that contain at least one NA.
test[test > 0]

funding_round    12
Specialties      33
City             46
State            46
latitude         46
longitude        46
dtype: int64

In [3]:
# Minor clean up: the "11-50" size bucket appears in the file as "Nov-50"
# (presumably a spreadsheet date auto-format artifact -- TODO confirm).
data["CompanySize"] = data["CompanySize"].replace("Nov-50", "11-50")

# Prepare training data: keep US companies only.
# The index is reset because the recommendation printers look rows up in
# `data` / `temp` using the neighbour indices returned by the model.
data = data[data.Country == "United States"].reset_index(drop=True)

# Human-readable table used when printing the "also suggested" companies.
# Explicit .copy() so that renaming columns never touches `data`.
temp = data[['CompanyName', 'money_raised_float', 'Founded', 'CompanySize', "City"]].copy()
temp.columns = ['Company', 'Money_raised', 'Founded', 'Company Size', "City"]  # rename columns

# Model features: funding, founding year, size bucket, the industry indicator
# columns (label slice from "Computer & Network Security & Hardware" to
# "Real Estate", inclusive) and the geographic coordinates.
cols = ['money_raised_float', 'Founded', 'CompanySize'] +\
    list(data.loc[:, "Computer & Network Security & Hardware":"Real Estate"]) +\
    ["latitude", "longitude"]

# Explicit .copy(): x_train is written to later on, and assigning into a
# derived selection of `data` is what triggers pandas' SettingWithCopyWarning.
x_train = data[cols].copy()

In [4]:
#### Utility functions ##################################

def industry_interest_generator(industries, interests):
    """Encode a set of interests as a single-row 0/1 indicator array.

    Parameters
    ----------
    industries : iterable
        Ordered industry names; one output column is produced per entry.
    interests : container
        Industry names the user is interested in.

    Returns
    -------
    numpy.ndarray
        2-D array with one row; position ``i`` is 1 when ``industries[i]``
        is found in ``interests`` and 0 otherwise.
    """
    flags = []
    for name in industries:
        flags.append(int(name in interests))
    return np.array([flags])

def industry_interest_decomposer(industries, df):
    """Return the industries flagged with 1 in a single-row DataFrame.

    This is the inverse of ``industry_interest_generator``: it reads the
    indicator columns of one row and lists the names whose value is 1.

    Parameters
    ----------
    industries : list
        Names of the indicator columns to inspect (must exist in ``df``).
    df : pandas.DataFrame
        DataFrame holding exactly one row.

    Returns
    -------
    list
        Column names from ``industries`` whose value equals 1, in the
        order given by ``industries``.

    Raises
    ------
    ValueError
        If ``df`` does not contain exactly one row. Previously a message was
        printed and execution continued into a pandas length-mismatch error.
    """
    if df.shape[0] != 1:
        raise ValueError(
            "Test_data must have a single row for prediction (got {} rows)".format(df.shape[0]))

    # Positional access: the single row may carry any index label.
    flags = df[industries].iloc[0]
    return list(flags[flags == 1].index)

def _normalize_zip(zip_code):
    """Return ``zip_code`` as a zero-padded 5-character string when possible.

    Integer-like input such as ``2110`` (Boston, 02110) loses its leading
    zero; padding restores it before the value is handed to the lookup
    libraries. Values that cannot be interpreted are returned unchanged.
    """
    if isinstance(zip_code, str):
        return zip_code.strip().zfill(5)
    try:
        return "{:05d}".format(int(zip_code))
    except (TypeError, ValueError):
        return zip_code


def _lookup_zip_field(zip_code, primary_key, fallback_attr):
    """Look up one field of a zip code, trying uszipcode first, then zipcode.

    Parameters
    ----------
    zip_code : int or str
        Zip code to resolve; the sentinel string "not found" short-circuits
        to ``None``.
    primary_key : str
        Key read from the uszipcode result (e.g. "City").
    fallback_attr : str
        Attribute read from the ``zipcode`` package result (e.g. "city").

    Returns
    -------
    The requested field, or ``None`` when neither source knows the zip code.
    """
    if zip_code == "not found":
        return None

    # NOTE(review): the fallback `zipcode` package is presumed to require a
    # 5-character string -- confirm against its documentation.
    normalized = _normalize_zip(zip_code)

    primary = ZipcodeSearchEngine().by_zipcode(normalized)
    if primary:
        return primary[primary_key]

    fallback = zipcode.isequal(normalized)
    if fallback:
        return getattr(fallback, fallback_attr)
    return None


def get_city(zip_code):
    """Return the city name for ``zip_code``, or ``None`` if it is unknown."""
    return _lookup_zip_field(zip_code, "City", "city")


def get_latitude(zip_code):
    """Return the latitude for ``zip_code``, or ``None`` if it is unknown."""
    return _lookup_zip_field(zip_code, "Latitude", "lat")


def get_longitude(zip_code):
    """Return the longitude for ``zip_code``, or ``None`` if it is unknown."""
    return _lookup_zip_field(zip_code, "Longitude", "lon")


def get_lat_lon(zip_code):
    """Return ``[[latitude, longitude, zip_code]]`` as a one-row numpy array.

    ``zip_code`` is stored exactly as passed in (not normalized), so the
    array layout and element types are the same as before.
    """
    lat = get_latitude(zip_code)
    lon = get_longitude(zip_code)
    return np.array([[lat, lon, zip_code]])

In [5]:
##### Generate test data #################################

# Four hand-written user profiles: money_raised, year founded, company size
x_test = pd.DataFrame.from_records(
    [(10, 2015, '11-50'),
     (20, 2010, '51-200'),
     (100, 2010, '1001-5000'),
     (200, 2013, '51-200')] )

# industries: one 0/1 indicator row per profile, in the column order of `data`
industries = list(data.loc[:,"Computer & Network Security & Hardware":"Real Estate"])

ed_tech = industry_interest_generator(industries, ["Education", "Internet"])
food_tech = industry_interest_generator(industries, ["Food Business", "Internet"])
fin_tech = industry_interest_generator(industries, ["Financial Services", "Internet"])
health_tech = industry_interest_generator(industries, ["Healthcare_health", "Internet"])

industry_interests = np.concatenate((ed_tech, food_tech, fin_tech, health_tech), axis = 0)
industry_df = pd.DataFrame(industry_interests)

# location: one [latitude, longitude, zip_code] row per profile
SF1 = get_lat_lon(94103) # San Francisco
SF2 = get_lat_lon(94107) # San Francisco
NYC = get_lat_lon(10001) # New York City
bost = get_lat_lon(2110) # Boston (zip 02110; the leading zero is lost in the int literal)

locations = np.concatenate((SF1, SF2, NYC, bost), axis = 0)
location_df = pd.DataFrame(locations)


# combine all into one dataframe (profiles | industries | location)
x_test = pd.concat([x_test, industry_df, location_df], axis=1)

x_test.columns = list(x_train) + ["zip_code"] # name columns as same to x_train columns

# Sanity check: decode the industries of the second profile.
# `.loc[1:1]` replaces the removed `.ix[1:1,]`; with the default integer index
# both select the single row labelled 1.
industry_interest_decomposer(industries, x_test.loc[1:1])

['Food Business', 'Internet']

In [6]:
#### Utility functions ##################################

def convert_CompanySize(size):
    """Map a company-size label to an ordinal rank.

    The labels '11-50' through "10,001+" map to 1..7 in increasing order of
    size; any other value (including missing values) maps to 0.
    """
    ordered_buckets = (
        '11-50',
        '51-200',
        '201-500',
        "501-1000",
        "1001-5000",
        "5001-10,000",
        "10,001+",
    )
    for rank, bucket in enumerate(ordered_buckets, start=1):
        if size == bucket:
            return rank
    return 0
    
def transform(x_train, x_test, companysize_transform =True, pandas_transform =True):
    """Encode, log-transform and min-max scale the train and test features.

    Neither input frame is modified: all work happens on copies, so the
    function can be called repeatedly with the same arguments. (Previously
    ``x_train`` was overwritten in place, which raised pandas'
    SettingWithCopyWarning and made a second call log-transform twice.)

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; must contain "CompanySize" (size labels) and
        "money_raised_float".
    x_test : pandas.DataFrame
        Test features with the same columns as ``x_train`` plus "zip_code",
        which is dropped before scaling.
    companysize_transform : bool, default True
        Whether "CompanySize" in ``x_test`` still holds labels that need to
        be converted with ``convert_CompanySize``. The training column is
        always converted.
    pandas_transform : bool, default True
        Return DataFrames carrying the column names of ``x_train`` instead
        of raw numpy arrays.

    Returns
    -------
    tuple
        ``(transformed_x_train, transformed_x_test)``; the scaler is fitted
        on the training data only and then applied to the test data.
    """
    col_names = list(x_train)

    ### transform x_train ##############
    train_features = x_train.copy()
    # binning: ordinal encoding of the size label
    train_features["CompanySize"] = train_features["CompanySize"].apply(convert_CompanySize)
    # logarithmic transformation for money_raised_float because of some outliers
    train_features["money_raised_float"] = np.log(train_features["money_raised_float"])

    scaler = MinMaxScaler()
    transformed_x_train = scaler.fit_transform(train_features)

    ### transform x_test ###############
    # drop() returns a new frame, so the caller's x_test is kept for later use
    test_features = x_test.drop(["zip_code"], axis = 1)
    if companysize_transform:
        test_features["CompanySize"] = test_features["CompanySize"].apply(convert_CompanySize)
    test_features["money_raised_float"] = np.log(test_features["money_raised_float"])

    transformed_x_test = scaler.transform(test_features)

    ### transform numpy to pandas ######
    if pandas_transform:
        transformed_x_train = pd.DataFrame(transformed_x_train, columns=col_names)
        transformed_x_test = pd.DataFrame(transformed_x_test, columns=col_names)

    return transformed_x_train, transformed_x_test

In [7]:
# Build the scaled feature frames used by the nearest-neighbour model.
# Note: `test` is rebound here; it no longer refers to the NA-count Series
# computed right after loading the data.
train, test = transform(x_train, x_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [8]:
# Summary statistics of the transformed training features.
train.describe()

Unnamed: 0,money_raised_float,Founded,CompanySize,Computer & Network Security & Hardware,Computer Software,Consumer Electronics,Consumers Goods & Services,Education,Entertainment,Financial Services,...,Healthcare_health,Human Resources,Information Technology and Services,Infrastructure,Internet,Marketing and Advertising,Niche,Real Estate,latitude,longitude
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,...,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,0.247376,0.89153,0.273408,0.073034,0.235955,0.022472,0.044944,0.039326,0.05618,0.129213,...,0.11236,0.033708,0.106742,0.02809,0.297753,0.151685,0.044944,0.033708,0.576703,0.347079
std,0.176857,0.091735,0.193665,0.260926,0.425792,0.148631,0.207765,0.194917,0.230918,0.336382,...,0.316699,0.180985,0.309655,0.165696,0.45856,0.359728,0.207765,0.180985,0.15498,0.42298
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.107813,0.871795,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.533543,0.005457
50%,0.209492,0.910256,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.549261,0.012133
75%,0.362341,0.935897,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.684945,0.877368
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Summary statistics of the transformed test features.
test.describe()

Unnamed: 0,money_raised_float,Founded,CompanySize,Computer & Network Security & Hardware,Computer Software,Consumer Electronics,Consumers Goods & Services,Education,Entertainment,Financial Services,...,Healthcare_health,Human Resources,Information Technology and Services,Infrastructure,Internet,Marketing and Advertising,Niche,Real Estate,latitude,longitude
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,0.342458,0.935897,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.25,...,0.25,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.6358,0.488246
std,0.317414,0.031404,0.288675,0.0,0.0,0.0,0.0,0.5,0.0,0.5,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104623,0.558248
min,0.0,0.910256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.548766,0.005101
25%,0.118856,0.910256,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.549137,0.005265
50%,0.342458,0.929487,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.617519,0.474012
75%,0.56606,0.955128,0.291667,0.0,0.0,0.0,0.0,0.25,0.0,0.25,...,0.25,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.704182,0.956992
max,0.684916,0.974359,0.666667,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.759398,0.999856


In [48]:
#### Utility functions ##################################
def train_predict(train_data, test_data, num_companies=6):
    """Fit a brute-force cosine nearest-neighbour model and query it.

    Parameters
    ----------
    train_data : array-like
        Feature matrix the model is fitted on.
    test_data : array-like
        Query rows. A message is printed when this does not hold exactly
        one row, but the query is still executed.
    num_companies : int, default 6
        Number of neighbours to retrieve per query row.

    Returns
    -------
    Neighbour indices as returned by ``NearestNeighbors.kneighbors`` (the
    distances are discarded).
    """
    row_count = test_data.shape[0]
    if not row_count == 1:
        # Only a notice: execution continues with whatever was passed in.
        print( "Error: Test_data must has a single row for prediction")

    neighbour_model = NearestNeighbors(metric = 'cosine', algorithm = 'brute').fit(train_data)
    _, indices = neighbour_model.kneighbors(test_data, n_neighbors = num_companies)
    return indices

def print_preference(x_test):
    """Print a summary of the interests described by one test row.

    Parameters
    ----------
    x_test : pandas.DataFrame
        Exactly one untransformed row containing "zip_code", "Founded",
        "CompanySize" and the industry indicator columns of the global
        ``data`` frame.

    Raises
    ------
    ValueError
        If ``x_test`` does not contain exactly one row.
    """
    if x_test.shape[0] != 1:
        raise ValueError(
            "Test_data must have a single row for prediction (got {} rows)".format(x_test.shape[0]))

    # get city -- zero-pad to five digits so zips with a leading zero
    # (e.g. Boston 02110, stored numerically as 2110) are looked up correctly
    zip_code = "{:05d}".format(int(x_test["zip_code"].item()))
    city = get_city(zip_code)

    # get industry
    industries = list(data.loc[:,"Computer & Network Security & Hardware":"Real Estate"])
    industry_of_interest = industry_interest_decomposer(industries, x_test)

    print ("-----------------------------------------------------------------------------------------------------")
    print ("Thank you for providing your interests! Below are the summary of your interests\n")
    print ("Headquarters:        {}".format(city))
    print ("Year founded:        {:.0f}".format(x_test["Founded"].item() ))
    print ("Company size:        {}".format(x_test["CompanySize"].item() ))
    print ("Industry:            {}".format(" & ".join(industry_of_interest)))

def print_recommendations(indices):
    """Print the top recommendation in detail and the remaining ones as a table.

    Parameters
    ----------
    indices : 2-D array-like of int
        Neighbour indices for one query; ``indices[0][0]`` is the best match
        and ``indices[0][1:]`` are the runners-up. Rows are looked up by
        position in the global ``data`` and ``temp`` frames.
        NOTE(review): assumes both frames keep the row order of the training
        matrix (``data`` has its index reset before ``temp`` is derived).
    """
    # `.ix` has been removed from pandas; `.iloc` is the positional lookup
    # that matches the indices handed in.
    first_co = data.iloc[indices[0][0]]
    print ("-----------------------------------------------------------------------------------------------------")
    print ("We recommend to check '{}' that matches your interests!\n".format(first_co["CompanyName"]))
    print ("About the start up \n\n{}\n".format(first_co["Description"]))
    print ("Company details\n")
    print ("Website:             {}".format(first_co["Website"]))
    print ("Headquarters:        {}, {}".format(first_co["City"], first_co["State"]))
    print ("Year founded:        {:.0f}".format(first_co["Founded"]))
    print ("Company size:        {}".format(first_co["CompanySize"]))
    print ("Techcrunch article:  {}\n".format(first_co["link"]))
    print ("-----------------------------------------------------------------------------------------------------")
    print ("We also suggest checking following startups\n")
    # other companies to recommend; the float format is scoped to this print
    # so the notebook-wide display settings are left as they were
    with pd.option_context("display.float_format", '{:,.0f}'.format):
        print (temp.iloc[indices[0][1:]])
    print ("-----------------------------------------------------------------------------------------------------")

### wrap above three functions into one
def generate_recommendation(transformed_train_data, transformed_test_data, original_test_data, num_companies = 6):
    """Run the full recommendation flow for one user profile.

    Parameters
    ----------
    transformed_train_data :
        Scaled training features passed to ``train_predict``.
    transformed_test_data :
        Scaled single-row query passed to ``train_predict``.
    original_test_data :
        The same query row before scaling, used to print the user's stated
        preferences.
    num_companies : int, default 6
        Number of companies to retrieve (the first is printed in detail, the
        rest as a table). This value is now forwarded; previously 6 was
        hard-coded and the argument was ignored.
    """
    indices = train_predict(transformed_train_data, transformed_test_data, num_companies=num_companies)

    print_preference(original_test_data)
    print_recommendations(indices)

In [51]:
# Recommend companies for the first test profile. `.loc[0:0]` replaces the
# removed `.ix[0:0,]`; with the default integer index both select row 0 only.
generate_recommendation(train, test.loc[0:0], x_test.loc[0:0])

-----------------------------------------------------------------------------------------------------
Thank you for providing your interests! Below are the summary of your interests

Headquarters:        San Francisco
Year founded:        2015
Company size:        11-50
Industry:            Education & Internet
-----------------------------------------------------------------------------------------------------
We recommend to check 'Edmodo' that matches your interests!

About the start up 

Our mission is to connect all learners to the people and resources needed to achieve their full potential. We are the world's leading global education network that provides communication, collaboration, and coaching tools for all members of the school community. We were founded in 2008 and currently have over 70 million members across 350,000+ schools in 150 countries.

The investors backing Edmodo are some of the best-recognized firms in the world, including Benchmark Capital, Greylock V