In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the pre-processed article/company table.
data = pd.read_csv("article_after_processing10.csv", encoding='iso-8859-1')

# Count the missing values in every column ...
test = data.isnull().sum()
# ... and display only the columns that actually contain missing values.
test.loc[test > 0]

funding_round    12
Specialties      33
City             46
State            46
latitude         46
longitude        46
dtype: int64

In [3]:
# Prepare training data: keep only companies located in the United States.
data = data[data.Country == "United States"]

# Reset the index because this dataframe is looked up by row number when
# printing recommendations.
data = data.reset_index(drop=True)

# Columns shown in the "also suggested" table of the recommendation output.
display_cols = ['CompanyName', 'money_raised_float', 'Founded', 'CompanySize']
temp = data[display_cols].copy()

# Model features: three company attributes followed by every industry column
# lying between these two labels (inclusive) in the dataframe's column order.
industry_cols = list(data.loc[:, "Computer & Network Security & Hardware":"Real Estate"])
cols = ['money_raised_float', 'Founded', 'CompanySize'] + industry_cols

# Take an explicit copy so that later column assignments on x_train never act
# on a selection of `data` (the cause of pandas' SettingWithCopyWarning).
x_train = data[cols].copy()

In [4]:
#### Utility functions ###################################

def industry_interest_generator(industries, interests):
    """Encode a set of interests as a single-row 0/1 vector over ``industries``.

    Args:
        industries: Ordered sequence of industry names; one output column each.
        interests: Container of the industry names the user is interested in.

    Returns:
        numpy array of shape ``(1, len(industries))`` holding 1 where the
        industry appears in ``interests`` and 0 elsewhere.
    """
    flags = []
    for name in industries:
        flags.append(int(name in interests))
    # Wrap in an outer list so the result is a 2-D, single-row array.
    return np.array([flags])

def industry_interest_decomposer(industries, df):
    """Return the names of the industries flagged with 1 in a one-row dataframe.

    Args:
        industries: List of column names in ``df`` to inspect.
        df: DataFrame with exactly one row that contains those columns.

    Returns:
        List of the names in ``industries`` whose value in that row equals 1,
        in the order given by ``industries``.

    Raises:
        ValueError: if ``df`` does not contain exactly one row. (Previously a
            message was printed and a less helpful pandas ValueError followed
            when the column rename failed.)
    """
    if df.shape[0] != 1:
        raise ValueError(
            "Test_data must have a single row for prediction (got {} rows)".format(df.shape[0])
        )

    # Select the only row positionally so any index label works.
    row = df[industries].iloc[0]
    return list(row[row == 1].index)

In [5]:
##### Generate test data #################################

# Industry columns, in the same order in which they were appended to x_train.
industries = list(data.loc[:, "Computer & Network Security & Hardware":"Real Estate"])

# One single-row 0/1 interest vector per example user profile.
ed_tech = industry_interest_generator(industries, ["Education", "Internet"])
food_tech = industry_interest_generator(industries, ["Food Business", "Internet"])
fin_tech = industry_interest_generator(industries, ["Financial Services", "Internet"])
health_tech = industry_interest_generator(industries, ["Healthcare_health", "Internet"])

industry_interests = np.concatenate((ed_tech, food_tech, fin_tech, health_tech), axis=0)

# Columns: money_raised_float, Founded, CompanySize (given as a numeric code,
# not as a size label), one row per profile above.
x_test = np.array([[10, 2015, 1],
                   [20, 2010, 2],
                   [100, 2010, 4],
                   [200, 2013, 2]])

x_test = np.concatenate((x_test, industry_interests), axis=1)

# Name the columns the same as x_train's so both frames line up.
x_test = pd.DataFrame(x_test, columns=list(x_train))

# Sanity check: decode the second profile back into its industry names.
# `.ix` was removed in pandas 1.0; `.loc[1:1]` is the label-based equivalent
# and keeps the result a one-row DataFrame.
industry_interest_decomposer(industries, x_test.loc[1:1])

['Food Business', 'Internet']

In [6]:
#### Utility functions ###################################

def convert_CompanySize(size):
    """Map a company-size label to an ordinal code.

    Args:
        size: Size label such as ``'51-200'`` or ``'10,001+'``.

    Returns:
        An integer from 1 to 7 for a recognised label (larger code = larger
        company), or 0 for anything else, including missing values.
    """
    size_codes = {
        # NOTE(review): 'Nov-50' is presumably '11-50' mangled by spreadsheet
        # date auto-formatting. Accept both spellings so a clean export is
        # binned identically instead of silently falling through to 0.
        'Nov-50': 1,
        '11-50': 1,
        '51-200': 2,
        '201-500': 3,
        '501-1000': 4,
        '1001-5000': 5,
        '5001-10,000': 6,
        '10,001+': 7,
    }
    try:
        return size_codes.get(size, 0)
    except TypeError:
        # Unhashable input cannot be a known label; treat it as unrecognised.
        return 0
    
def transform(x_train, x_test, companysize_transform=False, pandas_transform=True):
    """Bin, log-transform and min-max scale the training and query features.

    The scaler is fitted on ``x_train`` only and then applied to ``x_test``,
    so scaled query values may fall outside the training range.

    Both inputs are copied first. The previous version assigned into the
    caller's frames, which raised SettingWithCopyWarning and made the call
    non-repeatable: a second run took the log of ``x_test`` again and re-binned
    the already-binned ``CompanySize`` of ``x_train``.

    Args:
        x_train: DataFrame of training features; must contain the columns
            "CompanySize" (size labels) and "money_raised_float".
        x_test: DataFrame of query features with the same columns as
            ``x_train``.
        companysize_transform: If True, ``x_test["CompanySize"]`` also holds
            size labels and is converted with ``convert_CompanySize``; if
            False it is used as given.
        pandas_transform: If True, return DataFrames carrying ``x_train``'s
            column names; otherwise return the scaler's raw output.

    Returns:
        Tuple ``(transformed_x_train, transformed_x_test)``.
    """
    col_names = list(x_train)

    # Never modify the caller's data.
    x_train = x_train.copy()
    x_test = x_test.copy()

    ### transform x_train ##############
    # Binning: size label -> ordinal code.
    x_train["CompanySize"] = x_train["CompanySize"].apply(convert_CompanySize)

    # Logarithmic transformation for money_raised_float because of some outliers.
    # NOTE(review): np.log yields -inf for 0 and NaN for negatives; this assumes
    # every money_raised_float value is strictly positive -- confirm.
    x_train["money_raised_float"] = np.log(x_train["money_raised_float"])

    scaler = MinMaxScaler()
    transformed_x_train = scaler.fit_transform(x_train)

    ### transform x_test ###############
    # Binning (only when the query carries size labels rather than codes).
    if companysize_transform:
        x_test["CompanySize"] = x_test["CompanySize"].apply(convert_CompanySize)

    # Same logarithmic transformation as applied to the training data.
    x_test["money_raised_float"] = np.log(x_test["money_raised_float"])

    transformed_x_test = scaler.transform(x_test)

    ### transform numpy to pandas ######
    if pandas_transform:
        transformed_x_train = pd.DataFrame(transformed_x_train, columns=col_names)
        transformed_x_test = pd.DataFrame(transformed_x_test, columns=col_names)

    return transformed_x_train, transformed_x_test

In [7]:
# Scale the features: the scaler is fitted on x_train and reused for x_test.
# NOTE: `test` rebinds the name that held the NA-count Series in cell 2.
train, test = transform(x_train, x_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [8]:
# Summary statistics of the scaled training features.
train.describe()

Unnamed: 0,money_raised_float,Founded,CompanySize,Computer & Network Security & Hardware,Computer Software,Consumer Electronics,Consumers Goods & Services,Education,Entertainment,Financial Services,Food Business,Healthcare_health,Human Resources,Information Technology and Services,Infrastructure,Internet,Marketing and Advertising,Niche,Real Estate
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,0.247376,0.89153,0.273408,0.073034,0.235955,0.022472,0.044944,0.039326,0.05618,0.129213,0.033708,0.11236,0.033708,0.106742,0.02809,0.297753,0.151685,0.044944,0.033708
std,0.176857,0.091735,0.193665,0.260926,0.425792,0.148631,0.207765,0.194917,0.230918,0.336382,0.180985,0.316699,0.180985,0.309655,0.165696,0.45856,0.359728,0.207765,0.180985
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.107813,0.871795,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.209492,0.910256,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.362341,0.935897,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Summary statistics of the scaled query rows (scaled with the scaler fitted on x_train).
test.describe()

Unnamed: 0,money_raised_float,Founded,CompanySize,Computer & Network Security & Hardware,Computer Software,Consumer Electronics,Consumers Goods & Services,Education,Entertainment,Financial Services,Food Business,Healthcare_health,Human Resources,Information Technology and Services,Infrastructure,Internet,Marketing and Advertising,Niche,Real Estate
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,0.342458,0.935897,0.208333,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.25,0.25,0.0,0.0,0.0,1.0,0.0,0.0,0.0
std,0.317414,0.031404,0.209718,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.910256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,0.118856,0.910256,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,0.342458,0.929487,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,0.56606,0.955128,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.25,0.25,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,0.684916,0.974359,0.5,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [10]:
#### Utility functions ###################################
def train_predict(train_data, test_data, num_companies=6):
    """Find the training rows nearest to the query by cosine distance.

    Args:
        train_data: Feature matrix the nearest-neighbour model is fitted on.
        test_data: Query features; expected to hold exactly one row. If it
            does not, an error message is printed and the search still runs.
        num_companies: Number of neighbours to return per query row.

    Returns:
        The neighbour index array produced by ``kneighbors`` (one row of
        indices per query row).
    """
    row_count = test_data.shape[0]
    if not row_count == 1:  # the query is meant to be a single row
        print( "Error: Test_data must has a single row for prediction")

    knn = NearestNeighbors(metric='cosine', algorithm='brute')
    knn.fit(train_data)

    # Only the indices are needed; the distances are discarded.
    _, neighbour_idx = knn.kneighbors(test_data, n_neighbors=num_companies)
    return neighbour_idx

def print_preference(x_test):
    """Print a summary of the preferences encoded in a one-row query frame.

    Reads the module-level ``data`` frame to obtain the list of industry
    columns.

    Args:
        x_test: DataFrame with exactly one row containing "Founded",
            "CompanySize" and the industry columns.

    Raises:
        ValueError: if ``x_test`` does not contain exactly one row. (Previously
            a message was printed and the call then failed with a ValueError
            inside ``industry_interest_decomposer``.)
    """
    if x_test.shape[0] != 1:
        raise ValueError(
            "Test_data must have a single row for prediction (got {} rows)".format(x_test.shape[0])
        )

    industries = list(data.loc[:, "Computer & Network Security & Hardware":"Real Estate"])
    industry_of_interest = industry_interest_decomposer(industries, x_test)

    print("----------------------------------------------------------------------------------------")
    print("Thank you for providing your interests! Below are the summary of your interests\n")
    # TODO: headquarters is not part of the query features yet; placeholder value.
    print("Headquarters:        {}".format("dummy"))
    print("Year founded:        {}".format(x_test["Founded"].to_string(index=False)))
    print("Company size:        {}".format(x_test["CompanySize"].to_string(index=False)))
    print("Industry:            {}".format(" & ".join(industry_of_interest)))

def print_recommendations(indices):
    """Print the best match in detail, followed by a table of the other matches.

    Reads the module-level ``data`` frame (full company records) and ``temp``
    frame (the columns shown in the summary table).

    Args:
        indices: 2-D array of neighbour row positions as returned by
            ``train_predict``; only its first row is used.
    """
    # Neighbour indices are row positions, so use positional `.iloc`.
    # (`.ix`, used previously, was removed in pandas 1.0.)
    neighbour_pos = indices[0]

    # first company to recommend
    first_co = data.iloc[neighbour_pos[0]]
    print("------------------------------------------------------------------------------------------------------------------")
    print("We recommend to check '{}' that matches your interests!\n".format(first_co["CompanyName"]))
    print("About the start up \n\n{}\n".format(first_co["Description"]))
    print("Company details\n")
    print("Website:             {}".format(first_co["Website"]))
    print("Headquarters:        {}, {}".format(first_co["City"], first_co["State"]))
    print("Year founded:        {:.0f}".format(first_co["Founded"]))
    print("Company size:        {}".format(first_co["CompanySize"]))
    print("Techcrunch article:  {}\n".format(first_co["link"]))
    print("------------------------------------------------------------------------------------------------------------------")
    print("We also suggest checking following startups\n")
    # other companies to recommend
    print(temp.iloc[neighbour_pos[1:]])
    print("------------------------------------------------------------------------------------------------------------------")

### wrap above three functions into one
def generate_recommendation(transformed_train_data, transformed_test_data, original_test_data, num_companies = 6):
    """Run the neighbour search and print the preference summary and matches.

    Args:
        transformed_train_data: Scaled training features to search in.
        transformed_test_data: Scaled one-row query.
        original_test_data: The same query before scaling, used for the
            printed preference summary.
        num_companies: Number of companies to retrieve, including the top
            recommendation.
    """
    # Forward the caller's value; it was previously hard-coded to 6 here, so
    # the parameter had no effect.
    indices = train_predict(transformed_train_data, transformed_test_data, num_companies=num_companies)

    print_preference(original_test_data)
    print_recommendations(indices)

In [11]:
# Recommend for the second example profile (row label 1).
# `.ix` was removed in pandas 1.0; `.loc[1:1]` selects the same row by label
# and keeps each argument a one-row DataFrame.
generate_recommendation(train, test.loc[1:1], x_test.loc[1:1])

----------------------------------------------------------------------------------------
Thank you for providing your interests! Below are the summary of your interests

Headquarters:        dummy
Year founded:        2010
Company size:        2
Industry:            Food Business & Internet
------------------------------------------------------------------------------------------------------------------
We recommend to check 'Postmates' that matches your interests!

About the start up 

Postmates is transforming the way local goods move around a city by enabling anyone to get any product delivered in under an hour. Available for iPhone, Android and on the web, the on-demand logistics service connects customers with local couriers, who purchase and deliver goods from any restaurant or store in a city, 24/7.

Postmatesâ?? passionate community of riders and drivers are currently delivering in 40 major metropolitan markets. See the full list at postmates.com.

Download 