In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

In [18]:
# Load the processed article/company dataset.
data = pd.read_csv("article_after_processing10.csv", encoding='iso-8859-1')

# Count the missing values in each column so that gaps in the data are visible
# before any modelling is done.
test = data.isnull().sum()
test[test > 0]  # display only the columns that contain at least one NA

funding_round    12
Specialties      33
City             46
State            46
latitude         46
longitude        46
dtype: int64

In [3]:
# Prepare training data: keep only the companies located in the United States.
data = data[data.Country == "United States"]

# Reset the index so that row labels equal row positions; the recommendation
# cells look rows up in this dataframe by the positions the model returns.
data = data.reset_index(drop=True)

feature_cols = ['money_raised_float', 'Founded', 'CompanySize']
cols = ['CompanyName'] + feature_cols

# Raw (untransformed) values, used to print readable recommendations.
temp = data[cols]

# Take an explicit copy: the features are transformed later on, and writing
# into a bare selection of `data` triggers pandas' SettingWithCopyWarning.
x_train = data[feature_cols].copy()
data.shape

(178, 62)

In [4]:
##### Generate test data #################################
# Four hand-made query rows, one company profile per row, in the column order
# [money_raised_float, Founded, CompanySize].
# NOTE(review): CompanySize presumably already holds the bin number here, not
# the raw label - confirm against convert_CompanySize.
sample_rows = np.array([
    [10, 2015, 1],
    [20, 2010, 2],
    [100, 2010, 4],
    [200, 2013, 2],
])

# Give the query frame the same column names as x_train.
x_test = pd.DataFrame(sample_rows, columns=list(x_train))
x_test

Unnamed: 0,money_raised_float,Founded,CompanySize
0,10,2015,1
1,20,2010,2
2,100,2010,4
3,200,2013,2


In [5]:
#### Utility functions ###################################

# Ordinal bin for every known company-size label.
# 'Nov-50' looks like a spreadsheet date-mangled form of '11-50'; both
# spellings are accepted so that a clean export lands in the same bin.
_COMPANY_SIZE_BINS = {
    'Nov-50': 1,
    '11-50': 1,
    '51-200': 2,
    '201-500': 3,
    '501-1000': 4,
    '1001-5000': 5,
    '5001-10,000': 6,
    '10,001+': 7,
}


def convert_CompanySize(size):
    """Map a company-size label to an ordinal bin number.

    Args:
        size: The size label, e.g. '51-200'. Any other value (an unknown
            label, NaN, None, ...) is accepted as well.

    Returns:
        int: 1-7 for the known labels, in increasing order of head count, and
        0 for anything that is not a known label.
    """
    try:
        return _COMPANY_SIZE_BINS.get(size, 0)
    except TypeError:
        # Unhashable values (e.g. a list) can never be a known label.
        return 0
    
def transform(x_train, x_test, companysize_transform=False, pandas_transform=True):
    """Bin, log-transform and min-max scale the training and query features.

    Both frames must contain the columns "CompanySize" and
    "money_raised_float". The scaler is fitted on the training data only and
    then applied to the query data, so query values may fall outside the
    range spanned by the scaled training data.

    The inputs are not modified: all work happens on private copies. Writing
    into the callers' frames raised pandas' SettingWithCopyWarning and made
    the function unsafe to call twice on the same data (the amounts would be
    log-transformed again).

    Args:
        x_train: DataFrame of training features; CompanySize holds the raw
            size labels understood by convert_CompanySize.
        x_test: DataFrame of query features with the same columns.
        companysize_transform: If True, x_test's CompanySize is also run
            through convert_CompanySize; if False it is used as given.
        pandas_transform: If True, return DataFrames carrying x_train's
            column names; if False, return the scaler's raw output.

    Returns:
        tuple: (transformed_x_train, transformed_x_test).
    """
    col_names = list(x_train)

    # Private copies keep the callers' data untouched.
    x_train = x_train.copy()
    x_test = x_test.copy()

    ### transform x_train ##############
    # binning
    x_train["CompanySize"] = x_train["CompanySize"].apply(convert_CompanySize)

    # Logarithmic transformation for money_raised_float because of some outliers.
    # NOTE(review): np.log gives -inf/NaN for zero or negative amounts; such
    # rows are not handled here.
    x_train["money_raised_float"] = np.log(x_train["money_raised_float"])

    scaler = MinMaxScaler()
    transformed_x_train = scaler.fit_transform(x_train)

    ### transform x_test ###############
    # binning
    if companysize_transform:
        x_test["CompanySize"] = x_test["CompanySize"].apply(convert_CompanySize)

    # Same logarithmic transformation as for the training data.
    x_test["money_raised_float"] = np.log(x_test["money_raised_float"])

    # Reuse the scaler fitted on the training data.
    transformed_x_test = scaler.transform(x_test)

    ### transform numpy to pandas ######
    if pandas_transform:
        transformed_x_train = pd.DataFrame(transformed_x_train, columns=col_names)
        transformed_x_test = pd.DataFrame(transformed_x_test, columns=col_names)

    return transformed_x_train, transformed_x_test

In [6]:
# Bin, log-transform and min-max scale the training features and the query rows.
train, test = transform(x_train, x_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [7]:
# Sanity check: after min-max scaling every training feature should span 0 to 1.
train.describe()

Unnamed: 0,money_raised_float,Founded,CompanySize
count,178.0,178.0,178.0
mean,0.247376,0.89153,0.273408
std,0.176857,0.091735,0.193665
min,0.0,0.0,0.0
25%,0.107813,0.871795,0.166667
50%,0.209492,0.910256,0.166667
75%,0.362341,0.935897,0.333333
max,1.0,1.0,1.0


In [8]:
# The query rows are scaled with the scaler fitted on the training data,
# so their min/max are not necessarily 0 and 1.
test.describe()

Unnamed: 0,money_raised_float,Founded,CompanySize
count,4.0,4.0,4.0
mean,0.342458,0.935897,0.208333
std,0.317414,0.031404,0.209718
min,0.0,0.910256,0.0
25%,0.118856,0.910256,0.125
50%,0.342458,0.929487,0.166667
75%,0.56606,0.955128,0.25
max,0.684916,0.974359,0.5


In [9]:
# List every column name in the dataset.
list(data)

['title',
 'link',
 'excerpt',
 'published_at',
 'funding_round',
 'money_raised',
 'Company',
 'money_raised_float',
 'linkedin_link',
 'Company_at_Linkedin',
 'CompanyName',
 'Specialties',
 'Industry',
 'Website',
 'Location',
 'CompanySize',
 'Description',
 'Founded',
 'Also-viewed',
 'zip_code',
 'City',
 'address_check',
 'Country',
 'State',
 'latitude',
 'longitude',
 'Industry_consolidated',
 'spc_Logistics and Supply Chain',
 'spc_commerce',
 'spc_mobile',
 'spc_app',
 'spc_analysis',
 'spc_developer',
 'spc_security',
 'spc_social',
 'spc_ds',
 'spc_travel',
 'spc_booking_ticketing',
 'spc_Apparel',
 'spc_cloud',
 'spc_API',
 'spc_device',
 'spc_design',
 'spc_enterprise',
 'spc_robotics_manufacturing',
 'Computer & Network Security & Hardware',
 'Computer Software',
 'Consumer Electronics',
 'Consumers Goods & Services',
 'Education',
 'Entertainment',
 'Financial Services',
 'Food Business',
 'Healthcare_health',
 'Human Resources',
 'Information Technology and Services',

In [10]:
#### Utility functions ###################################
def train_predict(train_data, test_data, num_companies=6):
    """Return the nearest training rows for every query row.

    A brute-force nearest-neighbour model using the cosine metric is fitted on
    train_data and then queried with test_data.

    Args:
        train_data: Feature matrix of the known companies.
        test_data: Feature matrix of the queries, with the same columns.
        num_companies: Number of neighbours to return per query.

    Returns:
        The index array produced by NearestNeighbors.kneighbors: one row per
        query, num_companies neighbour indices per row.
    """
    knn = NearestNeighbors(metric='cosine', algorithm='brute').fit(train_data)

    # Only the neighbour indices are needed; the distances are discarded.
    _distances, neighbour_idx = knn.kneighbors(test_data, n_neighbors=num_companies)
    return neighbour_idx



In [11]:
# Recommend for the query rows labelled 1 and 2 only
# (.loc slices by label and includes both end points).
indices = train_predict(train, test.loc[1:2,])
indices

array([[ 58,  29,  25,  80,   5,  42],
       [ 98, 105,  23, 121,  86, 148]], dtype=int64)

In [12]:
# Look up the full record of the best match for the first query
# (row 58 in the run shown here).
# The model returns row positions, so use .iloc; the old .ix indexer has been
# removed from pandas.
# NOTE: this rebinds `test`, which until now held the scaled query rows.
test = data.iloc[indices[0][0]]
test["CompanyName"]

'Entelo'

In [13]:
# Remaining matches for the first query (everything after the best match),
# shown with their raw values. .iloc replaces the removed .ix indexer.
temp.iloc[indices[0][1:]]

Unnamed: 0,CompanyName,money_raised_float,Founded,CompanySize
29,Duolingo,20.0,2011.0,51-200
25,"Conductor, Inc.",20.0,2008.0,51-200
80,Distil Networks,21.0,2011.0,51-200
5,"Fugue, Inc.",20.0,2013.0,51-200
42,Engine Yard,19.0,2006.0,51-200


In [14]:
def print_recommendations(indices):
    """Print the recommendations for the first query in `indices`.

    The best match is printed in detail (description, website, headquarters,
    founding year, size and article link), followed by a table of the
    remaining matches.

    Reads the notebook-level dataframes `data` (full company records) and
    `temp` (name plus raw feature values). The values in `indices` are used
    as row positions in both.

    Args:
        indices: Neighbour indices, one row per query; only the first row is
            used. Its first entry is the best match.
    """
    # Unpack indices: the first entry of the first row is the company to
    # recommend. Positional lookup (.iloc) replaces the removed .ix indexer.
    first_co = data.iloc[indices[0][0]]
    separator = "----------------------------------------------------------------------------------------"

    print(separator)
    print("We recommend to check '{}' that matches your interests!\n".format(first_co["CompanyName"]))
    print("About the start up \n\n{}\n".format(first_co["Description"]))
    print("Company details\n")
    print("Website:             {}".format(first_co["Website"]))
    print("Headquarters:        {}, {}".format(first_co["City"], first_co["State"]))
    print("Year founded:        {:.0f}".format(first_co["Founded"]))
    print("Company size:        {}".format(first_co["CompanySize"]))
    print("Techcrunch article:  {}\n".format(first_co["link"]))
    print(separator)
    print("We also suggest checking following startups\n")
    # Every match after the best one, shown with the raw feature values.
    print(temp.iloc[indices[0][1:]])
    print(separator)

In [15]:
# Show the recommendation report for the first query in `indices`.
print_recommendations(indices)

----------------------------------------------------------------------------------------
We recommend to check 'Entelo' that matches your interests!

About the start up 

Entelo empowers the modern recruiter. Through our data science, talent teams identify candidates who are most qualified and receptive to new opportunities, faster than anyone else. Today, over 600 customers gain a competitive advantage by trusting Entelo to provide their teams with actionable insights, higher candidate engagement, and significant time savings.

Company details

Website:             http://www.entelo.com
Headquarters:        San Francisco, CA
Year founded:        2011
Company size:        51-200
Techcrunch article:  https://techcrunch.com/2017/06/15/entelo-steps-up-its-ai-game/

----------------------------------------------------------------------------------------
We also suggest checking following startups

        CompanyName  money_raised_float  Founded CompanySize
29         Duolingo             