# Startup Funding Regression Project

In [1]:
!pip install fuzzywuzzy[speedup]



In [197]:
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MultiLabelBinarizer
from fuzzywuzzy import process, fuzz
from collections import Counter
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

### For this project I decided to use kaggle´s Indian Startup database due to its rawness and the feature engineering needed to make ML ready.

In [198]:
df = pd.read_csv('indian_startup.csv')
df.head()

Unnamed: 0,Sr No,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks
0,1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
1,2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,
2,3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,
4,5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,


In [199]:
df.loc[df['Investors Name'].str.contains('trinity', na=False, case=False), 'Investors Name']

2931    Trinity Ventures, InterWest Partners, Mohr Dav...
Name: Investors Name, dtype: object

### With a size of 30444 rows, it gives a decent sample size for a regression ML model. Another thing to note is all the 'object' type columns.

In [200]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sr No              3044 non-null   int64 
 1   Date dd/mm/yyyy    3044 non-null   object
 2   Startup Name       3044 non-null   object
 3   Industry Vertical  2873 non-null   object
 4   SubVertical        2108 non-null   object
 5   City  Location     2864 non-null   object
 6   Investors Name     3020 non-null   object
 7   InvestmentnType    3040 non-null   object
 8   Amount in USD      2084 non-null   object
 9   Remarks            419 non-null    object
dtypes: int64(1), object(9)
memory usage: 237.9+ KB


### First thing I look for are the NAN percentages, I immediately spot a column that becomes useless for this project.

In [201]:
df.isna().sum() / len(df) * 100

Sr No                 0.000000
Date dd/mm/yyyy       0.000000
Startup Name          0.000000
Industry Vertical     5.617608
SubVertical          30.749014
City  Location        5.913272
Investors Name        0.788436
InvestmentnType       0.131406
Amount in USD        31.537451
Remarks              86.235217
dtype: float64

### Since there is no way of filling NANs and due to the nature of the column, I decide to drop it.

In [202]:
df = df.drop(columns='Remarks')

### Next thing I notice is that the 'Amount' column is of 'object' type, so I look into which classes are mixed there.

In [203]:
df['Amount in USD'].apply(type).value_counts()

Amount in USD
<class 'str'>      2084
<class 'float'>     960
Name: count, dtype: int64

### If we look into the column, we notice that the numbers are formatted the Indian way, so Python takes them as strings.

In [204]:
df['Amount in USD'].head(10)

0    20,00,00,000
1       80,48,394
2     1,83,58,860
3       30,00,000
4       18,00,000
5       90,00,000
6    15,00,00,000
7       60,00,000
8     7,00,00,000
9     5,00,00,000
Name: Amount in USD, dtype: object

### I decide to write a function that cleans the strings and converts them to float type.

In [205]:
def clean_amount(value):
    if isinstance(value, str):
        try:
            return float(value.replace(',', ''))
        except ValueError:
            return np.nan
    elif isinstance(value, (int, float)):
        return value
    else:
        return np.nan
    

df['Amount in USD'] = df['Amount in USD'].apply(clean_amount)

### Here I hit one of the main stonewalls in this project, the SubVertital column is too inconsistent to be a feature, due to it being more of a 'business description' value rather than a categorical subvertical.

In [206]:
df['SubVertical'].head(20)

0                                   E-learning
1                    App based shuttle service
2        Retailer of baby and toddler products
3                            Online Investment
4                  Embroiled Clothes For Women
5     Open-market, freight management platform
6                Online Food Delivery Platform
7                                     Agritech
8                                   Automobile
9                      Satellite Communication
10            Logistics Services and Solutions
11                Food Solutions For Corporate
12    Online Meat And Seafood Ordering Startup
13               Non-Banking Financial Company
14               Experience Discovery Platform
15             Real money based gaming startup
16              Online Eyewear Shopping Portal
17      Business and customer engagement tools
18             Men's Health and Wellness brand
19                                   Elearning
Name: SubVertical, dtype: object

### That incosistency makes it so there are 1942 different values in a 3044 long database, 63,7% unique values. But there is potential here to create a valuable feature.

In [207]:
df['SubVertical'].nunique()

1942

### What I decide to go for is to create a new column, which will be my feature, and fill it with 'X' to mark the uncategorized ones.

In [208]:
df['Sub-Vertical'] = 'X'

### To handle the NANs here, due to it being a SubVertical, I decide to fill in with the business´s Vertical. 

In [209]:
df['SubVertical'] = df['SubVertical'].fillna(df['Industry Vertical'])

### And here comes the 'tedious' part, I´ll manually look at the database and recognize keywords and patterns so I can create my own subset of subverticals and recategorize the startups.

In [210]:
df.loc[(df['Sub-Vertical'] == 'X') & (df['SubVertical'].str.contains(r'web', regex=True, case=False, na=False)), ['Industry Vertical','SubVertical','Sub-Vertical']].tail(50)

Unnamed: 0,Industry Vertical,SubVertical,Sub-Vertical
206,Consumer Internet,Subscription-Only News Website,X
616,Technology,Omni-channel web commerce solutions provider,X
785,Consumer Internet,Online Rummy playing Website,X
1142,Consumer Internet,Self-Branded Websites & Apps for Ecommerce,X
1192,Technology,SaaS-based web optimisation and marketing plat...,X
1468,Consumer Internet,Wedding Website Creator,X
1477,Technology,Mobile App & Web Development company,X
1612,Consumer Internet,Web/Mobile Music Streaming service,X
1679,Consumer Internet,Web Video Streaming Platform,X
1957,Consumer Internet,Dynamic Restaurant Website creator,X


### After looking at the database, I noticed 2 scenarios:
1. A set of keywords, in which case I filter by that subset of words and replace with the new category.
2. Descriptions were there is no keyword, but the context itself gives away which category it falls into, in this case I manually recategorize each one using its index.

In [211]:
def classify(rows, label):
    df.loc[rows, 'Sub-Vertical'] = label

df.loc[df['SubVertical'].str.contains('health|medic|doctor|clinic', case=False, na=False), 'Sub-Vertical'] = 'Health Related Services'
classify([21,28,71,99,117,151,154,161,237,304,197], 'Health Related Services')

df.loc[df['SubVertical'].str.contains('diagnos', case=False, na=False),'Sub-Vertical'] = 'Diagnostics & Medical Testing'

df.loc[df['SubVertical'].str.contains('pharma', case=False, na=False), 'Sub-Vertical'] = 'Pharmacy'

# Here I first filter by 'food' and label all of them as 'Food Related Services'.Then if they include 'food delivery' or other, I recategorize as 'Grocery / Food Delivery' 
df.loc[df['SubVertical'].str.contains('beverages|meat|food', case=False, na=False), 'Sub-Vertical'] = 'Food Related Services'
classify([6,11,12,20,56,59,66,53,126,130,1856],'Food Related Services')
df.loc[df['SubVertical'].str.contains('food delivery|food ordering|grocer', case=False, na=False), 'Sub-Vertical'] = 'Grocery / Food Delivery Services'
classify([132,190,181,249,649], 'Grocery / Food Delivery Services')

df.loc[df['SubVertical'].str.contains('car rent|carpool|bike rent|taxi|bicycle|drivers|online car| cab |cab ', case=False, na=False), 'Sub-Vertical'] = 'Transportation Rental Services'
classify([1,26,29,64,60,87,89,110,133,172,183,247,2099,621,221,721],'Transportation Rental Services')

df.loc[df['SubVertical'].str.contains(r'(?=.*\belectric\b)(?=.*\bmanufacturer\b)', regex=True, case=False, na=False), 'Sub-Vertical'] = 'Electric Vehicle Manufacturer'

classify([8,44,46,50,61,94,116,134,147,205,724,1023,2102,679,1081,1258],'Automotive Related Services')

classify([236,692,849,1165,1180,1196,1418,1492,1518,1877,1951], 'Doorstep Delivery Services')

df.loc[df['SubVertical'].str.contains('logistic|supply chain|truck', case=False, na=False), 'Sub-Vertical'] = 'Logistic & Supply Chain Related Services'
classify([5,10,32,45,69,92,112,11,145,111,152,149,150,253], 'Logistic & Supply Chain Related Services')

df.loc[df['SubVertical'].str.contains('healthc|medic|doctor|clinic', case=False, na=False), 'Sub-Vertical'] = 'Health Related Services'
classify([21,28,71,99,117,151,154,161,237,304,197], 'Health Related Services')

df.loc[df['SubVertical'].str.contains('diagnos', case=False, na=False),'Sub-Vertical'] = 'Diagnostics & Medical Testing'

df.loc[df['SubVertical'].str.contains('pharma', case=False, na=False), 'Sub-Vertical'] = 'Pharmacy'

df.loc[df['SubVertical'].str.contains('fitness|wellness| spa |lifestyle', case=False, na=False), 'Sub-Vertical'] = 'Fitness & Wellness Related Services'
classify([18,42,93,34,102,136,222,224,2098], 'Fitness & Wellness Related Services')

df.loc[df['SubVertical'].str.contains('baby', case=False, na=False), 'Sub-Vertical'] = 'Baby Supplies'
classify([2,159], 'Baby Supplies')

df.loc[df['SubVertical'].str.contains('fashion', case=False, na=False), 'Sub-Vertical'] = 'Fashion Retail'
classify([4,72,91,105,113,144,1599], 'Fashion Retail')

df.loc[df['SubVertical'].str.contains('eyewear', case=False, na=False), 'Sub-Vertical'] = 'Eyewear Retail'
classify([16], 'Eyewear Retail')

df.loc[df['SubVertical'].str.contains(r'(?=.*\blearning\b)(?=.*\bplatform\b)', regex=True, case=False, na=False), 'Sub-Vertical'] = 'E-learning'
df.loc[df['SubVertical'].str.contains(r'(?=.*\blearning\b)(?=.*\bapp\b)', regex=True, case=False, na=False), 'Sub-Vertical'] = 'E-learning'
df.loc[df['SubVertical'].str.contains(r'(?=.*\bkids\b)(?=.*\blearning\b)', regex=True, case=False, na=False), 'Sub-Vertical'] = 'E-learning'
df.loc[df['SubVertical'].str.contains('online certi', regex=True, case=False, na=False), 'Sub-Vertical'] = 'E-Learning'
classify([0,19,73,104,106,122,174,255,285,2024,2019,2010,2007,1916,1880,1868,1801,1756,1696,1601,1540,1469,1139,1005,983,920,634,483,479,441,371,370,290,285,2000], 'E-learning')

df.loc[df['SubVertical'].str.contains('tutor|coach', regex=True, case=False, na=False), 'Sub-Vertical'] = 'Coach & Tutoring Services'

df.loc[df['SubVertical'].str.contains('education|ed-tech|school|skill', regex=True, case=False, na=False), 'Sub-Vertical'] = 'Education'
classify([30,38,67,137,57,2064,2028,1827,1682,854,418,412,406,384,269,2020,316], 'Education')

df.loc[df['SubVertical'].str.contains('game|gaming', case=False, na=False), 'Sub-Vertical'] = 'Gaming'
classify([15,24,129,158], 'Gaming')

df.loc[df['SubVertical'].str.contains('agri-|agriculture|farmer', case=False, na=False), 'Sub-Vertical'] = 'Agritech'
classify([7,22,81,95,591], 'Agritech')

df.loc[df['SubVertical'].str.contains('artificial int', case=False, regex=True, na=False), 'Sub-Vertical'] = 'AI Related'
df.loc[df['SubVertical'].str.contains(r'\bAI\b', case=False, regex=True, na=False), 'Sub-Vertical'] = 'AI Related'
classify([80,65,51,141,157,177,178,182,189,254], 'AI Related')

df.loc[df['SubVertical'].str.contains('wealth|invest|stock', case=False, na=False), 'Sub-Vertical'] = 'Investment/Wealth Management'
classify([98,55,47,3,188,220,250,246,238], 'Investment/Wealth Management')

df.loc[df['SubVertical'].str.contains('lending|loan|financ|payment|wallet', case=False, na=False), 'Sub-Vertical'] = 'Banking/Payments/Financial Services'
classify([78,77,103,86,109,85,13,23,54,31,35,36,162,163,173,175,165,184,225,251,257,1526,1296,495,698,1210,1903,2147], 'Banking/Payments/Financial Services')
classify([118,127,138,142,223,230,215,841,798,819], 'Non Banking Financial Services')

df.loc[df['SubVertical'].str.contains('ecommerce', case=False, na=False), 'Sub-Vertical'] = 'Ecommerce'
classify([131,128,124,123,49,96,27,169,176,191], 'Ecommerce')

df.loc[df['SubVertical'].str.contains('property|real estate', case=False, na=False), 'Sub-Vertical'] = 'Real Estate'
classify([125,74], 'Real Estate')

df.loc[df['SubVertical'].str.contains('travel|hotel', case=False, na=False), 'Sub-Vertical'] = 'Travel Related Services'
classify([155,121,101,170,219], 'Travel Related Services')
classify([146,97,75,2008,1316], 'Wine & Beer')

df.loc[df['SubVertical'].str.contains('marketing', case=False, na=False), 'Sub-Vertical'] = 'Marketing'
classify([107,37,], 'Marketing')

df.loc[df['SubVertical'].str.contains('robotic', case=False, na=False), 'Sub-Vertical'] = 'Robotics'

df.loc[df['SubVertical'].str.contains('video', case=False, na=False), 'Sub-Vertical'] = 'Video Content & Streaming Platforms'

df.loc[df['SubVertical'].str.contains('insurance', case=False, na=False), 'Sub-Vertical'] = 'Insurance / InsurTech'

df.loc[df['SubVertical'].str.contains('communic', case=False, na=False), 'Sub-Vertical'] = 'Communication Platforms'

df.loc[df['SubVertical'].str.contains('media|news|entertainment', case=False, na=False), 'Sub-Vertical'] = 'Digital Media / Content'

df.loc[df['SubVertical'].str.contains('SaaS|Software', case=False, na=False),'Sub-Vertical'] = 'Software / SaaS'

df.loc[df['SubVertical'].str.contains('data analy|analy|data', case=False, na=False),'Sub-Vertical'] = 'Data Science/Analytics'

df.loc[df['SubVertical'].str.contains('spaces', case=False, na=False),'Sub-Vertical'] = 'Co-Working & Co-Living Spaces'

df.loc[df['SubVertical'].str.contains('career|job', case=False, na=False),'Sub-Vertical'] = 'Career & Recruitment Services'

df.loc[df['SubVertical'].str.contains('fund', case=False, na=False),'Sub-Vertical'] = 'Crowdfunding / Fundraising Platforms'

df.loc[df['SubVertical'].str.contains('matchm|dating', case=False, na=False),'Sub-Vertical'] = 'Dating Platforms'

df.loc[df['SubVertical'].str.contains('home', case=False, na=False),'Sub-Vertical'] = 'Home Related Services'

df.loc[df['SubVertical'].str.contains('cybersecurity|cyber security|security', case=False, na=False),'Sub-Vertical'] = 'Cybersecurity'

df.loc[df['SubVertical'].str.contains('tea |herbal', case=False, na=False),'Sub-Vertical'] = 'Tea Sales'

df.loc[df['SubVertical'].str.contains('social network', case=False, na=False),'Sub-Vertical'] = 'Social Networks'

df.loc[df['SubVertical'].str.contains('beauty', case=False, na=False),'Sub-Vertical'] = 'Beauty Related Services'

df.loc[df['SubVertical'].str.contains('waste|recycl', case=False, na=False),'Sub-Vertical'] = 'Waste/Recycling Related Services'

df.loc[df['SubVertical'].str.contains('business', case=False, na=False),'Sub-Vertical'] = 'Business Development'

df.loc[df['SubVertical'].str.contains('restaurant', case=False, na=False),'Sub-Vertical'] = 'Restaurant Tech & Services'

df.loc[df['SubVertical'].str.contains('pet', case=False, na=False),'Sub-Vertical'] = 'Pet Related Services'

df.loc[df['SubVertical'].str.contains('customer|b2c', case=False, na=False),'Sub-Vertical'] = 'Customer Support/Engagement'

df.loc[df['SubVertical'].str.contains(r'(?=.*\bonline\b)(?=.*\bfurniture\b)', regex=True, case=False, na=False),'Sub-Vertical'] = 'Furniture Retail & Rent'

df.loc[df['SubVertical'].str.contains('app dev', case=False, na=False),'Sub-Vertical'] = 'App Development' # esta categoria se puede ampliar

df.loc[df['SubVertical'].str.contains('solar|energy', case=False, na=False),'Sub-Vertical'] = 'Energy'

df.loc[df['SubVertical'].str.contains('cleani', case=False, na=False),'Sub-Vertical'] = 'Cleaning Services'

df.loc[df['SubVertical'].str.contains('hyperlocal|handyman', case=False, na=False),'Sub-Vertical'] = 'Hyperlocal Services'

df.loc[df['SubVertical'].str.contains('jewel', case=False, na=False),'Sub-Vertical'] = 'Jewellery'
# e books

### To help visualize progress, I created a percentage and rows counter.

In [212]:
# USE THIS CELL ONLY FOR PARTIAL RUNS!!!
df.loc[df['Sub-Vertical'] == 'X', 'Sub-Vertical'] = df['SubVertical']

In [213]:
progress = (df['Sub-Vertical'] != 'X').mean() * 100
rows = (df['Sub-Vertical'] != 'X').sum()

print(f"{progress:.1f}% done")
print(f'{rows} rows done')

100.0% done
3044 rows done


### Next, I plan to do the same with the Vertical column, but due to it having way clearer categories, I´ll wait until after running the model and seeing the scores.

In [214]:
df['Industry Vertical'].value_counts()

Industry Vertical
Consumer Internet             941
Technology                    478
eCommerce                     186
Healthcare                     70
Finance                        62
                             ... 
Startup Analytics platform      1
Mobile Food Ordering app        1
Financial Markets Software      1
Hiring Analytics platform       1
Flat rental Mobile App          1
Name: count, Length: 821, dtype: int64

### The other main wall I hit is the 'Investor Name' column. This column is a set of strings written in inconsistent form, mentioning the investors.
### 2 problems here:
1. The name of the investors is written differently (letter cases, spelling, spaced, separation symbols).
2. It is not a list, so each row is a string and we cant access the investors so the model can learn information about them.

In [215]:
df['Investors Name'].value_counts()

Investors Name
Undisclosed Investors                                                                                                                                             39
Undisclosed investors                                                                                                                                             30
Ratan Tata                                                                                                                                                        25
Indian Angel Network                                                                                                                                              23
Kalaari Capital                                                                                                                                                   16
                                                                                                                                                                

In [216]:
df['Investors Name'].nunique()

2412

### To get a clean set of investors I go with 3 steps:
1. Clean and standarize the strings so I can turn them into lists.
2. Remove redudants words like 'Capital' or 'India' that may mislead fuzzywuzzy when mapping the values.
3. Use fuzzywuzzy´s toke_set_ratio to get the list of unique investors and then map them.

In [217]:
# group|limited|ltd|inc|corp|corporation|company|co|management|

In [218]:
df['Investors Name'] = df['Investors Name'].fillna('Undisclosed')

redundant_words = r'\b(india|capital|partners?|ventures?|holdings?|advisors?)\b'

df['Investors List'] = df['Investors Name'].str.lower().str.replace(' and ',' , ').str.replace(' & ',' , ').str.replace(redundant_words,'', regex=True).str.split(',').apply(lambda x: [i.strip() for i in x])

In [141]:
all_investors = set([i for sublist in df['Investors List'] for i in sublist])

canonical_names = {}
for inv in all_investors:
    if canonical_names:  
        result = process.extractOne(inv, canonical_names.keys(), scorer=fuzz.token_set_ratio)
        if result:  
            match, score = result
            if score >= 70:
                canonical_names[inv] = match
            else:
                canonical_names[inv] = inv
        else:  
            canonical_names[inv] = inv
    else:
        canonical_names[inv] = inv  


### After using fuzzywuzzy I take a look into the results to see if it is good enough for mapping the database.

In [219]:
df1 = pd.DataFrame(canonical_names.items(), columns=['name','canon'])
df1[df1['canon'] == ''] # The '' represents a fund composed by only redundant words: the 'India Capital', so this mapping correclty identifies it.

Unnamed: 0,name,canon
0,,


### After looking at fuzzywuzzy´s result dictionary and being ok with it, I proceed to map the database´s values and running a value_counts to see the distribution.

In [220]:
df['Investors List'] = df['Investors List'].apply(lambda lst: [canonical_names[i] for i in lst])

investors = [inv for sublist in df['Investors List'] for inv in sublist]
counts = Counter(investors)
pd.DataFrame(counts.items(), columns=['Investor','Count']).sort_values('Count', ascending=False)

Unnamed: 0,Investor,Count
28,others from letsventure,211
18,undisclosed existing investors as well as the ...,133
2,sequoia .,123
24,accel (formerly known as accel ),94
107,,79
...,...,...
1761,trinity,1
1760,tano,1
14,nptk,1
10,ravikanth reddy,1


### In order to not overload the features and flood the model with a high number of dummies, I take the main investors and map the rest as 'Rare'.

In [234]:
counts_s = pd.Series(counts).sort_values(ascending=False)
counts_s[counts_s > 5]

others from letsventure                                                211
undisclosed existing investors as well as the tamarind family trust    133
sequoia .                                                              123
accel (formerly known as accel )                                        94
                                                                        79
                                                                      ... 
entrepreneurship (ciie)                                                  7
hero enterprise                                                          7
sunil goyal ashish gupta                                                 7
multiple investors                                                       7
iifl                                                                     7
Length: 178, dtype: int64

In [235]:
main_investors = counts_s[counts_s > 4]

df['Investors List'] = df['Investors List'].apply(lambda inv_list: [inv if inv in main_investors else 'Rare' for inv in inv_list])

In [236]:
investors = [inv for sublist in df['Investors List'] for inv in sublist]
new_counts = Counter(investors)
pd.DataFrame(new_counts.items(), columns=['Investor','Count']).sort_values('Count', ascending=False)

Unnamed: 0,Investor,Count
1,Rare,1494
17,others from letsventure,130
2,sequoia .,105
11,undisclosed existing investors as well as the ...,101
15,accel (formerly known as accel ),78
...,...,...
230,rajeev arora,1
239,kamal kothari,1
256,micromax,1
264,anurag gupta,1


### Since models cannot read lists, I use a MultiLabelBinarizer to include the investor dummies in the main df.

In [237]:
mlb = MultiLabelBinarizer()

investor_dummies = mlb.fit_transform(df['Investors List'])

investor_columns = pd.DataFrame(investor_dummies, columns=mlb.classes_, index=df.index)

df = pd.concat([df, investor_columns], axis=1)


### Since the signal given by a startup having more then one 'Rare' investor is lost in the dummies, I decided to create a 'Rare' investor count feature.

In [238]:
df['Rare Investor Count'] = df['Investors List'].apply(lambda inv_list: sum(1 for inv in inv_list if inv not in main_investors))

### FInally, I drop NANs before creating my testing and training sets.

In [239]:
df = df.dropna()

In [240]:
df.isna().sum() / len(df) * 100

Sr No                            0.0
Date dd/mm/yyyy                  0.0
Startup Name                     0.0
Industry Vertical                0.0
SubVertical                      0.0
                                ... 
y combinator                     0.0
yournest angel fund\\xc2\\xa0    0.0
youwecan                         0.0
zeeshan hayat                    0.0
zodius                           0.0
Length: 558, dtype: float64

### For this regression project, our sets are as follows:
1. Features: Industry Vertical, Industry SubVertical, Investors, Number of Investors, Investing type and Location.
2. Target: Amount of USD invested in the startup.

In [241]:
df.columns

Index(['Sr No', 'Date dd/mm/yyyy', 'Startup Name', 'Industry Vertical',
       'SubVertical', 'City  Location', 'Investors Name', 'InvestmentnType',
       'Amount in USD', 'Sub-Vertical',
       ...
       'vishal jain', 'vy', 'warburg pincus', 'westbridge', 'white unicorn',
       'y combinator', 'yournest angel fund\\xc2\\xa0', 'youwecan',
       'zeeshan hayat', 'zodius'],
      dtype='object', length=558)

In [242]:
X = pd.get_dummies(df.drop(columns=['Sr No','Date dd/mm/yyyy','Startup Name','Amount in USD','SubVertical','Investors Name','Investors List']))
y = df['Amount in USD']
y = np.log1p(y) # applied a log1p transformation to the target to reduce skewness and stabilize variance.

random_state=42
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=random_state)

In [243]:
train = X_train.shape
test = X_test.shape

print(f'Train Set Shape: {train}')
print(f'Test Set Shape: {test}')

Train Set Shape: (1541, 1857)
Test Set Shape: (386, 1857)


### I decide to go with a RandomForestRegressor through a RandomizedSearchCV to find the best parameters.

In [244]:
rf = RandomForestRegressor(random_state=random_state)

param_grid = {
    'n_estimators': [100, 150, 200],     
    'max_depth': [5, 10, 15, 20, 25],       
    'min_samples_split': [2, 5, 10, 15],          
    'min_samples_leaf': [1, 2, 4, 6],             
    'max_features': ['sqrt', 'log2', 0.7, 0.8],   
    'bootstrap': [True, False]                   
}

rfs = RandomizedSearchCV(rf, 
                        param_distributions=param_grid, 
                        cv=5, 
                        n_iter=30,
                        random_state=random_state,
                        verbose=0,
                        scoring=['neg_root_mean_squared_error', 'neg_mean_absolute_error', 'r2'],
                        refit='neg_root_mean_squared_error'
                        )

rfs.fit(X_train, y_train)

In [195]:
params = rfs.best_params_
params

{'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 0.7,
 'max_depth': 15,
 'bootstrap': True}

### As the main objective, I´m looking for a r2 score of 0.7 - 0.8 at minimum and a difference of < 0.12 between train and test sets. Plus a RMSE as low as possible (1M - 10m ballpark).

In [245]:
train_pred = rfs.predict(X_train)
test_pred = rfs.predict(X_test)

trmse = np.sqrt(mean_squared_error(np.expm1(y_train), np.expm1(train_pred)))
tmae = mean_absolute_error(np.expm1(y_train), np.expm1(train_pred))
tr2 = r2_score(y_train, train_pred)

rmse = np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(test_pred)))
mae = mean_absolute_error(np.expm1(y_test), np.expm1(test_pred))
r2 = r2_score(y_test, test_pred)

print(f'64% SV + investors list (count > 6) + "rare" count TRAIN SCORES:')
print(f'RMSE: {trmse:.2f}')
print(f'MAE: {tmae:.2f}')
print(f'R2: {tr2:.2f}')
print('----------------------')
print(f'64% SV + investor list (count > 6) + "rare" count TEST SCORES:')
print(f'RMSE: {rmse:.2f}')
print(f'MAE: {mae:.2f}')
print(f'R2: {r2:.2f}')

64% SV + investors list (count > 6) + "rare" count TRAIN SCORES:
RMSE: 130341585.24
MAE: 15588298.96
R2: 0.77
----------------------
64% SV + investor list (count > 6) + "rare" count TEST SCORES:
RMSE: 62737896.19
MAE: 11636428.40
R2: 0.63


### Finally I decided to take a look into the results of the predictions.

In [136]:
y_pred = rfs.predict(X_test)
true_y_pred = np.expm1(y_pred)

results = pd.DataFrame({
    'Actual': np.expm1(y_test),
    'Predicted': true_y_pred
})

results['Error'] = results['Actual'] - results['Predicted']
results['Actual'] = results['Actual'].map('{:,.0f}'.format)
results['Predicted'] = results['Predicted'].map('{:,.0f}'.format)
results['Error'] = results['Error'].map('{:,.0f}'.format)

results

Unnamed: 0,Actual,Predicted,Error
86,52000000,6042442,45957558
586,250000000,5255493,244744507
423,50000000,5226847,44773153
772,5000000,5261726,-261726
1859,1000000,244615,755385
...,...,...,...
533,2000000,5192332,-3192332
687,3000000,5261726,-2261726
419,7400000,7825210,-425210
1682,1000000,5213489,-4213489
