# Task for Today  

***

## Startup Funding Prediction  

Given *data about startups in India*, let's try to predict the **funding** provided to a given startup.

We will use a TensorFlow/Keras neural network within a scikit-learn pipeline to make our predictions.

# Getting Started

In [13]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import tensorflow as tf

from sklearn.metrics import r2_score

In [14]:
data = pd.read_csv('startup_funding.csv')

In [15]:
data

Unnamed: 0,Sr No,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks
0,1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
1,2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,
2,3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,
4,5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,
...,...,...,...,...,...,...,...,...,...,...
3039,3040,29/01/2015,Printvenue,,,,Asia Pacific Internet Group,Private Equity,4500000,
3040,3041,29/01/2015,Graphene,,,,KARSEMVEN Fund,Private Equity,825000,Govt backed VC Fund
3041,3042,30/01/2015,Mad Street Den,,,,"Exfinity Fund, GrowX Ventures.",Private Equity,1500000,
3042,3043,30/01/2015,Simplotel,,,,MakeMyTrip,Private Equity,,"Strategic Funding, Minority stake"


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sr No              3044 non-null   int64 
 1   Date dd/mm/yyyy    3044 non-null   object
 2   Startup Name       3044 non-null   object
 3   Industry Vertical  2873 non-null   object
 4   SubVertical        2108 non-null   object
 5   City  Location     2864 non-null   object
 6   Investors Name     3020 non-null   object
 7   InvestmentnType    3040 non-null   object
 8   Amount in USD      2084 non-null   object
 9   Remarks            419 non-null    object
dtypes: int64(1), object(9)
memory usage: 237.9+ KB


# Preprocessing

In [17]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Drop ID and high-cardinality columns
    df = df.drop(['Sr No', 'Startup Name', 'SubVertical', 'Investors Name'], axis=1)
    
    # Clean \\xc2\\xa0 examples
    df = df.applymap(lambda x: x.replace(r'\\xc2\\xa0', '') if isinstance(x, str) else x)
    
    # Clean target column
    df['Amount in USD'] = df['Amount in USD'].apply(lambda x: x.replace(',', '') if isinstance(x, str) else x)
    df['Amount in USD'] = df['Amount in USD'].replace({
        'undisclosed': np.NaN,
        'unknown': np.NaN,
        'Undisclosed': np.NaN,
        'N/A': np.NaN,
        '14342000+': '14342000'
    })
    
    # Drop missing target rows
    df = df.dropna(subset=['Amount in USD']).reset_index(drop=True)
    
    # Drop columns with more than 25% missing values
    if 'Remarks' in df.columns:
        df = df.drop('Remarks', axis=1)
    
    # Fill categorical missing values with most frequent occurrence
    for column in ['Industry Vertical', 'City  Location', 'InvestmentnType']:
        if column in df.columns:
            df[column] = df[column].fillna(df[column].mode()[0])
    
    # Fix incorrect date formats
    df['Date dd/mm/yyyy'] = df['Date dd/mm/yyyy'].replace({
        '05/072018': '05/07/2018',
        '01/07/015': '01/07/2015',
        '22/01//2015': '22/01/2015'
    })

    # Convert date column with error handling
    df['Date dd/mm/yyyy'] = pd.to_datetime(df['Date dd/mm/yyyy'], dayfirst=True, errors='coerce')

    # Drop rows with invalid dates
    df = df.dropna(subset=['Date dd/mm/yyyy'])

    # Extract date features
    df['Year'] = df['Date dd/mm/yyyy'].dt.year
    df['Month'] = df['Date dd/mm/yyyy'].dt.month
    df['Day'] = df['Date dd/mm/yyyy'].dt.day
    df = df.drop('Date dd/mm/yyyy', axis=1)
    
    # Convert target column to float
    df['Amount in USD'] = df['Amount in USD'].astype(float)
    
    # Split df into X and y
    y = df['Amount in USD']
    X = df.drop('Amount in USD', axis=1)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    return X_train, X_test, y_train, y_test

In [18]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

  df = df.applymap(lambda x: x.replace(r'\\xc2\\xa0', '') if isinstance(x, str) else x)


In [19]:
X_train

Unnamed: 0,Industry Vertical,City Location,InvestmentnType,Year,Month,Day
1578,Online Student & Campus Social Networking plat...,Ahmedabad,Seed Funding,2015,10,26
924,eCommerce,Noida,Private Equity,2016,10,4
1108,Consumer Internet,Bangalore,Seed Funding,2016,7,21
1059,Consumer Internet,Mumbai,Private Equity,2016,8,29
160,Consumer Internet,Bengaluru,Seed/ Angel Funding,2018,8,8
...,...,...,...,...,...,...
960,eCommerce,Ahmedabad,Private Equity,2016,10,26
905,Consumer Internet,Mumbai,Private Equity,2016,11,24
1096,eCommerce,New Delhi,Seed Funding,2016,7,15
235,Finance,Chennai,Seed / Angel Funding,2018,5,2


In [20]:
y_train

1578     400000.0
924     4200000.0
1108     595000.0
1059    3000000.0
160     4000000.0
          ...    
960     1000000.0
905     4000000.0
1096     250000.0
235      450000.0
1061    1000000.0
Name: Amount in USD, Length: 1449, dtype: float64

# Building Pipeline

In [24]:
def build_model():
    inputs = tf.keras.Input(shape=(535,))
    x = tf.keras.layers.Dense(128, activation='relu')(inputs)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    outputs = tf.keras.layers.Dense(1, activation='linear')(x)
    
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    
    model.compile(
        optimizer='adam',
        loss='mse'
    )
    
    return model

In [25]:

from scikeras.wrappers import KerasRegressor
nominal_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # ✅ Fixed here
])

# Column transformer
preprocessor = ColumnTransformer(transformers=[
    ('nominal', nominal_transformer, ['Industry Vertical', 'City  Location', 'InvestmentnType'])
], remainder='passthrough')

# Define regressor (Make sure `build_model` is defined)
regressor = KerasRegressor(build_model)

# Full pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', regressor)
])

# Training

In [None]:
model.fit(
    X_train,
    y_train,
    regressor__validation_split=0.2,
    regressor__batch_size=32,
    regressor__epochs=100,
    regressor__callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

# Extract the trained Keras model
keras_model = model.named_steps['regressor'].model_  # Extract only the Keras model

# Save the extracted Keras model
keras_model.save("model.h5")  # ✅ Now this will work!




Epoch 1/100


  return x.astype(dtype, copy=copy, casting=casting)


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 8533966336294912.0000 - val_loss: 29490098005344256.0000
Epoch 2/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 42836397492535296.0000 - val_loss: 29490095857860608.0000
Epoch 3/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 21770394654474240.0000 - val_loss: 29490089415409664.0000
Epoch 4/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 22513144823808000.0000 - val_loss: 29490076530507776.0000
Epoch 5/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 9421807480209408.0000 - val_loss: 29490048613220352.0000
Epoch 6/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 18541219198009344.0000 - val_loss: 29490014253481984.0000
Epoch 7/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 576669882851000



# Results

In [None]:

y_pred = model.predict(X_test)

rmse = np.sqrt(np.mean((y_test - y_pred)**2))
print("     Test RMSE: {:.2f}".format(rmse))

r2 = r2_score(y_test, y_pred)
print("Test R^2 Score: {:.5f}".format(r2))

  return x.astype(dtype, copy=copy, casting=casting)


[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 12216002607579136.0000
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


AttributeError: 'super' object has no attribute '__sklearn_tags__'

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/K5NqUMZomYE