In [2]:
import pandas as pd
import numpy as np
import warnings

import seaborn as sns
import matplotlib.pyplot as plt

#Plotly Libraris
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.colors import n_colors
from plotly.subplots import make_subplots
import plotly.io as pio



warnings.filterwarnings('ignore')

pio.templates.default = "ggplot2"

#dataframe display settings
pd.set_option('display.max_columns', 5000000)
pd.set_option('display.max_rows', 50000000)

In [3]:
# LOAD ALL THE DATA
## Data from dataset 1
acquisitions = pd.read_csv('data/acquisitions.csv')
funding_rounds = pd.read_csv('data/funding_rounds.csv')
funds = pd.read_csv('data/funds.csv')
investments = pd.read_csv('data/investments.csv')
ipos = pd.read_csv('data/ipos.csv')
milestones = pd.read_csv('data/milestones.csv')
objects = pd.read_csv('data/objects.csv')
offices = pd.read_csv('data/offices.csv')
people = pd.read_csv('data/people.csv')
relationships = pd.read_csv('data/relationships.csv')

## Data from dataset 2
founders = pd.read_csv('data/dataset2.csv')

## Data from dataset 3
industry_trends = pd.read_csv('data/dataset3.csv')
industry_trends.rename(columns={"Unnamed: 0": "StartupID", "Unnamed: 6": "City_ZIP"}, inplace=True)
## Data from dataset 4
startup_info = pd.read_csv('data/dataset4.csv')


In [4]:
# To begin with we will start using the dataset 4 since it is the most simplified one
startup_info.head(5)

Unnamed: 0,permalink,name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at
0,/organization/-fame,#fame,http://livfame.com,Media,10000000,operating,IND,16,Mumbai,Mumbai,1,,2015-01-05,2015-01-05
1,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,700000,operating,USA,DE,DE - Other,Delaware City,2,2014-09-04,2014-03-01,2014-10-14
2,/organization/-the-one-of-them-inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,3406878,operating,,,,,1,,2014-01-30,2014-01-30
3,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,2000000,operating,CHN,22,Beijing,Beijing,1,2007-01-01,2008-03-19,2008-03-19
4,/organization/004-technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,-,operating,USA,IL,"Springfield, Illinois",Champaign,1,2010-01-01,2014-07-24,2014-07-24


In [5]:
startup_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66368 entries, 0 to 66367
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   permalink          66368 non-null  object
 1   name               66367 non-null  object
 2   homepage_url       61310 non-null  object
 3   category_list      63220 non-null  object
 4   funding_total_usd  66368 non-null  object
 5   status             66368 non-null  object
 6   country_code       59410 non-null  object
 7   state_code         57821 non-null  object
 8   region             58338 non-null  object
 9   city               58340 non-null  object
 10  funding_rounds     66368 non-null  int64 
 11  founded_at         51147 non-null  object
 12  first_funding_at   66344 non-null  object
 13  last_funding_at    66368 non-null  object
dtypes: int64(1), object(13)
memory usage: 7.1+ MB


In [6]:
startup_info['founded_year'] = pd.to_datetime(startup_info['founded_at'], errors = 'coerce').dt.year
startup_info['funding_total_usd'].replace('-', 0, inplace=True)
startup_info['funding_total_usd'] = startup_info['funding_total_usd'].astype(float)
startup_info.describe()

Unnamed: 0,funding_total_usd,funding_rounds,founded_year
count,66368.0,66368.0,51143.0
mean,14918920.0,1.732522,2007.762294
std,169093000.0,1.360251,10.302143
min,0.0,1.0,1749.0
25%,40000.0,1.0,2006.0
50%,1000000.0,1.0,2010.0
75%,6800473.0,2.0,2013.0
max,30079500000.0,19.0,2105.0


In [7]:
startup_info = startup_info.drop_duplicates()
startup_info.shape

(66368, 15)

In [8]:
startup_info.isna().sum()

permalink                0
name                     1
homepage_url          5058
category_list         3148
funding_total_usd        0
status                   0
country_code          6958
state_code            8547
region                8030
city                  8028
funding_rounds           0
founded_at           15221
first_funding_at        24
last_funding_at          0
founded_year         15225
dtype: int64

In [9]:
# Cleanning dataset
clean_startup_info = startup_info.drop(['permalink', 'homepage_url'], axis=1)

# Remove the NaN name
clean_startup_info.dropna(how='any', subset=['name'], axis=0, inplace=True)

#Extracting year value from "first_funding_at" and changing to int
clean_startup_info['first_funding_at'] = clean_startup_info.first_funding_at.str.split("-").str[0]
clean_startup_info['first_funding_at'] = clean_startup_info['first_funding_at'].astype(float)
#Extracting year value from "last_funding_at" and changing to int
clean_startup_info['last_funding_at'] = clean_startup_info.last_funding_at.str.split("-").str[0]
clean_startup_info['last_funding_at'] = clean_startup_info['last_funding_at'].astype(float)

# Treat Nan category
clean_startup_info['category_list'] = clean_startup_info['category_list'].replace(np.nan, 'other')
clean_startup_info.head()

Unnamed: 0,name,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at,founded_year
0,#fame,Media,10000000.0,operating,IND,16,Mumbai,Mumbai,1,,2015.0,2015.0,
1,:Qounter,Application Platforms|Real Time|Social Network...,700000.0,operating,USA,DE,DE - Other,Delaware City,2,2014-09-04,2014.0,2014.0,2014.0
2,"(THE) ONE of THEM,Inc.",Apps|Games|Mobile,3406878.0,operating,,,,,1,,2014.0,2014.0,
3,0-6.com,Curated Web,2000000.0,operating,CHN,22,Beijing,Beijing,1,2007-01-01,2008.0,2008.0,2007.0
4,004 Technologies,Software,0.0,operating,USA,IL,"Springfield, Illinois",Champaign,1,2010-01-01,2014.0,2014.0,2010.0


In [10]:
# Check false entries
filter_0 = len(clean_startup_info[clean_startup_info['first_funding_at']>clean_startup_info['last_funding_at']])
filter_1 = len(clean_startup_info[clean_startup_info['first_funding_at']<1749])
total = filter_0 + filter_1
print(f'There are {total} entries that are wrong by common sense')
print(clean_startup_info[clean_startup_info['first_funding_at']<1749][['name', 'founded_at', 'first_funding_at']])
clean_startup_info.drop(clean_startup_info[clean_startup_info['first_funding_at']<1749].index, inplace=True)



There are 3 entries that are wrong by common sense
                   name  founded_at  first_funding_at
7409   Blaze Bioscience  2010-01-01            1532.0
22208         Gamewheel         NaN            1201.0
37176           Mousera  2013-01-01            1014.0


In [11]:
startups_history = clean_startup_info.groupby('founded_year').size().reset_index(name='#Startups')
funding_history = clean_startup_info.groupby('first_funding_at').size().reset_index(name='#Startups')

In [12]:
# Plot the distribution using Plotly Express with a line plot
fig = px.area(startups_history, x='founded_year', y='#Startups', 
              title='Distribution of Startups by Founded Year and First funding year')

fig.add_trace(go.Scatter(x=startups_history['founded_year'], 
                         y=startups_history['#Startups'], 
                         fill='tozeroy',  # Filling area below the line
                         mode='none', 
                         fillcolor='rgba(255,0,0,0.4)',  # Adjust color as needed
                         name='Founded year')) 

# Add an area chart for the second DataFrame
fig.add_trace(go.Scatter(x=funding_history['first_funding_at'], 
                         y=funding_history['#Startups'], 
                         fill='tozeroy',  # Filling area below the line
                         mode='none', 
                         fillcolor='rgba(0,176,246,0.3)',  # Adjust color as needed
                         name='First funding Year'))  # Legend label

# Add range slider
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=10,
                     label="10y",
                     step="year",
                     stepmode="todate"),
                dict(count=20,
                     label="20y",
                     step="year",
                     stepmode="todate"),
                dict(count=50,
                     label="50y",
                     step="year",
                     stepmode="todate"),
                dict(count=100,
                     label="100y",
                     step="year",
                     stepmode="todate"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            visible=True
        ),
        type="date"
    )
)

fig.show()

In [13]:
fig_country = go.Figure()


# Define a custom greens colorscale with shades of green
colorscale = [[i / 15, f'rgba(0, {int(176 - (i * (176 / 15)))}, 0, {0.5 + (i / 30)})'] for i in range(15)]

# Create the bar plot
fig_country = go.Figure(go.Bar(
    x=clean_startup_info['country_code'].value_counts().index[:15],
    y=clean_startup_info['country_code'].value_counts()[:15],
    hovertemplate='<br><b>Country</b>: %{x}'+'<br><i>Startup count</i>: %{y}',
    marker=dict(color=clean_startup_info['country_code'].value_counts()[:15], colorscale=colorscale)
))


fig_country.update_layout(title="Number of Startups in each Country",
                             xaxis_title="Country", yaxis_title="Startup Count",title_x=0.5, paper_bgcolor="silver",
                             title_font_size=20)
fig_country.show()


In [22]:
fig_country_map = go.Figure()
fig_country_map.add_trace(go.Choropleth(locations=clean_startup_info['country_code'].value_counts().index,
                                       z=clean_startup_info['country_code'].value_counts(),
                                       colorscale='greens',
                                       colorbar_title="Nos. of Startups founded",
                                       ))
fig_country_map.update_layout(title_text="Number of Startups by Country", title_x=0.5, title_font_size=20)
fig_country_map.show()

In [23]:
# The most radical approach is to consider a successful startup that one that is considered a Unicorn

startup_unicorns = ['Uber', 'Google', 'Alibaba', 'Apple',
                    'Amazon', 'Airbnb', 'Facebook', 'PayPal', 
                    'Xiaomi', 'Pinterest', 'Coursera', 'Stripe']

color = ['Black','Orange','Blue','Darkblue', 
         'lightblue', 'darkorange','teal', 'red',
         'lightgreen','orange','cyan','pink']

unicorn_startups = clean_startup_info.set_index('name', drop=True)

unicorn_founding_year = []
total_funding=[]
#Extracting its details from the dataset
for i in startup_unicorns:
    unicorn_founding_year.append(int(unicorn_startups.loc[i]['founded_year']))
    total_funding.append(int(unicorn_startups.loc[i]['funding_total_usd']))    
df_unicorns = pd.DataFrame(list(zip(startup_unicorns, unicorn_founding_year, total_funding, color)),
                           columns=['Unicorn name', 'Founding year','Total funding','Color']).sort_values(by='Total funding')
df_unicorns.head()


Unnamed: 0,Unicorn name,Founding year,Total funding,Color
3,Apple,1976,250000,Darkblue
4,Amazon,1994,8000000,lightblue
1,Google,1998,26100000,Orange
10,Coursera,2012,146100000,cyan
7,PayPal,1998,197000000,red


In [24]:
fig_unicorn_founded = go.Figure()
fig_unicorn_founded.add_trace(go.Histogram(x=clean_startup_info[(clean_startup_info['founded_year']>1970) & (clean_startup_info['founded_year']<2025)]['founded_year'], name="Founded year", marker=dict(opacity=0.9),
                                          hovertemplate = '<br><b>Year</b>: %{x}'+'<br><i>Startup count</i>: %{y}'
                                          ))

for i in df_unicorns['Unicorn name']:
    fig_unicorn_founded.add_shape(
            # Line Vertical
            dict(
                type="line",
                xref="x",
                yref="paper",
                x0=str(list(df_unicorns[df_unicorns['Unicorn name']==i]['Founding year'])[0]),
                y0=0,
                x1=str(list(df_unicorns[df_unicorns['Unicorn name']==i]['Founding year'])[0]),
                y1=1,
                line=dict(
                    color=str(list(df_unicorns[df_unicorns['Unicorn name']==i]['Color'])[0]),
                    width=1
                )
    ))
    
fig_unicorn_founded.add_trace(go.Scatter(x=df_unicorns['Founding year'],
                                        y=df_unicorns.index*700+300, mode="text", text=df_unicorns[['Unicorn name']], 
                                        textfont=dict(family="sans serif",size=15), showlegend=False,
                                        hovertemplate = '<br><b>Company</b>: %{text}'+'<br><i>Founding Year</i>: %{x}'
                                        ))

fig_unicorn_founded.update_layout(title="Years in which some Startup Unicorns were founded",
                             xaxis_title="Year", yaxis_title="Startup Count",title_x=0.5, paper_bgcolor="mintcream",
                             title_font_size=20,legend=dict(orientation='h',yanchor='top',y=1.1,xanchor='right',x=1))
fig_unicorn_founded.show()

In [25]:
categories_list = clean_startup_info['category_list'].str.split('|', expand=True).stack()

fig_market = go.Figure()
fig_market.add_trace(go.Bar(x=categories_list.value_counts().index[:30], y=categories_list.value_counts()[:30],
                           hovertemplate = '<br><b>Market</b>: %{x}'+'<br><i>Startup count</i>: %{y}',
                           marker=dict(color=list(range(30)), colorscale="Sunsetdark")))

fig_market.update_layout(title="Number of Startups in each Market",
                             xaxis_title="Market", yaxis_title="Startup Count",title_x=0.5, paper_bgcolor="mintcream",
                             title_font_size=20)
fig_market.show()

In [26]:
closed_categories = clean_startup_info[clean_startup_info['status']=='closed']['category_list'].str.split('|', expand=True).stack()
fig_closed = make_subplots(rows=1, cols=2, shared_yaxes=True,
                           subplot_titles=("Markets with Most closed Startups", "Markets with Least closed Startups"))
fig_closed.add_trace(go.Bar(x=closed_categories.value_counts()[:10].index,
                            y=closed_categories.value_counts()[:10], name="Market with most closed Startups",
                            marker=dict(color=list(range(20)), colorscale="reds_r"),
                           hovertemplate = '<br><b>Market</b>: %{x}'+'<br><i>Startup count</i>: %{y}'
                           ),
                            row=1, col=1)
fig_closed.add_trace(go.Bar(x=closed_categories.value_counts()[-10:].index,
                            y=closed_categories.value_counts()[-10:], name="Market with least closed Startups",
                            marker=dict(color=list(range(20)), colorscale="greens_r"),
                           hovertemplate = '<br><b>Market</b>: %{x}'+'<br><i>Startup count</i>: %{y}'
                           ),
                            row=1, col=2)

fig_closed.update_layout(showlegend=False, paper_bgcolor="mintcream")
fig_closed.show()

In [27]:
acquired_categories = clean_startup_info[clean_startup_info['status']=='acquired']['category_list'].str.split('|', expand=True).stack()
fig_closed = make_subplots(rows=1, cols=2, shared_yaxes=True,
                           subplot_titles=("Markets with Most Acquired Startups", "Markets with Least Acquired Startups"))
fig_closed.add_trace(go.Bar(x=acquired_categories.value_counts()[:10].index,
                            y=acquired_categories.value_counts()[:10], name="Market with most Acquired Startups",
                            marker=dict(color=list(range(20)), colorscale="greens_r"),
                           hovertemplate = '<br><b>Market</b>: %{x}'+'<br><i>Startup count</i>: %{y}'
                           ),
                            row=1, col=1)
fig_closed.add_trace(go.Bar(x=acquired_categories.value_counts()[-10:].index,
                            y=acquired_categories.value_counts()[-10:], name="Market with least Acquired Startups",
                            marker=dict(color=list(range(20)), colorscale="reds_r"),
                           hovertemplate = '<br><b>Market</b>: %{x}'+'<br><i>Startup count</i>: %{y}'
                           ),
                            row=1, col=2)

fig_closed.update_layout(showlegend=False, paper_bgcolor="mintcream")
fig_closed.show()

In [28]:
operating_categories = clean_startup_info[clean_startup_info['status']=='operating']['category_list'].str.split('|', expand=True).stack()
fig_closed = make_subplots(rows=1, cols=2, shared_yaxes=True,
                           subplot_titles=("Markets with Most Operating Startups", "Markets with Least Operating Startups"))
fig_closed.add_trace(go.Bar(x=operating_categories.value_counts()[:10].index,
                            y=operating_categories.value_counts()[:10], name="Market with most Operating Startups",
                            marker=dict(color=list(range(20)), colorscale="greens_r"),
                           hovertemplate = '<br><b>Market</b>: %{x}'+'<br><i>Startup count</i>: %{y}'
                           ),
                            row=1, col=1)
fig_closed.add_trace(go.Bar(x=operating_categories.value_counts()[-10:].index,
                            y=operating_categories.value_counts()[-10:], name="Market with least Operating Startups",
                            marker=dict(color=list(range(20)), colorscale="reds_r"),
                           hovertemplate = '<br><b>Market</b>: %{x}'+'<br><i>Startup count</i>: %{y}'
                           ),
                            row=1, col=2)

fig_closed.update_layout(showlegend=False, paper_bgcolor="mintcream")
fig_closed.show()

In [29]:
operating_startups = clean_startup_info[clean_startup_info['status']=='operating']
fig_country_map = go.Figure()
fig_country_map.add_trace(go.Choropleth(locations=operating_startups['country_code'].value_counts().index,
                                       z=operating_startups['country_code'].value_counts(),
                                       colorscale='greens',
                                       colorbar_title="Nos. of Startups founded",
                                       ))
fig_country_map.update_layout(title_text="Number of Startups Operating by Country", title_x=0.5, title_font_size=20)
fig_country_map.show()

In [31]:
acquired_startups = clean_startup_info[clean_startup_info['status']=='acquired']
fig_country_map = go.Figure()
fig_country_map.add_trace(go.Choropleth(locations=acquired_startups['country_code'].value_counts().index,
                                       z=acquired_startups['country_code'].value_counts(),
                                       colorscale='blues',
                                       colorbar_title="Nos. of Startups founded",
                                       ))
fig_country_map.update_layout(title_text="Number of Startups Acquired by Country", title_x=0.5, title_font_size=20)
fig_country_map.show()

In [32]:
closed_startups = clean_startup_info[clean_startup_info['status']=='closed']
fig_country_map = go.Figure()
fig_country_map.add_trace(go.Choropleth(locations=closed_startups['country_code'].value_counts().index,
                                       z=closed_startups['country_code'].value_counts(),
                                       colorscale='reds',
                                       colorbar_title="Nos. of Startups founded",
                                       ))
fig_country_map.update_layout(title_text="Number of Startups Closed by Country", title_x=0.5, title_font_size=20)
fig_country_map.show()

In [33]:
fig_funding_amt = px.scatter(clean_startup_info[:1000], x="name", y="funding_rounds", size='funding_total_usd', color='status')

fig_funding_amt.update_layout(
    title='Plot Showing the Funding and Total funding acquired by Startups',
    xaxis_title="Startups",
    yaxis_title="Funding Rounds",
    xaxis_showticklabels=False,
    paper_bgcolor="mintcream",
    title_font_size=20,
    title_x=0.5,
    legend=dict(orientation='h', yanchor='top', y=1.08, xanchor='right', x=1),
    margin=dict(b=100),
    xaxis=dict(title_standoff=45),  # Adjust the distance of x-axis title from the x-axis
    yaxis=dict(title_standoff=20)   # Adjust the distance of y-axis title from the y-axis
)

fig_funding_amt.update_traces(hovertemplate='<br><b>Company</b>: %{x}' + '<br><i>Funding Rounds</i>: %{y}' + '<br><i>Funding(in USD)</i>: %{marker.size}')
fig_funding_amt.show()


In [34]:
fig_status = make_subplots(rows=2, cols=2, specs=[[{"type": "domain", "colspan": 2}, None],[{"type": "domain"}, {"type": "domain"}]],
                          subplot_titles = ("Current status of all Startups", "Status of Startups founded before 2000", 
                                            "Status of Startups founded after 2000"))

fig_status.add_trace(go.Pie(labels=clean_startup_info['status'].value_counts().index,
                            values=clean_startup_info['status'].value_counts()), row=1, col=1)

fig_status.add_trace(go.Pie(labels=clean_startup_info[clean_startup_info['founded_year']<2000]['status'].value_counts().index,
                            values=clean_startup_info[clean_startup_info['founded_year']<2000]['status'].value_counts()), row=2, col=1)

fig_status.add_trace(go.Pie(labels=clean_startup_info[clean_startup_info['founded_year']>=2000]['status'].value_counts().index,
                            values=clean_startup_info[clean_startup_info['founded_year']>=2000]['status'].value_counts()), row=2, col=2)

fig_status.update_traces(hoverinfo='label+percent', textinfo='value+percent', textfont_size=11,
                         insidetextorientation='horizontal', rotation=-45)
fig_status.update_layout(height=800, paper_bgcolor="mintcream")
fig_status.show()

In [35]:
# Let's get deeper analyzing the Dataset 3

industry_trends.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 923 entries, 0 to 922
Data columns (total 49 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   StartupID                 923 non-null    int64  
 1   state_code                923 non-null    object 
 2   latitude                  923 non-null    float64
 3   longitude                 923 non-null    float64
 4   zip_code                  923 non-null    object 
 5   id                        923 non-null    object 
 6   city                      923 non-null    object 
 7   City_ZIP                  430 non-null    object 
 8   name                      923 non-null    object 
 9   labels                    923 non-null    int64  
 10  founded_at                923 non-null    object 
 11  closed_at                 335 non-null    object 
 12  first_funding_at          923 non-null    object 
 13  last_funding_at           923 non-null    object 
 14  age_first_

In [36]:
industry_trends['founded_year'] = pd.to_datetime(industry_trends['founded_at'], errors = 'coerce').dt.year


In [37]:
industry_trends.describe()
# Oldest startup => 1984

Unnamed: 0,StartupID,latitude,longitude,labels,age_first_funding_year,age_last_funding_year,age_first_milestone_year,age_last_milestone_year,relationships,funding_rounds,funding_total_usd,milestones,is_CA,is_NY,is_MA,is_TX,is_otherstate,is_software,is_web,is_mobile,is_enterprise,is_advertising,is_gamesvideo,is_ecommerce,is_biotech,is_consulting,is_othercategory,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,founded_year
count,923.0,923.0,923.0,923.0,923.0,923.0,771.0,771.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0
mean,572.297941,38.517442,-103.539212,0.646804,2.23563,3.931456,3.055353,4.754423,7.710726,2.310943,25419750.0,1.84182,0.527627,0.114843,0.089924,0.045504,0.221018,0.165764,0.156013,0.08559,0.07909,0.067172,0.056338,0.027086,0.036836,0.00325,0.32286,0.326111,0.254605,0.508126,0.392199,0.232936,0.099675,2.838586,0.809317,2005.496208
std,333.585431,3.741497,22.394167,0.478222,2.510449,2.96791,2.977057,3.212107,7.265776,1.390922,189634400.0,1.322632,0.499507,0.319005,0.286228,0.208519,0.415158,0.37207,0.363064,0.27991,0.270025,0.250456,0.230698,0.162421,0.188462,0.056949,0.467823,0.469042,0.435875,0.500205,0.488505,0.422931,0.299729,1.874601,0.393052,3.528738
min,1.0,25.752358,-122.756956,0.0,-9.0466,-9.0466,-14.1699,-7.0055,0.0,1.0,11000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1984.0
25%,283.5,37.388869,-122.198732,0.0,0.5767,1.66985,1.0,2.411,3.0,1.0,2725000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5,1.0,2003.0
50%,577.0,37.779281,-118.374037,1.0,1.4466,3.5288,2.5205,4.4767,5.0,2.0,10000000.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.5,1.0,2006.0
75%,866.5,40.730646,-77.214731,1.0,3.57535,5.56025,4.6863,6.7534,10.0,3.0,24725000.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,3.8,1.0,2008.0
max,1153.0,59.335232,18.057121,1.0,21.8959,21.8959,24.6849,24.6849,63.0,10.0,5700000000.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,16.0,1.0,2013.0


In [38]:
industry_trends.isna().sum()

StartupID                     0
state_code                    0
latitude                      0
longitude                     0
zip_code                      0
id                            0
city                          0
City_ZIP                    493
name                          0
labels                        0
founded_at                    0
closed_at                   588
first_funding_at              0
last_funding_at               0
age_first_funding_year        0
age_last_funding_year         0
age_first_milestone_year    152
age_last_milestone_year     152
relationships                 0
funding_rounds                0
funding_total_usd             0
milestones                    0
state_code.1                  1
is_CA                         0
is_NY                         0
is_MA                         0
is_TX                         0
is_otherstate                 0
category_code                 0
is_software                   0
is_web                        0
is_mobil

In [39]:
# Cleanning dataset
clean_industry_trends = industry_trends.drop(['StartupID', 'City_ZIP'], axis=1)

# Remove the NaN name
clean_industry_trends.dropna(how='any', subset=['name'], axis=0, inplace=True)

#Extracting year value from "first_funding_at" and changing to int
clean_industry_trends['first_funding_at'] = clean_industry_trends.first_funding_at.str.split("/").str[2]
clean_industry_trends['first_funding_at'] = clean_industry_trends['first_funding_at'].astype(float)
#Extracting year value from "last_funding_at" and changing to int
clean_industry_trends['last_funding_at'] = clean_industry_trends.last_funding_at.str.split("/").str[2]
clean_industry_trends['last_funding_at'] = clean_industry_trends['last_funding_at'].astype(float)

In [40]:
# Check false entries
filter_0 = len(clean_industry_trends[clean_industry_trends['first_funding_at']>clean_industry_trends['last_funding_at']])
filter_1 = len(clean_industry_trends[clean_industry_trends['first_funding_at']<1984])
total = filter_0 + filter_1
print(f'There are {total} entries that are wrong by common sense')
print(clean_industry_trends[clean_industry_trends['first_funding_at']<1984][['name', 'founded_at', 'first_funding_at']])
clean_industry_trends.drop(clean_industry_trends[clean_industry_trends['first_funding_at']<1984].index, inplace=True)


There are 0 entries that are wrong by common sense
Empty DataFrame
Columns: [name, founded_at, first_funding_at]
Index: []


In [41]:
startups_history = clean_industry_trends.groupby('founded_year').size().reset_index(name='#Startups')
funding_history = clean_industry_trends.groupby('first_funding_at').size().reset_index(name='#Startups')
# Plot the distribution using Plotly Express with a line plot
fig = px.area(startups_history, x='founded_year', y='#Startups', 
              title='Distribution of Startups by Founded Year and First funding year')

fig.add_trace(go.Scatter(x=startups_history['founded_year'], 
                         y=startups_history['#Startups'], 
                         fill='tozeroy',  # Filling area below the line
                         mode='none', 
                         fillcolor='rgba(255,0,0,0.4)',  # Adjust color as needed
                         name='Founded year')) 

# Add an area chart for the second DataFrame
fig.add_trace(go.Scatter(x=funding_history['first_funding_at'], 
                         y=funding_history['#Startups'], 
                         fill='tozeroy',  # Filling area below the line
                         mode='none', 
                         fillcolor='rgba(0,176,246,0.3)',  # Adjust color as needed
                         name='First funding Year'))  # Legend label

# Add range slider
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=10,
                     label="10y",
                     step="year",
                     stepmode="todate"),
                dict(count=20,
                     label="20y",
                     step="year",
                     stepmode="todate"),
                dict(count=50,
                     label="50y",
                     step="year",
                     stepmode="todate"),
                dict(count=100,
                     label="100y",
                     step="year",
                     stepmode="todate"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            visible=True
        ),
        type="date"
    )
)

fig.show()

In [42]:
fig_country = go.Figure()


# Define a custom greens colorscale with shades of green
colorscale = [[i / 15, f'rgba(0, {int(176 - (i * (176 / 15)))}, 0, {0.5 + (i / 30)})'] for i in range(15)]

# Create the bar plot
fig_country = go.Figure(go.Bar(
    x=clean_industry_trends['state_code'].value_counts().index[:15],
    y=clean_industry_trends['state_code'].value_counts()[:15],
    hovertemplate='<br><b>Country</b>: %{x}'+'<br><i>Startup count</i>: %{y}',
    marker=dict(color=clean_industry_trends['state_code'].value_counts()[:15], colorscale=colorscale)
))


fig_country.update_layout(title="Number of Startups in each State",
                             xaxis_title="Country", yaxis_title="Startup Count",title_x=0.5, paper_bgcolor="silver",
                             title_font_size=20)
fig_country.show()

In [43]:
fig_country_map = go.Figure()
fig_country_map.add_trace(go.Choropleth(locations=clean_industry_trends['state_code'].value_counts().index,
                                       z=clean_industry_trends['state_code'].value_counts(),
                                       locationmode="USA-states",
                                       colorscale='greens',
                                       colorbar_title="Nos. of Startups founded",
                                       ))
fig_country_map.update_layout(title_text="Number of Startups by State in USA", title_x=0.5, title_font_size=20,
                              geo = dict(
                                        scope='usa',
                                        projection=go.layout.geo.Projection(type = 'albers usa'),
                                        showlakes=True, # lakes
                                        lakecolor='rgb(255, 255, 255)'),
                                        paper_bgcolor="mintcream")
fig_country_map.show()

In [44]:
categories_list = clean_industry_trends['category_code']
fig_market = go.Figure()
fig_market.add_trace(go.Bar(x=categories_list.value_counts().index[:30], y=categories_list.value_counts()[:30],
                           hovertemplate = '<br><b>Market</b>: %{x}'+'<br><i>Startup count</i>: %{y}',
                           marker=dict(color=list(range(30)), colorscale="Sunsetdark")))

fig_market.update_layout(title="Number of Startups in each Market",
                             xaxis_title="Market", yaxis_title="Startup Count",title_x=0.5, paper_bgcolor="mintcream",
                             title_font_size=20)
fig_market.show()

In [45]:
closed_categories = clean_industry_trends[clean_industry_trends['status']=='closed']['category_code']
fig_closed = make_subplots(rows=1, cols=2, shared_yaxes=True,
                           subplot_titles=("Markets with Most closed Startups", "Markets with Least closed Startups"))
fig_closed.add_trace(go.Bar(x=closed_categories.value_counts()[:10].index,
                            y=closed_categories.value_counts()[:10], name="Market with most closed Startups",
                            marker=dict(color=list(range(20)), colorscale="reds_r"),
                           hovertemplate = '<br><b>Market</b>: %{x}'+'<br><i>Startup count</i>: %{y}'
                           ),
                            row=1, col=1)
fig_closed.add_trace(go.Bar(x=closed_categories.value_counts()[-10:].index,
                            y=closed_categories.value_counts()[-10:], name="Market with least closed Startups",
                            marker=dict(color=list(range(20)), colorscale="greens_r"),
                           hovertemplate = '<br><b>Market</b>: %{x}'+'<br><i>Startup count</i>: %{y}'
                           ),
                            row=1, col=2)

fig_closed.update_layout(showlegend=False, paper_bgcolor="mintcream")
fig_closed.show()

In [46]:
closed_categories = clean_industry_trends[clean_industry_trends['status']=='acquired']['category_code']
fig_closed = make_subplots(rows=1, cols=2, shared_yaxes=True,
                           subplot_titles=("Markets with Most Acquired Startups", "Markets with Least Acquired Startups"))
fig_closed.add_trace(go.Bar(x=closed_categories.value_counts()[:10].index,
                            y=closed_categories.value_counts()[:10], name="Market with most Acquired Startups",
                            marker=dict(color=list(range(20)), colorscale="greens_r"),
                           hovertemplate = '<br><b>Market</b>: %{x}'+'<br><i>Startup count</i>: %{y}'
                           ),
                            row=1, col=1)
fig_closed.add_trace(go.Bar(x=closed_categories.value_counts()[-10:].index,
                            y=closed_categories.value_counts()[-10:], name="Market with least Acquired Startups",
                            marker=dict(color=list(range(20)), colorscale="reds_r"),
                           hovertemplate = '<br><b>Market</b>: %{x}'+'<br><i>Startup count</i>: %{y}'
                           ),
                            row=1, col=2)

fig_closed.update_layout(showlegend=False, paper_bgcolor="mintcream")
fig_closed.show()

In [47]:
closed_usa = clean_industry_trends[clean_industry_trends['status']=='closed']
fig_country_map = go.Figure()
fig_country_map.add_trace(go.Choropleth(locations=closed_usa['state_code'].value_counts().index,
                                       z=closed_usa['state_code'].value_counts(),
                                       locationmode="USA-states",
                                       colorscale='reds',
                                       colorbar_title="Nos. of Startups founded",
                                       ))
fig_country_map.update_layout(title_text="Number of Startups Closed by State in USA", title_x=0.5, title_font_size=20,
                              geo = dict(
                                        scope='usa',
                                        projection=go.layout.geo.Projection(type = 'albers usa'),
                                        showlakes=True, # lakes
                                        lakecolor='rgb(255, 255, 255)'),
                                        paper_bgcolor="mintcream")
fig_country_map.show()

In [48]:
acquired_usa = clean_industry_trends[clean_industry_trends['status']=='acquired']
fig_country_map = go.Figure()
fig_country_map.add_trace(go.Choropleth(locations=acquired_usa['state_code'].value_counts().index,
                                       z=acquired_usa['state_code'].value_counts(),
                                       locationmode="USA-states",
                                       colorscale='greens',
                                       colorbar_title="Nos. of Startups founded",
                                       ))
fig_country_map.update_layout(title_text="Number of Startups Acquired by State in USA", title_x=0.5, title_font_size=20,
                              geo = dict(
                                        scope='usa',
                                        projection=go.layout.geo.Projection(type = 'albers usa'),
                                        showlakes=True, # lakes
                                        lakecolor='rgb(255, 255, 255)'),
                                        paper_bgcolor="mintcream")
fig_country_map.show()

In [49]:
fig_funding_amt = px.scatter(clean_industry_trends[:100], x="name", y="funding_rounds", size='funding_total_usd', color='status')

fig_funding_amt.update_layout(
    title='Plot Showing the Funding and Total funding acquired by Startups',
    xaxis_title="Startups",
    yaxis_title="Funding Rounds",
    xaxis_showticklabels=False,
    paper_bgcolor="mintcream",
    title_font_size=20,
    title_x=0.5,
    legend=dict(orientation='h', yanchor='top', y=1.08, xanchor='right', x=1),
    margin=dict(b=100),
    xaxis=dict(title_standoff=45),  # Adjust the distance of x-axis title from the x-axis
    yaxis=dict(title_standoff=20)   # Adjust the distance of y-axis title from the y-axis
)

fig_funding_amt.update_traces(hovertemplate='<br><b>Company</b>: %{x}' + '<br><i>Funding Rounds</i>: %{y}' + '<br><i>Funding(in USD)</i>: %{marker.size}')
fig_funding_amt.show()

In [50]:
fig_status = make_subplots(rows=2, cols=2, specs=[[{"type": "domain", "colspan": 2}, None],[{"type": "domain"}, {"type": "domain"}]],
                          subplot_titles = ("Current status of all Startups", "Status of Startups founded before 2000", 
                                            "Status of Startups founded after 2000"))

fig_status.add_trace(go.Pie(labels=clean_industry_trends['status'].value_counts().index,
                            values=clean_industry_trends['status'].value_counts()), row=1, col=1)

fig_status.add_trace(go.Pie(labels=clean_industry_trends[clean_industry_trends['founded_year']<2000]['status'].value_counts().index,
                            values=clean_industry_trends[clean_industry_trends['founded_year']<2000]['status'].value_counts()), row=2, col=1)

fig_status.add_trace(go.Pie(labels=clean_industry_trends[clean_industry_trends['founded_year']>=2000]['status'].value_counts().index,
                            values=clean_industry_trends[clean_industry_trends['founded_year']>=2000]['status'].value_counts()), row=2, col=2)

fig_status.update_traces(hoverinfo='label+percent', textinfo='value+percent', textfont_size=11,
                         insidetextorientation='horizontal', rotation=-45)
fig_status.update_layout(height=800, paper_bgcolor="mintcream")
fig_status.show()