In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the dataset into `df`. The path is relative, so it is resolved against
# the current working directory of the kernel.
df = pd.read_csv('Unicorn_Companies.csv')

In [3]:
# Lift pandas' display truncation: None means "no limit" for both settings,
# so every column and every row of a displayed frame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [42]:
# Preview only the first rows. display.max_rows is set to None earlier in this
# notebook, so a bare `df` would render every row of the frame into the output.
df.head(10)

Unnamed: 0,Company,Valuation in billions (B) of dollars,Date Joined,Industry,City,Country,Continent,Year Founded,Funding in dollars,Select Investors
0,Bytedance,180,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,8000000000,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,100,2012-12-01,Other,Hawthorne,United States,North America,2002,7000000000,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,100,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,2000000000,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,95,2014-01-23,Fintech,San Francisco,United States,North America,2010,2000000000,"Khosla Ventures, LowercaseCapital, capitalG"
4,Klarna,46,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,4000000000,"Institutional Venture Partners, Sequoia Capita..."
5,Canva,40,2018-01-08,Internet software & services,Surry Hills,Australia,Oceania,2012,572000000,"Sequoia Capital China, Blackbird Ventures, Mat..."
6,Checkout.com,40,2019-05-02,Fintech,London,United Kingdom,Europe,2012,2000000000,"Tiger Global Management, Insight Partners, DST..."
7,Instacart,39,2014-12-30,"Supply chain, logistics, & delivery",San Francisco,United States,North America,2012,3000000000,"Khosla Ventures, Kleiner Perkins Caufield & By..."
8,JUUL Labs,38,2017-12-20,Consumer & retail,San Francisco,United States,North America,2015,14000000000,Tiger Global Management
9,Databricks,38,2019-02-05,Data management & analytics,San Francisco,United States,North America,2013,3000000000,"Andreessen Horowitz, New Enterprise Associates..."


# Data Cleaning

In [12]:
# Cell-level flags: True where a value is NaN or the literal string 'Unknown'.
mask = df.isna() | df.eq('Unknown')
# Despite its name this is a boolean Series (one entry per row), True when the
# row has at least one flagged cell -- it does not hold counts.
null_counts = mask.any(axis='columns')
# Show only the rows that contain a missing / 'Unknown' value.
df.loc[null_counts]

Unnamed: 0,Company,Valuation in billions (B) of dollars,Date Joined,Industry,City,Country,Continent,Year Founded,Funding,Select Investors
12,FTX,32,2021-07-20,Fintech,,Bahamas,North America,2018,$2B,"Sequoia Capital, Thoma Bravo, Softbank"
170,HyalRoute,4,2020-05-26,Mobile & telecommunications,,Singapore,Asia,2015,$263M,Kuang-Chi
242,Moglix,3,2021-05-17,E-commerce & direct-to-consumer,,Singapore,Asia,2015,$471M,"Jungle Ventures, Accel, Venture Highway"
251,Trax,3,2019-07-22,Artificial intelligence,,Singapore,Asia,2010,$1B,"Hopu Investment Management, Boyu Capital, DC T..."
325,Amber Group,3,2021-06-21,Fintech,,Hong Kong,Asia,2015,$328M,"Tiger Global Management, Tiger Brokers, DCM Ve..."
382,Ninja Van,2,2021-09-27,"Supply chain, logistics, & delivery",,Singapore,Asia,2014,$975M,"B Capital Group, Monk's Hill Ventures, Dynamic..."
541,Advance Intelligence Group,2,2021-09-23,Artificial intelligence,,Singapore,Asia,2016,$536M,"Vision Plus Capital, GSR Ventures, ZhenFund"
629,LinkSure Network,1,2015-01-01,Mobile & telecommunications,Shanghai,China,Asia,2013,$52M,
811,Carousell,1,2021-09-15,E-commerce & direct-to-consumer,,Singapore,Asia,2012,$288M,"500 Global, Rakuten Ventures, Golden Gate Vent..."
848,Matrixport,1,2021-06-01,Fintech,,Singapore,Asia,2019,$100M,"Dragonfly Captial, Qiming Venture Partners, DS..."


# 1. Cleaning Valuation column

In [6]:
# Give the valuation column a name that states its unit. Reassign the result
# rather than using inplace=True, so the cell produces a new frame instead of
# mutating the existing object.
df = df.rename(columns={'Valuation': 'Valuation in billions (B) of dollars'})

In [7]:
# Remove every '$' and 'B' character from the valuation strings in one pass.
# Inside a character class '$' is a literal, so this matches exactly the two
# characters the values are decorated with.
df['Valuation in billions (B) of dollars'] = (
    df['Valuation in billions (B) of dollars']
    .str.replace(r'[$B]', '', regex=True)
)


In [8]:
# Inspect the column's dtype before casting it to an integer type.
df.dtypes['Valuation in billions (B) of dollars']

dtype('O')

In [9]:
# Convert the cleaned valuation strings to 64-bit integers.
df['Valuation in billions (B) of dollars'] = (
    df['Valuation in billions (B) of dollars'].astype(np.int64)
)


# 2. Cleaning Industry column

In [10]:
# Merge the two capitalisation variants of the same industry label into one,
# so they are not treated as separate categories.
df['Industry'] = df['Industry'].replace(
    {'Artificial Intelligence': 'Artificial intelligence'}
)

# 3. Cleaning Funding column

In [11]:
# Turn the 'Unknown' placeholder in Funding into NaN, then discard every row
# whose Funding is missing.
df = (
    df.assign(Funding=df['Funding'].replace({'Unknown': np.nan}))
    .dropna(subset=['Funding'])
)

In [20]:
# Replace the current index with a fresh 0..n-1 range; drop=True discards the
# old index instead of inserting it as a column.
df = df.reset_index(drop=True)

In [22]:
# Keep the last character of each Funding string in its own column; the
# conversion function below treats 'B' and 'M' as the recognised unit suffixes.
df['Funding_unit'] = df['Funding'].str.get(-1)

In [23]:
# Remove every '$', 'B' and 'M' character from the Funding strings in one
# pass ('$' is a literal inside a character class).
df['Funding'] = df['Funding'].str.replace(r'[$BM]', '', regex=True)

In [24]:
# Inspect the Funding column's dtype before casting it to an integer type.
df.dtypes['Funding']

dtype('O')

In [26]:
# Convert the cleaned Funding strings to 64-bit integers.
df['Funding'] = df['Funding'].astype(np.int64)

In [43]:
def clean_funding(x):
    """Convert one row's funding amount to whole dollars.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide ``x['Funding_unit']`` ('B' for billions, 'M' for
        millions) and ``x['Funding']`` (a number or numeric string giving the
        amount expressed in that unit).

    Returns
    -------
    int or None
        The amount in dollars, or ``None`` when the unit is neither 'B' nor
        'M'. In that case ``x['Funding']`` is not read at all.

    Raises
    ------
    ValueError
        If the unit is recognised but ``x['Funding']`` cannot be parsed as a
        number.
    """
    dollars_per_unit = {'B': 1000000000, 'M': 1000000}

    multiplier = dollars_per_unit.get(x['Funding_unit'])
    if multiplier is None:
        return None

    # round() rather than int(): the float product can land a hair below the
    # true integer, and int() would truncate that to one dollar less.
    return round(float(x['Funding']) * multiplier)

SyntaxError: invalid syntax (559941700.py, line 7)

In [29]:
# Call clean_funding once per row (axis='columns' passes each row to the
# function) and overwrite Funding with the returned values.
df['Funding'] = df.apply(clean_funding, axis='columns')


In [41]:
# Remove the Funding_unit helper column from the frame.
df = df.drop(columns='Funding_unit')

In [31]:
# Rename Funding so the column name states its unit. Reassign the result
# rather than using inplace=True, so the cell produces a new frame instead of
# mutating the existing object.
# NOTE(review): the new name contains two consecutive spaces ("in  dollars").
# It is kept unchanged because other cells may reference this exact string;
# confirm whether the double space is intentional.
df = df.rename(columns={'Funding': 'Funding in  dollars'})

# Industry Analysis:
1. Which industries have the highest Total and Average company valuation?
2. Which industries have the most companies that have reached a $1 billion valuation?
3. Which industries have the most investment funding?






# 1. Which industries have the highest Total and Average company valuation?

In [None]:
# Total valuation per industry, ordered from largest to smallest.
industry_valuations_sum = (
    df.groupby('Industry')['Valuation in billions (B) of dollars']
    .sum()
    .sort_values(ascending=False)
)
industry_valuations_sum

In [None]:
# Bar chart of total valuation per industry, using explicit figure/axes
# handles instead of pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(12, 6))

sns.barplot(
    x=industry_valuations_sum.index,
    y=industry_valuations_sum.values,
    color='b',
    ax=ax,
)

# Annotate each bar with its value, placed 1 y-axis unit above the bar top.
for position, total in enumerate(industry_valuations_sum.values):
    ax.text(position, total + 1, "{:.1f}".format(total), ha='center', fontsize=10)

ax.set_title('Company Valuation by Industry: Sum of Valuations', fontsize=16)
ax.set_xlabel('Industry', fontsize=12)
ax.set_ylabel('Valuation (billions of dollars)', fontsize=12)

# Vertical tick labels so the industry names do not overlap.
ax.tick_params(axis='x', labelrotation=90)

# bbox_inches='tight' grows the saved canvas to fit the rotated tick labels;
# without it they can be cut off at the bottom edge of the PNG.
fig.savefig('industry_valuations_sum.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Average valuation per industry, ordered from largest to smallest.
industry_valuations_mean = (
    df.groupby('Industry')['Valuation in billions (B) of dollars']
    .mean()
    .sort_values(ascending=False)
)
industry_valuations_mean

In [None]:
# Bar chart of average valuation per industry, using explicit figure/axes
# handles instead of pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(12, 6))

sns.barplot(
    x=industry_valuations_mean.index,
    y=industry_valuations_mean.values,
    color='b',
    ax=ax,
)

# Annotate each bar with its value, placed 0.1 y-axis units above the bar top.
for position, average in enumerate(industry_valuations_mean.values):
    ax.text(position, average + 0.1, "{:.1f}".format(average), ha='center', fontsize=10)

ax.set_title('Average Industry Valuations', fontsize=16)
ax.set_xlabel('Industry', fontsize=12)
ax.set_ylabel('Valuation (billions of dollars)', fontsize=12)

# Vertical tick labels so the industry names do not overlap.
ax.tick_params(axis='x', labelrotation=90)

# bbox_inches='tight' grows the saved canvas to fit the rotated tick labels;
# without it they can be cut off at the bottom edge of the PNG.
fig.savefig('industry_valuations_mean.png', dpi=300, bbox_inches='tight')

plt.show()