In [None]:
#import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from wordcloud import WordCloud

import warnings
warnings.filterwarnings("ignore")
 
# Load the dataset.
# A relative path and latin-1 encoding are used so this cell reads the same
# file, the same way, as the other cells of this notebook; an absolute
# machine-specific path breaks a fresh "Restart & Run All" elsewhere.
df = pd.read_csv("Startups.csv", encoding="latin-1")

# Preview the first rows. Printed explicitly: a bare expression in the middle
# of a cell is evaluated but never displayed.
print(df.head())

# Check column names
print(df.columns.tolist())

# Rename for ease.
# `rename` silently skips keys that are not present, so both spellings of the
# valuation header are listed: whichever one the file uses ends up as
# 'Valuation', instead of the cleaning step below failing with a KeyError.
df = df.rename(columns={
    'Date Joined': 'Date',
    'Valuation($B)': 'Valuation',
    'Valuation ($B)': 'Valuation',
    'Select Investors': 'Investors'
})

# Convert date
df['Date'] = pd.to_datetime(df['Date'])

# Clean valuation: remove '$', ',' and 'B' characters, then cast to float.
# Raw string so that '\$' reaches the regex engine instead of being parsed as
# an (invalid) Python string escape.
df['Valuation'] = df['Valuation'].replace(r'[\$,B]', '', regex=True).astype(float)

# Quick structural summary of the cleaned frame.
print("Rows & Columns:", df.shape)
print("\nMissing Values:\n", df.isnull().sum())
print("\nTop Countries:\n", df['Country'].value_counts().head())
print("\nTop Industries:\n", df['Industry'].value_counts().head())


# Horizontal bar chart of the ten countries with the most rows in the dataset.
top_countries = df['Country'].value_counts().iloc[:10]

sns.set(style="whitegrid")
fig_countries, ax_countries = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=top_countries.values,
    y=top_countries.index,
    ax=ax_countries,
)
ax_countries.set_title("Top 10 Countries by Unicorn Count")
ax_countries.set_xlabel("Number of Unicorns")

# Derive the joining year from the parsed 'Date' column and count rows per
# year, ordered chronologically.
df['Year'] = df['Date'].dt.year
yearly = df['Year'].value_counts().sort_index()

# Line chart of the yearly counts on its own 12x5 inch figure.
fig_yearly, ax_yearly = plt.subplots(figsize=(12, 5))
sns.lineplot(x=yearly.index, y=yearly.values, marker='o', ax=ax_yearly)
ax_yearly.set_title("Unicorn Creation Over Years")
ax_yearly.set_ylabel("Number of Startups")

# Interactive bubble chart: one marker per row, positioned by year and
# valuation, sized by valuation, coloured by country, labelled with the
# company name on hover.
scatter_options = dict(
    x="Year",
    y="Valuation",
    size="Valuation",
    color="Country",
    hover_name="Company",
    title="Unicorn Valuations by Year and Country",
)
fig = px.scatter(df, **scatter_options)
fig.show()



# --- Valuation model ---------------------------------------------------------
# Fits a random forest that predicts 'Valuation' from label-encoded country and
# industry, the number of listed investors, and the joining year, then reports
# the mean squared error on a 20% hold-out split.
# LabelEncoder, train_test_split, RandomForestRegressor and mean_squared_error
# are scikit-learn names and must be imported at the top of the notebook.

# Fixed seed so the split, the fitted forest and the printed MSE are
# reproducible across kernel restarts.
RANDOM_STATE = 42


def _count_investors(raw):
    """Return how many investor names a single 'Investors' cell contains.

    Args:
        raw: the cell value. A comma-separated string is split and its
            non-blank entries counted; a list/tuple is counted as-is; anything
            else (e.g. NaN for a missing value) counts as zero.

    Returns:
        int: number of investors in the cell.
    """
    if isinstance(raw, str):
        return sum(1 for name in raw.split(',') if name.strip())
    if isinstance(raw, (list, tuple)):
        return len(raw)
    return 0


# `len` applied directly to the raw string would count characters, not
# investors, and would raise on missing values; count the names instead.
df['Num_Investors'] = df['Investors'].apply(_count_investors)

# Encode the categorical columns as integer labels (one encoder per column so
# each mapping can be inverted later).
le_country = LabelEncoder()
df['Country_encoded'] = le_country.fit_transform(df['Country'])
le_industry = LabelEncoder()
df['Industry_encoded'] = le_industry.fit_transform(df['Industry'])

features = df[['Country_encoded', 'Industry_encoded', 'Num_Investors', 'Year']]
target = df['Valuation']

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=RANDOM_STATE
)

model = RandomForestRegressor(random_state=RANDOM_STATE)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Valuation Prediction MSE:", mean_squared_error(y_test, y_pred))

# Unicorns joined per year.
# In this cell the joining year is stored in df['Year'] (derived from 'Date');
# no 'Year Joined' column is created here, so that name cannot be used.
yearly_trend = df['Year'].value_counts().sort_index()

# Open a fresh figure so the line is not drawn onto (and the labels below do
# not overwrite) whichever matplotlib axes an earlier plot left current.
plt.figure(figsize=(12, 5))
sns.lineplot(x=yearly_trend.index, y=yearly_trend.values, marker='o')
plt.title('Unicorns Joined per Year')
plt.xlabel('Year')
plt.ylabel('Number of Unicorns')
plt.tight_layout()
plt.show()

# Word cloud of investor names, sized by how often each investor appears.
# WordCloud comes from the `wordcloud` package and must be imported at the top
# of the notebook.


def _split_investors(raw):
    """Return the list of investor names held in one 'Investors' cell.

    Args:
        raw: the cell value. A comma-separated string is split into stripped,
            non-blank names; a list/tuple is returned as a list; anything else
            (e.g. NaN for a missing value) yields an empty list.

    Returns:
        list: investor names for the cell.
    """
    if isinstance(raw, str):
        return [name.strip() for name in raw.split(',') if name.strip()]
    if isinstance(raw, (list, tuple)):
        return list(raw)
    return []


# Iterating a raw string would yield single characters, so every cell is first
# normalised to a list of names before flattening.
all_investors = [
    investor
    for cell in df['Investors']
    for investor in _split_investors(cell)
]
investor_freq = pd.Series(all_investors).value_counts()

# generate_from_frequencies takes a name -> count mapping.
wordcloud = WordCloud(width=1000, height=600, background_color='white').generate_from_frequencies(investor_freq.to_dict())
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Investors')
plt.tight_layout()
plt.show()






In [None]:
# Required Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the CSV (latin-1 encoded) into the working frame.
df = pd.read_csv("Startups.csv", encoding='latin-1')

# Remove the 'Unnamed: 0' column when the file carries one; absent is fine.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Build a numeric 'Valuation' column from the text column (the '$' sign is
# stripped before the float cast), then discard the text column.
valuation_text = df['Valuation ($B)'].str.replace('$', '', regex=False)
df = df.assign(Valuation=valuation_text.astype(float)).drop(columns=['Valuation ($B)'])

# Parse the joining date (unparseable values become NaT) and keep its year in
# a separate column.
joined_on = pd.to_datetime(df['Date Joined'], errors='coerce')
df['Date Joined'] = joined_on
df['Year Joined'] = joined_on.dt.year

# White-grid theme and a 10x6 inch canvas for the chart.
sns.set(style="whitegrid")
fig_joined, ax_joined = plt.subplots(figsize=(10, 6))

# Number of rows per joining year, in chronological order, drawn as a line
# with a marker at every year.
yearly_counts = df['Year Joined'].value_counts().sort_index()
sns.lineplot(
    x=yearly_counts.index,
    y=yearly_counts.values,
    marker='o',
    color='blue',
    ax=ax_joined,
)

# Title, axis labels and slanted year ticks.
ax_joined.set_title('📈 Unicorn Startups Joined Each Year', fontsize=14)
ax_joined.set_xlabel('Year')
ax_joined.set_ylabel('Number of Startups')
plt.xticks(rotation=45)
plt.tight_layout()

# Render the figure.
plt.show()


In [None]:
# Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud

# Load Data
df = pd.read_csv('Startups.csv', encoding='latin-1')

# Clean Data
# errors='ignore' keeps the cell runnable when the file has no 'Unnamed: 0'
# column (same handling as the earlier loading cell in this notebook).
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Numeric valuation: strip the '$' sign from the text column and cast to float,
# then drop the original text column.
df['Valuation'] = df['Valuation ($B)'].str.replace('$', '', regex=False).astype(float)
df = df.drop(columns=['Valuation ($B)'])

# Unparseable dates become NaT instead of raising.
df['Date Joined'] = pd.to_datetime(df['Date Joined'], errors='coerce')

# Fill missing text fields on the frame itself. Calling
# `df[col].fillna(..., inplace=True)` acts on a temporary column object; pandas
# deprecates that pattern and, with Copy-on-Write, it leaves `df` unchanged.
df = df.fillna({'City': 'Unknown', 'Select Investors': 'Unknown'})

# Split the comma-separated investor string into a list of stripped names;
# non-string values map to an empty list.
df['Investors'] = df['Select Investors'].apply(lambda x: [i.strip() for i in x.split(',')] if isinstance(x, str) else [])
df['Year Joined'] = df['Date Joined'].dt.year

# Global look for the static charts below: white-grid theme and a 10x6 inch
# default figure size.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

# 1. Top Countries by Unicorn Count
# Ten most frequent countries as a horizontal bar chart.
top_countries = df['Country'].value_counts().iloc[:10]
fig_top_countries, ax_top_countries = plt.subplots()
sns.barplot(
    x=top_countries.values,
    y=top_countries.index,
    palette='viridis',
    ax=ax_top_countries,
)
ax_top_countries.set_title('Top Countries by Unicorn Count')
ax_top_countries.set_xlabel('Number of Unicorns')
ax_top_countries.set_ylabel('Country')
plt.tight_layout()
plt.show()

# 2. Valuation Distribution
# 30-bin histogram of the numeric valuation column with a KDE curve overlaid.
fig_valuation, ax_valuation = plt.subplots()
sns.histplot(
    df['Valuation'],
    bins=30,
    kde=True,
    color='orange',
    ax=ax_valuation,
)
ax_valuation.set_title('Distribution of Unicorn Valuations (in $B)')
ax_valuation.set_xlabel('Valuation ($B)')
plt.tight_layout()
plt.show()

# 3. Industry-wise Unicorn Count
# Ten most frequent industries as a horizontal bar chart.
industry_count = df['Industry'].value_counts().iloc[:10]
fig_industry, ax_industry = plt.subplots()
sns.barplot(
    x=industry_count.values,
    y=industry_count.index,
    palette='cubehelix',
    ax=ax_industry,
)
ax_industry.set_title('Top 10 Industries by Unicorn Count')
ax_industry.set_xlabel('Number of Unicorns')
ax_industry.set_ylabel('Industry')
plt.tight_layout()
plt.show()

# 4. Trend of Unicorns by Year
# Rows per joining year in chronological order, drawn as a marked line.
yearly_trend = df['Year Joined'].value_counts().sort_index()
fig_trend, ax_trend = plt.subplots()
sns.lineplot(
    x=yearly_trend.index,
    y=yearly_trend.values,
    marker='o',
    ax=ax_trend,
)
ax_trend.set_title('Unicorns Joined per Year')
ax_trend.set_xlabel('Year')
ax_trend.set_ylabel('Number of Unicorns')
plt.tight_layout()
plt.show()

# 5. Word Cloud of Investors
# Flatten the per-row investor lists into one list of names, count how often
# each name occurs, and size the words by that count.
all_investors = []
for investor_list in df['Investors']:
    all_investors.extend(investor_list)
investor_freq = pd.Series(all_investors).value_counts()

cloud_builder = WordCloud(width=1000, height=600, background_color='white')
wordcloud = cloud_builder.generate_from_frequencies(investor_freq)

# Show the rendered cloud as an image without axes.
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Investors')
plt.tight_layout()
plt.show()

# 6. Choropleth Map of Unicorns by Country
# Two-column table (country name, number of rows) feeding a world map that is
# coloured by the count and matched on country names.
country_counts = df['Country'].value_counts()
map_data = pd.DataFrame({
    'Country': country_counts.index,
    'Count': country_counts.to_numpy(),
})

choropleth_options = dict(
    locations="Country",
    locationmode="country names",
    color="Count",
    hover_name="Country",
    color_continuous_scale=px.colors.sequential.Plasma,
    title="Global Distribution of Unicorn Startups",
)
fig_map = px.choropleth(map_data, **choropleth_options)
fig_map.show()
