# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import squarify
from sklearn.linear_model import LinearRegression
import numpy as np

# Load the startup dataset and derive the columns used by the charts below.
df = pd.read_csv("startups.csv", encoding='latin-1')
# Remove the unnamed leading column when present; errors='ignore' makes this
# a no-op for files that do not contain it.
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Strip the literal '$' from the valuation strings and convert them to floats.
df['Valuation'] = df['Valuation ($B)'].str.replace('$', '', regex=False).astype(float)

# Dates that cannot be parsed become NaT (errors='coerce'), so 'Year Joined'
# is NaN for those rows.
df['Date Joined'] = pd.to_datetime(df['Date Joined'], errors='coerce')
df['Year Joined'] = df['Date Joined'].dt.year

# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form is deprecated and, with pandas
# Copy-on-Write, would leave `df` unchanged.
df['City'] = df['City'].fillna('Unknown')
df['Select Investors'] = df['Select Investors'].fillna('Unknown')

# Split each comma-separated investor string into a list of trimmed names.
df['Investors'] = df['Select Investors'].apply(
    lambda names: [name.strip() for name in names.split(',')]
)

# Line chart: number of startups per 'Year Joined', in chronological order.
sns.set(style="whitegrid")
joined_per_year = df['Year Joined'].value_counts().sort_index()
_, year_ax = plt.subplots(figsize=(10, 6))
joined_per_year.plot(kind='line', marker='o', ax=year_ax)
year_ax.set_title("Unicorns Joined per Year")
year_ax.set_xlabel("Year")
year_ax.set_ylabel("Number of Startups")
plt.show()

# Horizontal bar chart of the ten countries with the most startups.
top_countries = df['Country'].value_counts().head(10)
country_ax = top_countries.plot.barh(color='skyblue')
country_ax.set_title("Top 10 Countries by Unicorn Count")
country_ax.set_xlabel("Number of Startups")
# Flip the y-axis so the first (largest) entry is drawn at the top.
country_ax.invert_yaxis()
plt.show()

# Histogram (30 bins) of valuations with a kernel-density overlay.
valuations = df['Valuation']
valuation_ax = sns.histplot(valuations, bins=30, kde=True, color='orange')
valuation_ax.set_title("Distribution of Valuations ($B)")
plt.show()

# Word cloud sized by how many startups each investor appears on.
#
# Frequencies are computed per investor name rather than by joining all names
# into one text blob: WordCloud.generate() would re-tokenise that blob into
# single words, splitting multi-word investor names and merging words from
# neighbouring investors.
investor_counts = df['Investors'].explode().value_counts()
# 'Unknown' is the placeholder used for missing investor data and '' comes
# from stray commas; neither is a real investor.
investor_counts = investor_counts.drop(labels=['Unknown', ''], errors='ignore')
wordcloud = WordCloud(
    width=1000, height=600, background_color='white'
).generate_from_frequencies(investor_counts.to_dict())
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Investor Word Cloud")
plt.show()

# Treemap in which each rectangle's area is the number of startups in an industry.
industry_counts = df['Industry'].value_counts()
plt.figure(figsize=(12, 8))
treemap_ax = squarify.plot(
    sizes=industry_counts.values,
    label=industry_counts.index,
    alpha=0.8,
)
treemap_ax.set_title("Treemap of Industries")
treemap_ax.axis('off')
plt.show()

# World map shaded by the number of startups per country.
per_country = df['Country'].value_counts()
# Two-column frame: country name and its startup count.
country_counts = pd.DataFrame({
    'Country': per_country.index,
    'Count': per_country.values,
})
fig = px.choropleth(
    data_frame=country_counts,
    locations='Country',
    locationmode='country names',  # match rows to map regions by country name
    color='Count',
    hover_name='Country',
    color_continuous_scale='Blues',
)
fig.update_layout(title="Global Distribution of Unicorn Startups")
fig.show()

# Fit a straight line to the yearly startup counts and extend it four years
# past the last observed year.
yearly_counts = df['Year Joined'].value_counts().sort_index()

# 'Year Joined' is a float column whenever any date failed to parse (NaT ->
# NaN). Cast the years to int so range() below accepts them; value_counts()
# has already dropped the NaN entries.
X = yearly_counts.index.values.astype(int).reshape(-1, 1)
y = yearly_counts.values.reshape(-1, 1)

model = LinearRegression().fit(X, y)

first_year = int(X.min())
last_year = int(X.max())
# The stop value is exclusive, so this covers first_year .. last_year + 4.
future_years = np.array(range(first_year, last_year + 5)).reshape(-1, 1)
predicted = model.predict(future_years)

plt.plot(X, y, marker='o', label='Actual')
plt.plot(future_years, predicted, linestyle='--', color='red', label='Forecast')
plt.title("Forecast of Unicorn Growth")
plt.xlabel("Year")
plt.ylabel("Number of Startups")
plt.legend()
plt.show()

