In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mode
from sklearn.impute import SimpleImputer


In [None]:
# Load the cleaned funding data for each year.
# The directory is written with forward slashes only: the original paths mixed
# in backslashes, which resolve only on Windows and form invalid escape
# sequences ("\C", "\c") inside an ordinary string literal.
CLEAN_DATA_DIR = '../Edinburgh-Indian-Start-up-Project/Clean_Data'

df_2018 = pd.read_csv(f'{CLEAN_DATA_DIR}/clean-startup-2018.csv')
df_2019 = pd.read_csv(f'{CLEAN_DATA_DIR}/clean-startup-2019.csv')
df_2020 = pd.read_csv(f'{CLEAN_DATA_DIR}/clean-startup-2020.csv')
df_2021 = pd.read_csv(f'{CLEAN_DATA_DIR}/clean-startup-2021.csv')

In [None]:
# Concatenate all years into a single dataframe.
# ignore_index=True gives the combined frame one fresh 0..n-1 index; without
# it every yearly frame keeps its own row labels, so labels repeat in df.
df = pd.concat([df_2018, df_2019, df_2020, df_2021], ignore_index=True)

In [None]:
# Preview the first rows of the combined dataframe instead of rendering the whole frame
df.head()

In [None]:
# Convert the Amount column to float and store the result back on df.
# astype returns a new Series, so without the assignment the conversion
# would be computed, displayed and then thrown away.
df["Amount($)"] = df["Amount($)"].astype("float")

# Show the resulting dtype as the cell output to confirm the conversion
df["Amount($)"].dtype

In [None]:
# Summary statistics of the dataframe, shown as the cell output
summary_stats = df.describe()
summary_stats


In [None]:
# Print the dataframe summary produced by DataFrame.info()
df.info()

In [None]:

# Count the missing values in every column
missing_per_column = df.isnull().sum()
print(missing_per_column)

In [None]:
# Fill missing categorical data in the Founders column using an imputer

# SimpleImputer with the 'most_frequent' strategy
imputer = SimpleImputer(strategy='most_frequent')

# Fit on the Founders column and transform it in a single step
df['Founders'] = imputer.fit_transform(df[['Founders']])


In [None]:
# Fill missing categorical data in the Investor column using an imputer

# SimpleImputer with the 'most_frequent' strategy
imputer = SimpleImputer(strategy='most_frequent')

# Fit on the Investor column and transform it in a single step
df['Investor'] = imputer.fit_transform(df[['Investor']])

In [None]:
# Re-count the missing values in every column after the imputation cells
remaining_missing = df.isnull().sum()
print(remaining_missing)

# Univariate analysis
###  It only analyzes one variable, which is the 'Amount($)' column

In [None]:
# Histogram of the funding amounts, split into 50 bins
df = df.reset_index(drop=True)  # replace the existing row labels with 0..n-1
sns.displot(df, x='Amount($)', bins=50)
plt.title('Distribution of Funding Amounts')
plt.show()


 The histogram is divided into 50 bins, which gives an idea of how the funding amounts are distributed across different ranges

 # Univariate analysis 
 ### It analyzes only the distribution of startups across different sectors.

In [None]:
# Number of startups per sector, ordered from most to fewest
sector_frequency = df['Sector'].value_counts()
sector_counts = sector_frequency.sort_values(ascending=False)

# One horizontal bar per sector, drawn in that order
ax = sns.countplot(data=df, y='Sector', order=sector_counts.index)
ax.set_title('Number of Startups by Sector')
plt.show()


This visualization helps to identify which sectors have the highest number of startups and compare their relative popularity.

# Univariate analysis of
### Funding amount by sector

In [None]:
# Total funding per sector, largest first; the index fixes the bar order
sector_totals = df.groupby('Sector')['Amount($)'].sum().sort_values(ascending=False)
sector_order = sector_totals.index

# Plot the summed funding per sector in that order.
# errorbar=None replaces ci=None, which seaborn deprecated in v0.12, and the
# string estimator 'sum' replaces the np.sum callable.
sns.barplot(data=df, y='Sector', x='Amount($)', estimator='sum', errorbar=None, order=sector_order)
plt.title('Funding Amounts by Sector')
plt.show()

The above chart shows that the sector with the highest total funding amount is Automotive, followed by Technology and Logistics.
The Consumables and Education sectors also received significant funding amounts, while the other sectors received comparatively lower funding amounts.

 # Univariate analysis.
 ### It is analyzing the frequency distribution of startups by funding stage.

In [None]:
# The 20 most frequent funding stages, most common first
top_stages = df['Stage'].value_counts().index[:20]

plt.figure(figsize=(8, 6))
ax = sns.countplot(data=df, y='Stage', order=top_stages)
ax.set_title('Number of Startups by Funding Stage (Top 20)')
ax.set_xlabel('Number of Startups')
ax.set_ylabel('Funding Stage')
plt.show()


The chart above shows that the majority of startups in the dataset are in the Seed and Series A funding stages, with a sharp decline in the number of startups in later funding stages such as Series E and beyond.

# Univariate analysis.
 ### showing the total funding amount for each funding stage. The y-axis represents the funding stage and the x-axis represents the total funding amount.

In [None]:
# The 20 funding stages with the largest total funding, largest first
stage_order = df.groupby('Stage')['Amount($)'].sum().sort_values(ascending=False).index[:20]

plt.figure(figsize=(8, 6))
# errorbar=None replaces ci=None, which seaborn deprecated in v0.12, and the
# string estimator 'sum' replaces the np.sum callable.
sns.barplot(data=df, y='Stage', x='Amount($)', estimator='sum', errorbar=None, order=stage_order)
plt.title('Funding Amounts by Funding Stage')
plt.xlabel('Amount ($)')
plt.ylabel('Funding Stage')
plt.show()




The findings from the above chart suggest that Seed Funding is the most common funding stage for Indian startups, followed by Series A and Series B. The total funding amount gradually increases as the funding stage advances from Seed to Series F

 # Univariate  analysis
 ### It analyzes the number of startups in each location without considering any other variable.


In [None]:
# The 20 headquarter locations with the most startups, most common first
top_locations = df['HeadQuarter'].value_counts().iloc[:20].index

plt.figure(figsize=(8, 10))
ax = sns.countplot(data=df, y='HeadQuarter', order=top_locations)
ax.set_title('Number of Startups by Location (Top 20)')
ax.set_xlabel('Number of Startups')
ax.set_ylabel('Location')
plt.show()



Bangalore is the top location for startups with more than 3000 startups in the dataset.
Mumbai and Delhi are the next two most popular locations with more than 1500 startups each.
Other popular locations for startups include Hyderabad, Chennai, Pune, and Gurgaon.

# Univariate analysis of
### funding by top 20 headquarter  locations

In [None]:
# NOTE(review): despite its name, funding_amount holds the HeadQuarter column
funding_amount = df.loc[:, 'HeadQuarter']
print(funding_amount)


In [None]:
# Distinct headquarter locations present in the data.
# NOTE(review): the variable is called distinct_stages but holds HeadQuarter values.
distinct_stages = pd.unique(df['HeadQuarter'])
print(distinct_stages)


In [None]:
# The 20 headquarter locations with the largest total funding, largest first
location_totals = df.groupby('HeadQuarter')['Amount($)'].sum().sort_values(ascending=False)
top20_locations = location_totals.head(20).index.tolist()

# errorbar=None replaces ci=None, which seaborn deprecated in v0.12, and the
# string estimator 'sum' replaces the np.sum callable.
sns.barplot(data=df, y='HeadQuarter', x='Amount($)', estimator='sum', errorbar=None, order=top20_locations)
plt.title('Funding Amounts by Location (Top 20)')
plt.show()




There is a significant difference between the total funding received by the top location and the other locations in the above chart, indicating that certain locations may receive more funding than others. The bar chart provides a quick and easy way to visually compare the total funding received by the top 20 headquarter locations.

In [None]:
# Distinct funding stages present in the data
distinct_stages = pd.unique(df['Stage'])
print(distinct_stages)

# Univariate analysis of
### Funding amount by funding stage
 

In [None]:
# The 20 funding stages with the largest total funding, largest first
stage_order = df.groupby('Stage')['Amount($)'].sum().sort_values(ascending=False).index[:20]

plt.figure(figsize=(8, 6))
# errorbar=None replaces ci=None, which seaborn deprecated in v0.12, and the
# string estimator 'sum' replaces the np.sum callable.
sns.barplot(data=df, y='Stage', x='Amount($)', estimator='sum', errorbar=None, order=stage_order)
plt.title('Funding Amounts by Funding Stage')
plt.xlabel('Amount ($)')
plt.ylabel('Funding Stage')
plt.show()

In the above chart the funding stage that received the most funding is "Seed". The funding amounts decrease as the stage of development advances, with "Series A" receiving the second highest amount of funding, followed by "Series B". The funding amounts for each stage after "Series B" continue to decrease

# Bivariate Analysis 
### It plots the correlation between two variables - Stage of Development and Funding Amount.

In [None]:
# Create scatter plot of funding amount against funding stage.
# The values must come from df: plt.scatter('Stage', 'Amount($)') with no data
# source plots the two literal strings instead of the dataframe columns.
# Rows missing either value are dropped so that only labelled stages with a
# known amount reach the categorical x-axis.
stage_amount = df[['Stage', 'Amount($)']].dropna()
plt.scatter(stage_amount['Stage'].astype(str), stage_amount['Amount($)'])

# Stage names are text labels; rotate them so they do not overlap
plt.xticks(rotation=90)

# Set axis labels and title
plt.xlabel('Stage of Development')
plt.ylabel('Funding Amount (USD)')
plt.title('Correlation Between Stage of Development and Funding Amount')

# Show the plot
plt.show()


 # Bivariate Analysis
### it is exploring the relationship between two variables: Stage of Development and Funding Amount, while also considering the third variable of Year through color-coding.

In [None]:
# Keep only the rows whose stage is among the 10 stages with the largest total funding
stage_totals = df.groupby('Stage')['Amount($)'].sum()
top10_stages = stage_totals.sort_values(ascending=False).head(10).index.tolist()
in_top10 = df['Stage'].isin(top10_stages)
df_filtered = df[in_top10]

# Scatter of amount against stage, with points coloured by Year
plt.figure(figsize=(10,8))
ax = sns.scatterplot(data=df_filtered, x='Stage', y='Amount($)', hue='Year')
ax.set_title('Correlation Between Stage of Development and Funding Amount for Indian Startups')
ax.set_xlabel('Stage of Development')
ax.set_ylabel('Funding Amount')
plt.show()

The scatter plot above shows that the funding amount generally increases as the stage of development progresses, with later stages such as Growth and Expansion receiving the highest amount of funding. Additionally, there appears to be an increasing trend in funding amount over the years, with higher amounts being received in more recent years.

# Univariate Analysis of
### Total funding by sector

In [None]:

# group the data by sector and calculate the sum of funding
funding_by_sector = df.groupby('Sector')['Amount($)'].sum().reset_index()

# sort the data by total funding in descending order
funding_by_sector = funding_by_sector.sort_values(by='Amount($)', ascending=False)

# The y-axis is labelled in millions, so the totals are scaled to match; the
# original plotted the unscaled sums under that label.
# NOTE(review): assumes Amount($) holds plain US dollars — confirm against the cleaned data.
total_funding_millions = funding_by_sector['Amount($)'] / 1e6

# plot the bar chart
plt.figure(figsize=(12, 6))
plt.bar(funding_by_sector['Sector'], total_funding_millions)
plt.xticks(rotation=45, ha='right')  # anchor rotated labels under their bars
plt.xlabel('Sector')
plt.ylabel('Total Funding (in millions of $)')
plt.title('Total Funding by Sector')
plt.show()




In the above plot we can see that the Automotive sector receives the highest funding, whereas the Marketing sector receives the least funding.

# Univariate analysis of
### Pattern in the locations of the top 10 companies' headquarters and the amount of funding they receive

In [None]:

# group the data by headquarter location and calculate the mean funding
funding_by_location = df.groupby('HeadQuarter')['Amount($)'].mean().reset_index()

# sort the data by mean funding in descending order
funding_by_location = funding_by_location.sort_values(by='Amount($)', ascending=False)
# select the top 10 locations
top_10_locations = funding_by_location[:10]

# The y-axis is labelled in millions, so the means are scaled to match; the
# original plotted the unscaled means under that label.
# NOTE(review): assumes Amount($) holds plain US dollars — confirm against the cleaned data.
mean_funding_millions = top_10_locations['Amount($)'] / 1e6

# plot the bar chart
plt.figure(figsize=(12, 6))
plt.bar(top_10_locations['HeadQuarter'], mean_funding_millions)
plt.xticks(rotation=90)
plt.xlabel('Headquarter Location')
plt.ylabel('Mean Funding (in millions of $)')
plt.title('patterns in the location of top 10 companys headquarters and the amount of funding they receive')
plt.show()


From the above chart of mean funding by headquarter location, we can see that some locations such as Faridabad, Shangai,China, Kalpakkam, and Beijing have received significantly higher amounts of funding compared to other locations. This could indicate a pattern where companies based in major business centers or startup hubs tend to receive more funding compared to those based in other locations.

# Multivariate analysis
### To visualize the relationship between founding year, location (headquarter), and sector

In [None]:

# Pivot table of summed funding: one row per (Year, HeadQuarter) pair, one column per Sector
pivot = df.pivot_table(values='Amount($)', index=['Year', 'HeadQuarter'], columns='Sector', aggfunc='sum')

# Create the heatmap.
# 'Blues' is the name of matplotlib's sequential blue colormap; the original
# 'Blue' is not a registered colormap name, so the heatmap call failed.
plt.figure(figsize=(12,8))
sns.heatmap(pivot, cmap='Blues')
plt.title('Relationship between Founding Year, Location, and Sector')
plt.xlabel('Sector')
plt.ylabel('Founding Year - Location')
plt.show()


 On the above heatmap the color scale indicates the magnitude of the funding amount, with darker shades representing higher values.

Some possible findings from this heatmap are:

    The highest funding amounts are concentrated in a few sectors, such as Technology, Healthcare, and E-commerce, and are mainly located in a few cities like Bangalore, Mumbai, and Delhi.
    Funding amounts are generally increasing over time, with the most recent years (2020-2021) showing the highest levels of funding across most sectors and locations.
    There are some notable regional variations in funding amounts, with certain cities or regions showing higher levels of funding in specific sectors. For example, Delhi and Gurgaon have higher funding amounts in the Real Estate sector, while Bangalore and Mumbai have higher funding amounts in the Technology sector.

# Multivariate analysis 
### To visualize the correlation between funding stage, funding amount, and sector using a scatter plot:

In [None]:
# Funding amount per stage with points coloured by sector; alpha=0.5 makes the points half transparent
plt.figure(figsize=(10,8))
ax = sns.scatterplot(data=df, x='Stage', y='Amount($)', hue='Sector', alpha=0.5)
ax.set_title('Correlation between Stage, Funding Amount, and Sector')
ax.set_xlabel('Funding Stage')
ax.set_ylabel('Amount($)')
plt.show()


The scatter plot shows funding stage on the x-axis and funding amount on the y-axis, with different sectors represented by different colors. The alpha parameter is set to 0.5 to make the plot points partially transparent.