# Import all the Packages you will need 

In [121]:
import pyodbc
from dotenv import dotenv_values
import pandas as pd 
import numpy as np
from scipy import stats
import statistics as stat
import warnings
import os

# Ignore all warnings from this point on: no category, message or module
# filter is passed, so the 'ignore' action applies to every warning.
warnings.filterwarnings('ignore')

# Load the environment and get the variables from your .env file

In [122]:
# Parse the local '.env' file into a key/value mapping
environment_variables = dotenv_values('.env')

# Extract the four connection credentials in one pass; a key that is not
# present in the mapping yields None.
server, database, username, password = (
    environment_variables.get(key)
    for key in ("SERVER", "DATABASE", "USERNAME", "PASSWORD")
)

In [123]:
# Assemble the ODBC connection string from the credentials read above.
# The doubled braces in the f-string emit a literal "{SQL Server}" driver name.
conn_str =f"DRIVER={{SQL Server}};SERVER={server};DATABASE={database};UID={username};PWD={password}"

# Make the Connection and Read the Tables

In [124]:
# Open the database connection using the connection string built earlier
connection = pyodbc.connect(conn_str)

# One full-table query per year stored in the database
query1 = "Select * from LP1_startup_funding2020"
query2 = "Select * from LP1_startup_funding2021"

# Run the queries in order (2020 first, then 2021), one DataFrame each
data2020, data2021 = (
    pd.read_sql(sql_text, connection) for sql_text in (query1, query2)
)

# Read the csv 

In [125]:
# Load the 2018 and 2019 funding data from the CSV files in the working directory
df2018, df2019 = map(
    pd.read_csv,
    ('startup_funding2018.csv', 'startup_funding2019.csv'),
)

# Drop columns that are not going to be needed in the project and also the empty columns

In [126]:

# Remove the same three columns, in place, from every frame that has them
for _frame in (df2019, data2020, data2021):
    _frame.drop(columns=['Founded', 'Founders', 'Investor'], inplace=True)


In [127]:
# Remove the 'column10' column from the 2020 data, in place
data2020.drop(columns=['column10'], inplace=True)


# Remove the dollar sign from the 'Amount($)' column

In [128]:
# Strip the literal '$' from every amount. regex=False forces a plain-text
# match, so '$' is never interpreted as the regex end-of-string anchor,
# whatever default the installed pandas version uses for `regex`.
df2019['Amount($)'] = df2019['Amount($)'].str.replace('$', '', regex=False)


# Add a Year Column

In [129]:
# Tag every frame with the year of its data; the year is stored as a string
for _frame, _year in (
    (df2018, '2018'),
    (df2019, '2019'),
    (data2020, '2020'),
    (data2021, '2021'),
):
    _frame['Year'] = _year

In [130]:
# Preview the first rows of the 2019 data after dropping columns, stripping '$' and adding Year
df2019.head()

Unnamed: 0,Company/Brand,HeadQuarter,Sector,What it does,Amount($),Stage,Year
0,Bombay Shaving,,Ecommerce,Provides a range of male grooming products,6300000,,2019
1,Ruangguru,Mumbai,Edtech,A learning platform that provides topic-based ...,150000000,Series C,2019
2,Eduisfun,Mumbai,Edtech,It aims to make learning fun via games.,28000000,Fresh funding,2019
3,HomeLane,Chennai,Interior design,Provides interior designing solutions,30000000,Series D,2019
4,Nu Genes,Telangana,AgriTech,"It is a seed company engaged in production, pr...",6000000,,2019


In [131]:
# Keep only the seven columns of interest, in the same logical order for
# every year: company, sector, stage, amount, location, description, year.
df2018 = df2018[
    ['Company Name', 'Industry', 'Round/Series', 'Amount', 'Location', 'About Company', 'Year']
]
df2019 = df2019[
    ['Company/Brand', 'Sector', 'Stage', 'Amount($)', 'HeadQuarter', 'What it does', 'Year']
]

# The two SQL tables use identical column names, so one list serves both
_sql_columns = ['Company_Brand', 'Sector', 'Stage', 'Amount', 'HeadQuarter', 'What_it_does', 'Year']
data2020 = data2020[_sql_columns]
data2021 = data2021[_sql_columns]



In [132]:
# Preview the first rows of the 2020 data after the column selection above
data2020.head()

Unnamed: 0,Company_Brand,Sector,Stage,Amount,HeadQuarter,What_it_does,Year
0,Aqgromalin,AgriTech,,200000.0,Chennai,Cultivating Ideas for Profit,2020
1,Krayonnz,EdTech,Pre-seed,100000.0,Bangalore,An academy-guardian-scholar centric ecosystem ...,2020
2,PadCare Labs,Hygiene management,Pre-seed,,Pune,Converting bio-hazardous waste to harmless waste,2020
3,NCOME,Escrow,,400000.0,New Delhi,Escrow-as-a-service platform,2020
4,Gramophone,AgriTech,,340000.0,Indore,Gramophone is an AgTech platform enabling acce...,2020


In [133]:
# Give all four frames the same column names so they can be compared and
# combined. The names are assigned as a FLAT list: a nested list ([[...]])
# makes pandas build a one-level MultiIndex, after which df['Amount'] returns
# a DataFrame instead of a Series and breaks ordinary column operations.
_unified_columns = ['Company_Brand', 'Sector', 'Round/Series', 'Amount', 'HeadQuarter', 'BIO', 'Year']
for _frame in (df2018, df2019, data2020, data2021):
    _frame.columns = _unified_columns

In [138]:
# Count the missing values in each column of the 2020 data
data2020.isna().sum()

Company_Brand      0
Sector            13
Round/Series     464
Amount           254
HeadQuarter       94
BIO                0
Year               0
dtype: int64

In [139]:
# Count the missing values in each column of the 2021 data
data2021.isna().sum()

Company_Brand      0
Sector             0
Round/Series     428
Amount             3
HeadQuarter        1
BIO                0
Year               0
dtype: int64

In [140]:
# Count the missing values in each column of the 2018 data
df2018.isna().sum()

Company_Brand    0
Sector           0
Round/Series     0
Amount           0
HeadQuarter      0
BIO              0
Year             0
dtype: int64

In [141]:
# Count the missing values in each column of the 2019 data
df2019.isna().sum()

Company_Brand     0
Sector            5
Round/Series     46
Amount            0
HeadQuarter      19
BIO               0
Year              0
dtype: int64

# 1st Hypothesis

Hypothesis: The headquarters location of a startup in India (e.g., Chennai, Bangalore, Pune, New Delhi, Indore) has a significant impact on the funding amount it receives.

Explanation: You can hypothesize that the location of a startup's headquarters may influence its access to funding. For example, you might expect that startups in major startup hubs like Bangalore receive larger amounts of funding compared to those in smaller cities. You can investigate this hypothesis by performing statistical analysis, such as:

ANOVA or Kruskal-Wallis Test: To determine if there are significant differences in funding amounts between startups in different cities.
Regression Analysis: To model the relationship between the headquarters location and funding amounts while controlling for other factors like sector and stage.

# 2nd Hypothesis

Hypothesis: Startups in the AgriTech sector in India receive higher average funding amounts compared to startups in other sectors.

Explanation: You can hypothesize that the AgriTech sector, given its potential for innovation and growth in India, attracts more significant funding. To test this hypothesis, you can:

Calculate the average funding amounts for startups in different sectors.
Perform a statistical test, such as a t-test or ANOVA, to determine if there is a significant difference in funding amounts between the AgriTech sector and other sectors.

# 3rd Hypothesis

Hypothesis: The total funding received by Indian startups has been increasing over the years (from 2018 to 2021).

Explanation: This hypothesis focuses on the trend in total funding over time. To test this hypothesis, you can:

Calculate the total funding received by all startups for each year (2018, 2019, 2020, 2021).
Create a time series plot or line chart to visualize the funding trend over the four years.