<a href="https://colab.research.google.com/github/HillZhang2004/Forked_Hill_Spring20250219_GITWorkshop/blob/main/CS46_Project_Startup_Search_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleaning

In [None]:
import pandas as pd

# Load the raw funding export and preview the first rows as a sanity check.
df = pd.read_csv('startup_funding2021.csv')
print(df.head())

# Rename the columns positionally. This assignment requires the raw file to
# have exactly nine columns; pandas raises ValueError on a length mismatch.
df.columns = [
    'Company/Brand',
    'Founded',
    'HeadQuarter',
    'Sector',
    'What it does',
    'Founders',
    'Investor',
    'Amount($)',
    'Stage',
]

# Remove dollar signs and thousands separators so the amounts can be parsed.
# The pattern is a raw string: '\$' inside a normal string literal is an
# invalid escape sequence. Casting to str first keeps the .str accessor usable
# even if pandas inferred a numeric dtype for the column.
df['Amount($)'] = df['Amount($)'].astype(str).str.replace(r'[\$,]', '', regex=True)

# Values that still fail to parse become NaN instead of raising.
df['Amount($)'] = pd.to_numeric(df['Amount($)'], errors='coerce')

# Keep only rows that have an amount, a company name, and a sector.
df = df.dropna(subset=['Amount($)', 'Company/Brand', 'Sector'])
df.to_csv('startup_funding2021_cleaned.csv', index=False)



    Company/Brand  Founded HeadQuarter          Sector  \
0  Unbox Robotics   2019.0   Bangalore      AI startup   
1          upGrad   2015.0      Mumbai          EdTech   
2     Lead School   2012.0      Mumbai          EdTech   
3         Bizongo   2015.0      Mumbai  B2B E-commerce   
4        FypMoney   2021.0    Gurugram         FinTech   

                                        What it does  \
0  Unbox Robotics builds on-demand AI-driven ware...   
1     UpGrad is an online higher education platform.   
2  LEAD School offers technology based school tra...   
3  Bizongo is a business-to-business online marke...   
4  FypMoney is Digital NEO Bank for Teenagers, em...   

                                            Founders  \
0                        Pramod Ghadge, Shahid Memon   
1  Mayank Kumar, Phalgun Kompalli, Ravijot Chugh,...   
2                         Smita Deorah, Sumeet Mehta   
3            Aniket Deb, Ankit Tomar, Sachin Agrawal   
4                                 

# Code

In [None]:
# *** Startup Search Engine Project ***

# Team Members: Kazuma, Hill, Haven, Evelyn

# This project builds an interactive startup funding explorer based on
# 2021 Crunchbase Startup Funding data.

# Users can:
# - Search startups by sector, keyword, or investor (partial match supported)
# - View top funded startups
# - View top sectors by number of startups (with table and bar chart)
# - Explore startup connections (DFS traversal based on shared investors)
# - See top investors ranked by number of startups funded
# - Save top funded startups into a CSV file

# Data Structures Used:
# - Hash tables (sector and investor lookup), Graphs (investor–startup relationships)
# - Stack (for DFS traversal), Lists, Dictionaries, Counters

# Libraries Used (learned from CS36 Class):
# pandas, collections, matplotlib, tabulate

import pandas as pd
from collections import Counter
from tabulate import tabulate
import matplotlib.pyplot as plt

# Load cleaned data
df = pd.read_csv('startup_funding2021_cleaned.csv')

# Build Sector Hash Table
# Maps each lower-cased, stripped sector name to the list of company names
# found under it, in file order.
sector_hash = {}
for raw_sector, name in zip(df['Sector'], df['Company/Brand']):
    # Test for missing values BEFORE converting to str: str(NaN) is the text
    # 'nan', which pd.notna() would accept and file under a bogus 'nan' sector.
    if pd.isna(raw_sector) or pd.isna(name):
        continue
    sector = str(raw_sector).strip().lower()
    sector_hash.setdefault(sector, []).append(name)

# Build Investor Graph
# Maps each investor name to the list of (company, sector) pairs taken from
# every row whose comma-separated 'Investor' cell mentions that investor.
investor_graph = {}
for _, row in df.iterrows():
    company = row['Company/Brand']
    sector = str(row['Sector']).strip()
    for raw_investor in str(row['Investor']).split(','):
        investor = raw_investor.strip()
        # Skip blank fragments and the 'nan' text that str() yields for a
        # missing cell.
        if not investor or investor.lower() == 'nan':
            continue
        investor_graph.setdefault(investor, []).append((company, sector))

# Funding Sorted DataFrame
# funding_sorted keeps the numeric amounts (largest first); the *_display copy
# holds the same rows with amounts rendered as comma-grouped integer strings.
funding_sorted = df[['Company/Brand', 'Amount($)']].sort_values('Amount($)', ascending=False)
funding_sorted_display = funding_sorted.assign(
    **{'Amount($)': funding_sorted['Amount($)'].apply(lambda amount: format(int(amount), ','))}
)

# Main Functions

def search_keyword(keyword):
    """Search startups whose company name contains *keyword*.

    The match is case-insensitive and literal: characters such as '(' or '+'
    are compared as plain text rather than interpreted as a regular
    expression, so arbitrary user input cannot raise a pattern error.

    Args:
        keyword: Text to look for inside the 'Company/Brand' column.

    Returns:
        A fancy_grid table (string) with company, sector and comma-grouped
        amount, or a "not found" message when nothing matches.
    """
    mask = df['Company/Brand'].str.contains(keyword, case=False, na=False, regex=False)
    # Work on an explicit copy so formatting the amount column never writes
    # into a slice of the module-level DataFrame.
    results = df.loc[mask, ['Company/Brand', 'Sector', 'Amount($)']].copy()
    if results.empty:
        return "No startups found with that keyword."
    results['Amount($)'] = results['Amount($)'].apply(lambda x: f"{int(x):,}")
    return tabulate(results, headers='keys', tablefmt='fancy_grid', showindex=False)

def search_investor(investor_name):
    """Search startups by full or partial investor name.

    Every key of investor_graph that contains the query (case-insensitive,
    surrounding whitespace ignored) counts as a match. The startups of all
    matching investors are merged into one table.

    Args:
        investor_name: Full or partial investor name typed by the user.

    Returns:
        A fancy_grid table (string) of (company, sector) rows, or a
        "not found" message when no investor name contains the query.
    """
    needle = investor_name.strip().lower()
    matches = [key for key in investor_graph if needle in key.lower()]
    if not matches:
        return "No startups found for that investor."

    # A startup backed by several matching investors would otherwise be
    # listed once per investor; dict.fromkeys drops the repeats while keeping
    # first-seen order.
    results = list(dict.fromkeys(
        pair for match in matches for pair in investor_graph[match]
    ))
    return tabulate(results, headers=['Company/Brand', 'Sector'], tablefmt='fancy_grid', showindex=False)

def top_sectors(n=5):
    """Build a table of the n sectors that contain the most startups.

    Counts come from the length of each sector_hash entry; sector names are
    title-cased for display and the table carries a row index.
    """
    tally = Counter({name: len(members) for name, members in sector_hash.items()})
    rows = []
    for name, total in tally.most_common(n):
        rows.append((name.title(), total))
    return tabulate(rows, headers=['Sector', 'Number of Startups'], tablefmt='fancy_grid', showindex=True)

def top_investors(n=5):
    """Build a table of the n investors with the most investor_graph entries.

    Each investor's score is the number of (company, sector) pairs recorded
    for it; the table carries a row index.
    """
    tally = Counter()
    for name, funded in investor_graph.items():
        tally[name] = len(funded)
    ranking = tally.most_common(n)
    return tabulate(ranking, headers=['Investor', 'Number of Startups Funded'], tablefmt='fancy_grid', showindex=True)

def save_results_to_csv(results_df, filename):
    """Write results_df to filename as CSV (no index column) and confirm on stdout."""
    results_df.to_csv(path_or_buf=filename, index=False)
    confirmation = f"\n✅ Results saved to {filename}"
    print(confirmation)

def plot_top_sectors(n=5):
    """Show a bar chart of the n sectors that contain the most startups.

    Bar heights are the lengths of the sector_hash entries; labels are the
    title-cased sector names. A confirmation line is printed after the chart
    is shown.
    """
    tally = Counter({name: len(members) for name, members in sector_hash.items()})

    labels, heights = [], []
    for name, total in tally.most_common(n):
        labels.append(name.title())
        heights.append(total)

    plt.figure(figsize=(10, 6))
    plt.bar(labels, heights, color='skyblue')
    plt.title(f'Top {n} Sectors by Number of Startups')
    plt.xlabel('Sector')
    plt.ylabel('Number of Startups')
    plt.xticks(rotation=45)
    plt.grid(axis='y')
    plt.show()
    print("\n✅ Chart displayed. Returning to menu...")

def dfs_company_investor_graph(start_company):
    """Print every startup reachable from start_company via shared investors.

    Two companies are neighbours when they appear under the same investor in
    investor_graph. The traversal is an iterative depth-first search driven by
    an explicit stack; each company is printed once, in visit order.

    The start name is matched exactly first and then case-insensitively
    (ignoring surrounding whitespace). If no company with that name appears in
    investor_graph, a message is printed and nothing is traversed, instead of
    echoing the typed text back as if it were a result.

    Args:
        start_company: Company name to start the traversal from.
    """
    # Company adjacency list plus a case-insensitive lookup of the names.
    company_adj_list = {}
    canonical_names = {}
    for companies in investor_graph.values():
        company_names = [company for company, _ in companies]
        for name in company_names:
            company_adj_list.setdefault(name, [])
            canonical_names.setdefault(str(name).strip().casefold(), name)
        # Link every unordered pair of companies that share this investor.
        for i, a in enumerate(company_names):
            for b in company_names[i + 1:]:
                company_adj_list[a].append(b)
                company_adj_list[b].append(a)

    if start_company not in company_adj_list:
        resolved = canonical_names.get(str(start_company).strip().casefold())
        if resolved is None:
            print(f"\nNo startup named '{start_company}' with recorded investors was found.")
            return
        start_company = resolved

    visited = set()
    stack = [start_company]
    print("\n🧭 Connected startups = startups funded by the same investor(s). We explore connections using DFS from your starting company.\n")
    while stack:
        company = stack.pop()
        if company in visited:
            # The same company can be pushed several times before it is popped.
            continue
        print(f"- {company}")
        visited.add(company)
        for neighbor in company_adj_list.get(company, []):
            if neighbor not in visited:
                stack.append(neighbor)

# CLI Loop

def _prompt_positive_int(prompt):
    """Read a whole number greater than zero from the user.

    Raises ValueError both for non-numeric text and for zero or negative
    values, so callers can handle every bad entry with one except clause.
    """
    value = int(input(prompt).strip())
    if value <= 0:
        raise ValueError(f"expected a positive number, got {value}")
    return value


# Interactive menu: keep prompting until the user picks option 10.
while True:
    print("\n=== Startup Search Engine ===")
    print("1. List startups by sector")
    print("2. Search startups by keyword")
    print("3. Show top N funded startups")
    print("4. Search startups by investor")
    print("5. Show top sectors (table)")
    print("6. Show top investors (table)")
    print("7. Save top-funded startups to CSV")
    print("8. Plot top sectors (bar chart)")
    print("9. Explore connected startups (DFS)")
    print("10. Exit")

    choice = input("Choose an option (1–10): ").strip()

    if choice == '1':
        # sector_hash keys are lower-cased, so normalise the input the same way.
        sec = input("Enter sector name: ").strip().lower()
        startups = sector_hash.get(sec)
        if startups:
            print(tabulate([[s] for s in startups], headers=['Company/Brand'], tablefmt='fancy_grid', showindex=True))
        else:
            print("No startups found in that sector.")

    elif choice == '2':
        keyword = input("Enter keyword to search: ").strip()
        print(search_keyword(keyword))

    elif choice == '3':
        try:
            # A negative N would make head() return "all but the last |N|"
            # rows, so only positive counts are accepted.
            N = _prompt_positive_int("How many top-funded startups to show? ")
            print(tabulate(funding_sorted_display.head(N), headers='keys', tablefmt='fancy_grid', showindex=False))
        except ValueError:
            print("Please enter a valid number.")

    elif choice == '4':
        investor_name = input("Enter part or full investor name: ").strip()
        print(search_investor(investor_name))

    elif choice == '5':
        print(top_sectors(10))

    elif choice == '6':
        print(top_investors(10))

    elif choice == '7':
        try:
            N = _prompt_positive_int("How many startups to save? ")
            filename = input("Filename to save (e.g., top_startups.csv): ").strip()
            save_results_to_csv(funding_sorted.head(N), filename)
        except ValueError:
            print("Please enter a valid number.")
        except OSError as err:
            # An empty or unwritable path should not end the whole session.
            print(f"Could not save results: {err}")

    elif choice == '8':
        try:
            n = _prompt_positive_int("How many top sectors to plot? (e.g., 5): ")
            plot_top_sectors(n)
        except ValueError:
            print("Please enter a valid number.")

    elif choice == '9':
        start_company = input("Enter starting company name for DFS: ").strip()
        dfs_company_investor_graph(start_company)

    elif choice == '10':
        print("Exiting. Thank you for using Startup Search Engine!")
        break

    else:
        print("Invalid choice. Please type a number between 1–10.")


=== Startup Search Engine ===
1. List startups by sector
2. Search startups by keyword
3. Show top N funded startups
4. Search startups by investor
5. Show top sectors (table)
6. Show top investors (table)
7. Save top-funded startups to CSV
8. Plot top sectors (bar chart)
9. Explore connected startups (DFS)
10. Exit

🧭 Connected startups = startups funded by the same investor(s). We explore connections using DFS from your starting company.

- upGrad
- RenewBuy
- NephroPlus
- Lido Learning

=== Startup Search Engine ===
1. List startups by sector
2. Search startups by keyword
3. Show top N funded startups
4. Search startups by investor
5. Show top sectors (table)
6. Show top investors (table)
7. Save top-funded startups to CSV
8. Plot top sectors (bar chart)
9. Explore connected startups (DFS)
10. Exit
