In [2]:
# Luca Soltero
import sqlite3
from pathlib import Path

import pandas as pd

# Path to the SQLite database file. Despite the name, this value is passed
# straight to sqlite3.connect(), so it must point at the database file itself.
repo_dir = "your path here"

# sqlite3.connect() silently creates a new, empty database when the file does
# not exist; the mistake would then only surface later as "no such table".
# Fail fast with an actionable message instead.
if not Path(repo_dir).is_file():
    raise FileNotFoundError(
        f"SQLite database not found at {repo_dir!r}; "
        "set repo_dir to the path of the database file."
    )

conn = sqlite3.connect(repo_dir)
curs = conn.cursor()  # cursor handle; the pandas queries in this notebook use `conn` directly

# Show every column when a DataFrame is printed.
pd.set_option('display.max_columns', None)
# Fund sample of interest: one row per (deal, investor fund) where the fund is
# US-based and the invested amount is known.
#
# CTE pipeline:
#   prelim              - DealInvestorRelation rows with their Deal attributes
#   joined              - adds the fund's country and a coarse SeedType label
#   seed_investment_info- adds HasInvestedAmount (1 when the amount is non-NULL)
#   ranked_investments  - numbers each fund's deals in a company by DealDate
#
# SeedType matching: SQLite's LIKE is case-insensitive for ASCII, so a pattern
# such as '%E%' also matches the lowercase "e" in "Series" and the later F/G
# branches could never be reached for values like "Series F". The single-letter
# branches therefore use GLOB, which is case-sensitive.
# NOTE(review): values without an uppercase series letter (or with another
# uppercase letter, e.g. "Common") are still classified by the first matching
# letter - confirm against the actual SeriesOfStock vocabulary.
qry = """WITH prelim AS (
    SELECT
        dir.dealid,
        dir.investorfundid,
        dir.InvestorInvestmentAmount,
        dir.NumberOfSharesAcquired AS NumberOfSharesAcquired_DIR,
        d.NumberOfSharesAcquired AS NumberOfSharesAcquired_D,
        d.PercentAcquired,
        d.DealSize,
        d.StockSplit,
        d.SeriesOfStock,
        d.DealDate,
        d.CompanyID,
        d.DealID
    FROM
        DealInvestorRelation dir
    LEFT JOIN
        Deal d ON dir.dealid = d.DealID
),
joined AS (
    SELECT
        p.dealid,
        p.InvestorFundID,
        p.InvestorInvestmentAmount,
        p.NumberOfSharesAcquired_DIR,
        p.NumberOfSharesAcquired_D,
        p.PercentAcquired,
        p.StockSplit,
        p.DealSize,
        p.DealDate,
        p.CompanyID,
        p.DealID,
        f.FundCountry,
        CASE
            WHEN p.SeriesOfStock LIKE '%Seed%' THEN 'Seed'
            WHEN p.SeriesOfStock GLOB '*A*' THEN 'A'
            WHEN p.SeriesOfStock GLOB '*B*' THEN 'B'
            WHEN p.SeriesOfStock GLOB '*C*' THEN 'C'
            WHEN p.SeriesOfStock GLOB '*D*' THEN 'D'
            WHEN p.SeriesOfStock GLOB '*E*' THEN 'E'
            WHEN p.SeriesOfStock GLOB '*F*' THEN 'F'
            WHEN p.SeriesOfStock GLOB '*G*' THEN 'G'
            ELSE 'Null'
        END AS SeedType
    FROM
        prelim p
    LEFT JOIN
        Fund f ON p.investorfundid = f.fundid
),
seed_investment_info AS (
    SELECT
        *,
        CASE
            WHEN InvestorInvestmentAmount IS NOT NULL THEN 1
            ELSE 0
        END AS HasInvestedAmount
    FROM
        joined
),
ranked_investments AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY InvestorFundID, CompanyID ORDER BY DealDate) AS Rank
    FROM
        seed_investment_info
)
SELECT
    *
FROM
ranked_investments
    WHERE
    InvestorInvestmentAmount IS NOT NULL AND
    FundCountry = 'United States'"""
# Load the fund sample and parse deal dates into timestamps.
df = pd.read_sql_query(qry, conn)
df['DealDate'] = pd.to_datetime(df['DealDate'])
# No pandas-side rename is needed: the query already exposes the two share
# counts as NumberOfSharesAcquired_DIR and NumberOfSharesAcquired_D.

# A zero denominator would yield +/-inf, which dropna()/notnull() treat as
# valid data. Mask zeros to NaN so undefined ratios are handled as missing.
# (If such rows exist, counts printed by earlier runs may differ.)
nonzero_percent_acquired = df["PercentAcquired"].where(df["PercentAcquired"] != 0)
nonzero_deal_size = df["DealSize"].where(df["DealSize"] != 0)

# calculate total shares and fund percent acquired
# TotalShares: deal-level shares scaled up by the percentage they represent.
df["TotalShares"] = (df["NumberOfSharesAcquired_D"] / nonzero_percent_acquired) * 100
nonzero_total_shares = df["TotalShares"].where(df["TotalShares"] != 0)
# FundPA: the fund's own shares as a percentage of TotalShares.
df["FundPA"] = (df["NumberOfSharesAcquired_DIR"] / nonzero_total_shares) * 100
# PredictedFundPA_0: the fund's share of the deal size applied to PercentAcquired.
df["PredictedFundPA_0"] = (df["InvestorInvestmentAmount"] / nonzero_deal_size) * df["PercentAcquired"]

# Rows that have every field the later filtering relies on.
checkdf = df.dropna(subset=['TotalShares', "PredictedFundPA_0", "StockSplit"])
print(len(checkdf))

11297


In [3]:
# All deals: the same CTE pipeline as the fund-sample query but without the
# final country / invested-amount filter, so every DealInvestorRelation row is
# returned.
#
# SeedType matching: SQLite's LIKE is case-insensitive for ASCII, so a pattern
# such as '%E%' also matches the lowercase "e" in "Series" and the later F/G
# branches could never be reached for values like "Series F". The single-letter
# branches therefore use GLOB, which is case-sensitive.
# NOTE(review): values without an uppercase series letter (or with another
# uppercase letter, e.g. "Common") are still classified by the first matching
# letter - confirm against the actual SeriesOfStock vocabulary.
qry2 = """WITH prelim AS (
    SELECT
        dir.dealid,
        dir.investorfundid,
        dir.InvestorInvestmentAmount,
        dir.NumberOfSharesAcquired AS NumberOfSharesAcquired_DIR,
        d.NumberOfSharesAcquired AS NumberOfSharesAcquired_D,
        d.PercentAcquired,
        d.DealSize,
        d.SeriesOfStock,
        d.DealDate,
        d.CompanyID,
        d.DealID,
        d.StockSplit
    FROM
        DealInvestorRelation dir
    LEFT JOIN
        Deal d ON dir.dealid = d.DealID
),
joined AS (
    SELECT
        p.dealid,
        p.investorfundid,
        p.InvestorInvestmentAmount,
        p.StockSplit,
        p.NumberOfSharesAcquired_DIR,
        p.NumberOfSharesAcquired_D,
        p.PercentAcquired,
        p.DealSize,
        p.DealDate,
        p.CompanyID,
        p.DealID,
        f.FundCountry,
        CASE
            WHEN p.SeriesOfStock LIKE '%Seed%' THEN 'Seed'
            WHEN p.SeriesOfStock GLOB '*A*' THEN 'A'
            WHEN p.SeriesOfStock GLOB '*B*' THEN 'B'
            WHEN p.SeriesOfStock GLOB '*C*' THEN 'C'
            WHEN p.SeriesOfStock GLOB '*D*' THEN 'D'
            WHEN p.SeriesOfStock GLOB '*E*' THEN 'E'
            WHEN p.SeriesOfStock GLOB '*F*' THEN 'F'
            WHEN p.SeriesOfStock GLOB '*G*' THEN 'G'
            ELSE 'Null'
        END AS SeedType
    FROM
        prelim p
    LEFT JOIN
        Fund f ON p.investorfundid = f.fundid
),
seed_investment_info AS (
    SELECT
        *,
        CASE
            WHEN InvestorInvestmentAmount IS NOT NULL THEN 1
            ELSE 0
        END AS HasInvestedAmount
    FROM
        joined
),
ranked_investments AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY investorfundid, CompanyID ORDER BY DealDate) AS Rank
    FROM
        seed_investment_info
)
SELECT
    *
FROM
    ranked_investments;"""

# Run the all-deals query over the shared connection and report its row count.
# NOTE(review): DealDate is left exactly as SQLite returns it here, whereas the
# fund-sample frame parses it with pd.to_datetime - confirm that both encodings
# distinguish the same set of dates before comparing per-company date counts.
df2 = pd.read_sql_query(sql=qry2, con=conn)
print(df2.shape[0])

1420277


In [4]:
# Keep only companies for which *every* fund-sample row carries the fields the
# share calculations need, then restrict both frames to those companies.
required_cols = ['TotalShares', "PredictedFundPA_0", "StockSplit"]
grouped = df.groupby(['CompanyID'])

# Vectorised completeness test (replaces a per-group Python lambda):
# count() tallies non-null cells per company and column, size() is the
# company's row count; a company is complete when they agree in every column.
non_null_counts = grouped[required_cols].count()
rows_per_company = grouped.size()
company_is_complete = non_null_counts.eq(rows_per_company, axis=0).all(axis=1)
indices_to_include = set(company_is_complete[company_is_complete].index)

# Rows of the fund sample that belong to complete companies.
in_complete_company = df['CompanyID'].isin(indices_to_include)
companies_with_NOTnull_values = df[in_complete_company]
df = df[in_complete_company]
df2 = df2[df2['CompanyID'].isin(indices_to_include)]

# Distinct companies left in each frame (CompanyID cannot be NaN after the
# isin filter, so nunique() equals len(unique())).
print(df["CompanyID"].nunique())
print(df2["CompanyID"].nunique())
# Number of remaining fund-sample rows (not the number of distinct funds).
print(len(df))

5842
5842
9265


In [5]:
# Completeness check per company: count the distinct deal dates seen in the
# fund sample (df) and in the all-deals frame (df2). Distinct dates stand in
# for the number of deals; df2 is treated as the reference history. A company
# whose two counts agree is considered to have a complete deal history in df.
# This cell only measures the mismatch - nothing is removed here.
grouped_rounds = df.groupby("CompanyID")
unique_date_counts_first = grouped_rounds['DealDate'].nunique()

grouped_second = df2.groupby("CompanyID")
unique_date_counts_second = grouped_second['DealDate'].nunique()

# Line the two per-company counts up side by side (inner join on CompanyID).
merged_counts = pd.merge(
    unique_date_counts_first,
    unique_date_counts_second,
    on='CompanyID',
    suffixes=('_first', '_second'),
)
print(merged_counts)

total_rows = len(merged_counts)

# One boolean per company: True where both frames report the same count.
dates_match = merged_counts['DealDate_first'] == merged_counts['DealDate_second']
equal_counts = dates_match.sum()
unequal_counts = (~dates_match).sum()
print(equal_counts)
print(equal_counts + unequal_counts)

# Share of companies (in percent) whose counts disagree.
percentage_not_equal = (unequal_counts / total_rows) * 100
print("Percentage of time the date counts are not equal between the two groups:", percentage_not_equal)

           DealDate_first  DealDate_second
CompanyID                                 
100021-87               2                4
100023-85               1                4
100024-12               1                7
100077-31               1                6
100100-26               1                3
...                   ...              ...
99871-84                1                5
99913-42                2                6
99936-10                1                3
99995-95                4                6
99996-40                2                3

[5842 rows x 2 columns]
912
5842
Percentage of time the date counts are not equal between the two groups: 84.3889079082506


In [6]:
# Narrow the fund sample to companies whose distinct deal-date count matches
# the all-deals count, i.e. those treated as having a complete deal history.
has_full_history = merged_counts['DealDate_first'].eq(merged_counts['DealDate_second'])
equal_counts_companies = merged_counts.loc[has_full_history]

# Despite the name this is a set of CompanyID values (the index of merged_counts).
equal_counts_company_list = set(equal_counts_companies.index)
df = df.loc[df['CompanyID'].isin(equal_counts_company_list)]

# These rows are the scope of the algorithm.
print(df)
remaining_funds = df["InvestorFundID"].unique()
remaining_companies = df["CompanyID"].unique()
print(len(remaining_funds))
print(len(remaining_companies))

           dealid InvestorFundID  InvestorInvestmentAmount  \
7       17439-40T      10918-54F                 28.000000   
43      24409-00T      10942-21F                  0.460000   
45      17350-48T      10942-39F                  1.000000   
62      19909-99T      10950-67F                  5.192000   
90      23067-82T      10976-50F                  0.830000   
...           ...            ...                       ...   
27229  244909-36T      24657-76F                  0.100000   
27236  223715-08T      24657-76F                  0.100000   
27238  244909-00T      24657-76F                  0.100000   
27246  213309-19T      24775-48F                  0.086128   
27248  160219-90T      24897-97F                  0.631194   

       NumberOfSharesAcquired_DIR  NumberOfSharesAcquired_D  PercentAcquired  \
7                             NaN                 4049000.0            21.24   
43                            NaN                 3000000.0            15.68   
45             

In [8]:
# Close the SQLite connection that was opened with sqlite3.connect(repo_dir).
conn.close()