In [6]:
import zipfile
import os
import pandas as pd
import plotly.express as px
from jupyter_dash import JupyterDash
import dash
from dash import dcc, html, dash_table
from dash.dependencies import Input, Output, State
import dash_bootstrap_components as dbc

zip_file_path = "Data.zip"
extract_path = "extracted_data"
os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

years = list(range(2014, 2025))
quarters = ["QTR1", "QTR2", "QTR3", "QTR4"]

all_data = []

for year in years:
    for quarter in quarters:
        file_path = os.path.join(extract_path, f"{year}_{quarter}.csv")
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            df["Year"] = year
            df["Quarter"] = quarter
            all_data.append(df)

if not all_data:
    raise ValueError("No data was found. Ensure the ZIP file contains valid CSV files.")
df_filings = pd.concat(all_data, ignore_index=True)


# Store full dataset before filtering (for export)
df_full_data = df_filings.copy()


# Ensure necessary columns exist
required_columns = ["Central Index Key","Year", "Quarter", "State or Country - Full - Physical Location", "Total Amount Offered"]
df_filings = df_filings[required_columns].dropna()

# Rename columns for usability
df_filings = df_filings.rename(columns={"State or Country - Full - Physical Location": "State"})

# Convert "Total Amount Offered" to numeric, handling "Indefinite" cases
df_filings["Total Amount Offered"] = pd.to_numeric(df_filings["Total Amount Offered"], errors="coerce")
df_filings["Total Amount Offered"] = df_filings["Total Amount Offered"].fillna(0)
print(df_filings.head(5))
print(df_full_data.head(5))

app = JupyterDash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

app.layout = dbc.Container([
    html.H1("SEC D-Type Filings Filter", className="text-center mt-4"),

    dbc.Row([
        dbc.Col([
            dcc.Dropdown(
                id="year-dropdown",
                options=[{"label": str(y), "value": y} for y in sorted(df_filings["Year"].unique())],
                placeholder="Select Year",
                multi=True
            ),
        ], width=3),

        dbc.Col([
            dcc.Dropdown(
                id="quarter-dropdown",
                options=[{"label": q, "value": q} for q in sorted(df_filings["Quarter"].unique())],
                placeholder="Select Quarter",
                multi=True
            ),
        ], width=3),

        dbc.Col([
            dcc.Dropdown(
                id="state-dropdown",
                options=[{"label": s, "value": s} for s in sorted(df_filings["State"].dropna().unique())],
                placeholder="Select State",
                multi=True
            ),
        ], width=3),
    ], className="mb-3"),

    dbc.Row([
    dbc.Col([
        html.Label("Total Amount Offered (USD)", className="font-weight-bold"), 
        dcc.RangeSlider(
            id="amount-slider",
            min=df_filings["Total Amount Offered"].min(),
            max=df_filings["Total Amount Offered"].max(),
            step=10000,
            marks={int(x): f"${x:,}" for x in range(0, int(df_filings["Total Amount Offered"].max()), 50000000)},
            value=[df_filings["Total Amount Offered"].min(), df_filings["Total Amount Offered"].max()],
            tooltip={"placement": "bottom", "always_visible": True}  
        ),
        html.Div(id="amount-slider-output", className="text-center mt-2 font-italic"), 
    ], width=9),
], className="mb-3"),


    html.Div(id="data-count", className="text-center mt-3 mb-3 font-weight-bold"),

    dash_table.DataTable(
        id="table",
        columns=[{"name": col, "id": col} for col in df_filings.columns],
        page_size=0,
        style_table={"overflowX": "auto"}
    ),

    html.Button("Export to CSV", id="export-btn", n_clicks=0, className="btn btn-primary mt-3"),
    dcc.Download(id="download-dataframe-csv")
])

@app.callback(
    Output("table", "data"),
    Output("data-count", "children"),
    Input("year-dropdown", "value"),
    Input("quarter-dropdown", "value"),
    Input("state-dropdown", "value"),
    Input("amount-slider", "value"),
)
def update_table(selected_years, selected_quarters, selected_states, amount_range):
    filtered_df = df_filings  # Only filtering on selected columns

    if selected_years:
        filtered_df = filtered_df[filtered_df["Year"].isin(selected_years)]
    if selected_quarters:
        filtered_df = filtered_df[filtered_df["Quarter"].isin(selected_quarters)]
    if selected_states:
        filtered_df = filtered_df[filtered_df["State"].isin(selected_states)]
    if amount_range:
        filtered_df = filtered_df[
            (filtered_df["Total Amount Offered"] >= amount_range[0]) &
            (filtered_df["Total Amount Offered"] <= amount_range[1])
        ]

    count_text = f"Total Filings Found: {len(filtered_df)}"
    return filtered_df.to_dict("records"), count_text

@app.callback(
    Output("download-dataframe-csv", "data"),
    Input("export-btn", "n_clicks"),  # Only triggers on button click
    State("year-dropdown", "value"),
    State("quarter-dropdown", "value"),
    State("state-dropdown", "value"),
    State("amount-slider", "value"),
    prevent_initial_call=True
)
def export_csv(n_clicks, selected_years, selected_quarters, selected_states, amount_range):
    filtered_df = df_filings 

    if selected_years:
        filtered_df = filtered_df[filtered_df["Year"].isin(selected_years)]
    if selected_quarters:
        filtered_df = filtered_df[filtered_df["Quarter"].isin(selected_quarters)]
    if selected_states:
        filtered_df = filtered_df[filtered_df["State"].isin(selected_states)]
    if amount_range:
        filtered_df = filtered_df[
            (filtered_df["Total Amount Offered"] >= amount_range[0]) &
            (filtered_df["Total Amount Offered"] <= amount_range[1])
        ]

    matching_ids = filtered_df["Central Index Key"].unique()  # Extract unique IDs from filtered data
    full_filtered_df = df_full_data[df_full_data["Central Index Key"].isin(matching_ids)]  # Match with full dataset

    if full_filtered_df.empty:
        return dash.no_update 

    return dcc.send_data_frame(full_filtered_df.to_csv, "filtered_sec_filings.csv")



# Run the Dash app inside Jupyter Notebook
app.run_server(mode="inline")


ModuleNotFoundError: No module named 'jupyter_dash'

In [7]:
import zipfile
import pandas as pd
import plotly.express as px
import os
#Data filter for growth in fillings by US state, input the state abbreviation for the filter to work 

years = list(range(2014, 2025))  # From 2014 to 2024
quarters = ["QTR1", "QTR2", "QTR3", "QTR4"]

# State abbreviations and full names
all_states = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", 
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
]
state_full_names = [
    "ALABAMA", "ALASKA", "ARIZONA", "ARKANSAS", "CALIFORNIA", "COLORADO", "CONNECTICUT", 
    "DELAWARE", "FLORIDA", "GEORGIA", "HAWAII", "IDAHO", "ILLINOIS", "INDIANA", "IOWA", 
    "KANSAS", "KENTUCKY", "LOUISIANA", "MAINE", "MARYLAND", "MASSACHUSETTS", "MICHIGAN", 
    "MINNESOTA", "MISSISSIPPI", "MISSOURI", "MONTANA", "NEBRASKA", "NEVADA", "NEW HAMPSHIRE", 
    "NEW JERSEY", "NEW MEXICO", "NEW YORK", "NORTH CAROLINA", "NORTH DAKOTA", "OHIO", 
    "OKLAHOMA", "OREGON", "PENNSYLVANIA", "RHODE ISLAND", "SOUTH CAROLINA", "SOUTH DAKOTA", 
    "TENNESSEE", "TEXAS", "UTAH", "VERMONT", "VIRGINIA", "WASHINGTON", "WEST VIRGINIA", 
    "WISCONSIN", "WYOMING"
]

state_name_to_abbreviation = dict(zip(state_full_names, all_states))

zip_file_path = "Data.zip"
extract_path = "extracted_data"

os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

all_data = []

for year in years:
    for quarter in quarters:
        csv_file_name = f"{year}_{quarter}.csv"
        file_path = os.path.join(extract_path, csv_file_name)
        
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)

            jurisdiction_counts = (
                df["State or Country - Legal Jurisdiction"]
                .map(state_name_to_abbreviation)
                .value_counts()
                .reset_index()
            )
            jurisdiction_counts.columns = ["State", "JurisdictionCount"]

            state_counts_simple = (
                df["State or Country - Physical Location"]
                .value_counts()
                .reset_index()
            )
            state_counts_simple.columns = ["State", "TotalCount"]
            state_counts_simple["TotalCount"] = state_counts_simple["TotalCount"].fillna(0).astype(int)

            merged_counts = pd.merge(
                jurisdiction_counts, state_counts_simple, on="State", how="outer"
            ).fillna(0)

            merged_counts["Percentage"] = (
                (merged_counts["JurisdictionCount"] / merged_counts["TotalCount"]) * 100
            ).fillna(0)

            merged_counts["Year"] = year
            merged_counts["Quarter"] = quarter
            
            all_data.append(merged_counts)
final_df = pd.concat(all_data, ignore_index=True)

final_df = final_df.sort_values(by=["State", "Year"])

final_df["PercentageGrowth"] = (
    (final_df["Percentage"] - final_df.groupby("State")["Percentage"].shift(4)) /
    final_df.groupby("State")["Percentage"].shift(4)
) * 100

final_df["PercentageGrowth"] = final_df["PercentageGrowth"].fillna(0)
selected_state = input("Enter the state abbreviation (e.g., CA, TX, NY): ").upper()

state_data = final_df[final_df["State"] == selected_state]

if state_data.empty:
    print(f"No data found for state: {selected_state}")
else:
    
    fig_growth = px.line(
        state_data,
        x="Year",
        y="PercentageGrowth",
        title=f"Yearly Jurisdiction Percentage Growth in {selected_state} (2014-2024)",
        labels={"PercentageGrowth": "Percentage Growth (%)"},
    )


fig_growth.show()



Columns (80,156,166,176,178,186,188,196,198,206,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,323,324,325,326,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450

No data found for state: CA,TX


NameError: name 'fig_growth' is not defined

In [8]:
import zipfile
import pandas as pd
import plotly.express as px
import os
all_states = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", 
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
]
state_full_names = [
    "ALABAMA", "ALASKA", "ARIZONA", "ARKANSAS", "CALIFORNIA", "COLORADO", "CONNECTICUT", 
    "DELAWARE", "FLORIDA", "GEORGIA", "HAWAII", "IDAHO", "ILLINOIS", "INDIANA", "IOWA", 
    "KANSAS", "KENTUCKY", "LOUISIANA", "MAINE", "MARYLAND", "MASSACHUSETTS", "MICHIGAN", 
    "MINNESOTA", "MISSISSIPPI", "MISSOURI", "MONTANA", "NEBRASKA", "NEVADA", "NEW HAMPSHIRE", 
    "NEW JERSEY", "NEW MEXICO", "NEW YORK", "NORTH CAROLINA", "NORTH DAKOTA", "OHIO", 
    "OKLAHOMA", "OREGON", "PENNSYLVANIA", "RHODE ISLAND", "SOUTH CAROLINA", "SOUTH DAKOTA", 
    "TENNESSEE", "TEXAS", "UTAH", "VERMONT", "VIRGINIA", "WASHINGTON", "WEST VIRGINIA", 
    "WISCONSIN", "WYOMING"
]

zip_file_path = "Data.zip"
csv_file_name = "2022_QTR4.csv"
extract_path = "extracted_data" 
zip_file_path = "Data.zip"

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    file_list = zip_ref.namelist()

print("Files inside ZIP:", file_list)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extract(csv_file_name, "extracted_data")

df = pd.read_csv(f"extracted_data/{csv_file_name}")

state_name_to_abbreviation = dict(zip(state_full_names, all_states))

jurisdiction_counts = (
    df["State or Country - Legal Jurisdiction"]
    .map(state_name_to_abbreviation)
    .value_counts()  
    .reset_index()
)
jurisdiction_counts.columns = ["State", "JurisdictionCount"]
state_counts_simple = (
    df["State or Country - Physical Location"]
    .value_counts()
    .reset_index()
)
state_counts_simple.columns = ["State", "TotalCount"]
state_counts_simple["TotalCount"] = state_counts_simple["TotalCount"].fillna(0).astype(int)
print(state_counts_simple)

merged_counts = pd.merge(
    jurisdiction_counts, state_counts_simple, on="State", how="outer"
).fillna(0)

merged_counts["Percentage"] = (
    (merged_counts["JurisdictionCount"] / merged_counts["TotalCount"]) * 100
).fillna(0)

merged_counts = merged_counts[["State", "Percentage"]]
print(merged_counts.head(55))

fig_simple = px.choropleth(
    merged_counts,
    locations="State",
    locationmode="USA-states",
    color="Percentage",
    range_color=(0,100),
    scope="usa",
    color_continuous_scale="Viridis",
    title="Count of D type fillings by US state, state of filling vs company location"
)

fig_simple.show()

Files inside ZIP: ['2014_QTR1.csv', '2014_QTR2.csv', '2014_QTR3.csv', '2014_QTR4.csv', '2015_QTR1.csv', '2015_QTR2.csv', '2015_QTR3.csv', '2015_QTR4.csv', '2016_QTR1.csv', '2016_QTR2.csv', '2016_QTR3.csv', '2016_QTR4.csv', '2017_QTR1.csv', '2017_QTR2.csv', '2017_QTR3.csv', '2017_QTR4.csv', '2018_QTR1.csv', '__MACOSX/._2018_QTR1.csv', '2018_QTR2.csv', '2018_QTR3.csv', '2018_QTR4.csv', '2019_QTR1.csv', '2019_QTR2.csv', '2019_QTR3.csv', '2019_QTR4.csv', '2020_QTR1.csv', '2020_QTR2.csv', '2020_QTR3.csv', '2020_QTR4.csv', '2021_QTR1.csv', '2021_QTR2.csv', '2021_QTR3.csv', '2021_QTR4.csv', '2022_QTR1.csv', '2022_QTR2.csv', '2022_QTR3.csv', '2022_QTR4.csv', '2023_QTR1.csv', '2023_QTR2.csv', '2023_QTR3.csv', '2023_QTR4.csv', '2024_QTR1.csv', '2024_QTR2.csv', '2024_QTR3.csv', '2024_QTR4.csv', '__MACOSX/._2024_QTR4.csv']



Columns (67,154,164,169,180,182,190,192,200,202,206,210,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,442,443,444,445,446,447,448

    State  TotalCount
0      WA        1251
1      CA        1200
2      NY         991
3      TX         769
4      FL         446
..    ...         ...
102    J5           1
103    R1           1
104    J3           1
105    1M           1
106    R5           1

[107 rows x 2 columns]
   State   Percentage
0     1M     0.000000
1     2M     0.000000
2     A0     0.000000
3     A1     0.000000
4     A3     0.000000
5     A5     0.000000
6     A6     0.000000
7     A8     0.000000
8     A9     0.000000
9     AK   114.285714
10    AL    33.333333
11    AR    78.571429
12    AZ    38.383838
13    C0     0.000000
14    C1     0.000000
15    C3     0.000000
16    C4     0.000000
17    C7     0.000000
18    C8     0.000000
19    CA     9.833333
20    CO    34.482759
21    CT     5.785124
22    D0     0.000000
23    D5     0.000000
24    D8     0.000000
25    DC     0.000000
26    DE  2095.000000
27    E9     0.000000
28    F3     0.000000
29    F4     0.000000
30    F8     0.000000
31    FL

In [9]:
import pandas as pd 
import zipfile
import  plotly.express as px
import plotly.graph_objects as go
zip_file_path = "Data.zip"
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall("extracted_data") 


csv_file_path = "extracted_data/2022_QTR3.csv"
df = pd.read_csv(csv_file_path)
total_income_by_state = (
    df.groupby(df["State or Country - Physical Location"])["Total Amount Sold So Far"]
    .sum()
    .reset_index()
)

total_income_by_state.columns = ["State", "TotalIncome" ]

fig_income = px.choropleth(
    total_income_by_state,
    locations="State",
    locationmode="USA-states",
    color="TotalIncome",
    scope="usa",
    color_continuous_scale="Viridis",
    title="Total Income Raised by State",
    labels={"TotalIncome": "Total Income ($)"}
)

fig_income.show()


Columns (123,125,133,135,143,145,149,157,159,161,162,163,171,173,181,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,323,324,325,326,327,328,329,330,331,333,334,335,336,337,338,339,340,341,343,344,345,346,347,348,349,350,351,353,354,355,356,357,358,359,360,361,362,363,364,365,367,368,369,370,371,372,373,374,375,377,378,379,380,381,382,383,384,385,387,388,389,390,391,392,393,394,395,397,398,399,400,401,402,403,404,405,407,408,409,410,411,412,413,414,415,417,418,419,420,421,422,423,424,42

In [14]:
industries = ["Agriculture", "Banking & Financial Services", "Commercial Banking", "Insurance", "Investing", "Investment Banking", "Pooled Investment Fund", "Hedge Fund", "Private Equity Fund", "Venture Capital Fund", "Other Investment Fund", "Other Banking & Financial Services", "Business Services", "Energy", "Coal Mining", "Electric Utilities", "Energy Conservation", "Environmental Services", "Oil & Gas", "Other Energy", "Health Care", "Biotechnology", "Health Insurance", "Hospitals & Physicians", "Pharmaceuticals", "Other Health Care", "Manufacturing", "Real Estate", "Commercial", "Construction", "REITS & Finance", "Residential", "Other Real Estate", "Retailing", "Restaurants", "Technology", "Computers", "Telecommunications", "Other Technology", "Travel", "Airlines & Airports", "Lodging & Conventions", "Tourism & Travel Services", "Other Travel", "Other"]
df["Total Amount Remaining to be sold"] = df["Total Amount Remaining to be sold"].astype(str)
df["Total Amount Offered"] = df["Total Amount Offered"].replace("Indefinite", float('inf')).astype(float)

def versatile_filter(df):
    global filtered_df
    # Create an "all True" Series that we can use as our starting mask
    mask_state = pd.Series(True, index=df.index)
    mask_subed = pd.Series(True, index=df.index)
    mask_amount = pd.Series(True, index=df.index)
    mask_equity = pd.Series(True, index=df.index)
    mask_industry = pd.Series(True, index=df.index)
    
    applied_filters = []

    # Ask the user whether they want to filter the data at all
    while True:
        user_filter = input("Would you like to filter the data? (yes/no): ").strip().lower()
        if user_filter in ["yes", "no"]:
            break
        else:
            print("Invalid input. Please enter 'yes' or 'no'.")
    if user_filter != "yes":
        print("Displaying sample of the full data:")
        filtered_df = df
        return filtered_df.sample(n=min(10, len(df)))[["Central Index Key", "Name of the Entity", "Address", "Address - Specification", "City", "State or Country - Physical Location", "State or Country - Full - Physical Location", "Zip Code / Postal Code", "Issuer Phone Number", "State or Country - Legal Jurisdiction", "Minimum Investment Amount Accepted", "Total Amount Offered", "Total Amount Sold So Far", "Total Amount Remaining to be sold", "Clarifications Regarding the Offering and Sales amounts", "Is the Entity Offering Equity", "Industry"]]

    # List of available filters
    available_filters = ["state", "fully_subscribed", "amount", "equity", "industry"]

    # Loop: let the user update or add filters until they are satisfied
    while True:
        print("\nAvailable filters:")
        print("  state             : Filter by state abbreviation")
        print("  fully_subscribed  : Filter by fully subscribed offerings")
        print("  amount            : Filter by offered funding amount range")
        print("  equity            : Filter by equity offerings")
        print("  industry          : Filter by industry")
        chosen_filter = input("Which filter would you like to apply/update? (state/fully_subscribed/amount/equity/industry): ").strip().lower()
        
        if chosen_filter not in available_filters:
            print("Invalid filter type. Please choose one of the available filters.")
            continue

        # Filter by state
        if chosen_filter == "state":
            action = input("Would you like to apply or delete the state filter? (apply/delete): ").strip().lower()
            while action not in ["apply", "delete"]:
                action = input("Invalid input. Please enter 'apply' or 'delete': ").strip().lower()
            if action == "delete":
                mask_state = pd.Series(True, index=df.index)
                applied_filters = [f for f in applied_filters if not f.startswith("State:")]
                print("State filter deleted.")
            else:
                state_abbreviations = input("Enter one or more state abbreviations separated by commas (e.g., CA,TX,NY): ").upper().split(',')
                # Ensure that the abbreviations are valid (assuming all_states is defined)
                state_abbreviations = [abbr.strip() for abbr in state_abbreviations if abbr.strip() in all_states]
                while not state_abbreviations:
                    state_abbreviations = input("Invalid state abbreviations. Please try again (e.g., CA,TX,NY): ").upper().split(',')
                    state_abbreviations = [abbr.strip() for abbr in state_abbreviations if abbr.strip() in all_states]
                # Update the state mask
                mask_state = df["State or Country - Physical Location"].isin(state_abbreviations)
                applied_filters = [f for f in applied_filters if not f.startswith("State:")]
                applied_filters.append(f"State: {', '.join(state_abbreviations)}")
                print(f"State filter set to: {', '.join(state_abbreviations)}")
        
        # Filter by fully subscribed offerings
        elif chosen_filter == "fully_subscribed":
            action = input("Would you like to apply or delete the fully subscribed filter? (apply/delete): ").strip().lower()
            while action not in ["apply", "delete"]:
                action = input("Invalid input. Please enter 'apply' or 'delete': ").strip().lower()
            if action == "delete":
                mask_subed = pd.Series(True, index=df.index)
                applied_filters = [f for f in applied_filters if not f.startswith("Fully Subscribed:")]
                print("Fully subscribed filter deleted.")
            else:
                subed = input("Are you interested only in fully subscribed offerings? (yes/no): ").strip().lower()
                while subed not in ["yes", "no"]:
                    subed = input("Invalid input. Please enter 'yes' or 'no': ").strip().lower()
                # Note: adjust the condition below based on how your data represents amounts.
                if subed == "yes":
                    mask_subed = (df["Total Amount Remaining to be sold"] == "0")
                    applied_filters = [f for f in applied_filters if not f.startswith("Fully Subscribed:")]
                    applied_filters.append("Fully Subscribed: Yes")
                    print("Filtering for fully subscribed offerings only.")
                else:
                    mask_subed = (df["Total Amount Remaining to be sold"] != "0")
                    applied_filters = [f for f in applied_filters if not f.startswith("Fully Subscribed:")]
                    applied_filters.append("Fully Subscribed: No")
                    print("Filtering for offerings that are not fully subscribed.")
    
        # Filter by funding amount range
        elif chosen_filter == "amount":
            action = input("Would you like to apply or delete the amount filter? (apply/delete): ").strip().lower()
            while action not in ["apply", "delete"]:
                action = input("Invalid input. Please enter 'apply' or 'delete': ").strip().lower()
            if action == "delete":
                mask_amount = pd.Series(True, index=df.index)
                applied_filters = [f for f in applied_filters if not f.startswith("Amount:")]
                print("Amount filter deleted.")
            else:
                while True:
                    try:
                        min_val = int(input("Enter the minimum funding amount in USD: "))
                        break
                    except ValueError:
                        print("Invalid input. Please enter an integer value for the minimum funding amount.")
                while True:
                    max_input = input("Enter the maximum funding amount in USD (or type 'infinite' for no upper limit): ").strip().lower()
                    if max_input == "infinite":
                        max_val = float('inf')
                        break
                    try:
                        max_val = int(max_input)
                        break
                    except ValueError:
                        print("Invalid input. Please enter an integer value for the maximum funding amount or 'infinite'.")
                if min_val > max_val:
                    print("The minimum funding amount cannot be greater than the maximum. Amount filter not applied.")
                else:
                    mask_amount = (df["Total Amount Offered"] >= min_val) & (df["Total Amount Offered"] <= max_val)
                    applied_filters = [f for f in applied_filters if not f.startswith("Amount:")]
                    applied_filters.append(f"Amount: {min_val} to {max_val}")
                    print(f"Filtering for funding amounts between {min_val} and {max_val} USD.")

        # Filter by equity offerings
        elif chosen_filter == "equity":
            action = input("Would you like to apply or delete the equity filter? (apply/delete): ").strip().lower()
            while action not in ["apply", "delete"]:
                action = input("Invalid input. Please enter 'apply' or 'delete': ").strip().lower()
            if action == "delete":
                mask_equity = pd.Series(True, index=df.index)
                applied_filters = [f for f in applied_filters if not f.startswith("Equity:")]
                print("Equity filter deleted.")
            else:
                equity = input("Are you interested only in offerings of equity or offerings excluding equity? (only_equity/excluding_equity): ").strip().lower()
                while equity not in ["only_equity", "excluding_equity"]:
                    equity = input("Invalid input. Please enter 'only_equity' or 'excluding_equity': ").strip().lower()
                if equity == "only_equity":
                    mask_equity = df["Is the Entity Offering Equity"] == True
                    applied_filters = [f for f in applied_filters if not f.startswith("Equity:")]
                    applied_filters.append("Equity: Only Equity")
                    print("Filtering for offerings of equity only.")
                else:
                    mask_equity = df["Is the Entity Offering Equity"] != True
                    applied_filters = [f for f in applied_filters if not f.startswith("Equity:")]
                    applied_filters.append("Equity: Excluding Equity")
                    print("Filtering for offerings excluding equity.")
        
        # Filter by industry
        elif chosen_filter == "industry":
            action = input("Would you like to apply or delete the industry filter? (apply/delete): ").strip().lower()
            while action not in ["apply", "delete"]:
                action = input("Invalid input. Please enter 'apply' or 'delete': ").strip().lower()
            if action == "delete":
                mask_industry = pd.Series(True, index=df.index)
                applied_filters = [f for f in applied_filters if not f.startswith("Industry:")]
                print("Industry filter deleted.")
            else:
                while True:
                    industry_input = input("Enter one or more industries separated by commas (or type 'list' to see all industries) - be case sensitive: ").strip()
                    if industry_input.lower() == "list":
                        print("Available industries:")
                        for industry in industries:
                            print(f"  {industry}")
                        continue
                    industry_list = [ind.strip() for ind in industry_input.split(',') if ind.strip() in industries]
                    if not industry_list:
                        print("Invalid industries. Please try again.")
                    else:
                        break
                mask_industry = df["Industry"].isin(industry_list)
                applied_filters = [f for f in applied_filters if not f.startswith("Industry:")]
                applied_filters.append(f"Industry: {', '.join(industry_list)}")
                print(f"Industry filter set to: {', '.join(industry_list)}")

        # Combine all the masks
        overall_mask = mask_state & mask_subed & mask_amount & mask_equity & mask_industry

        # Show a sample of the filtered data
        filtered_df = df[overall_mask]
        if filtered_df.empty:
            print("\nNo data matches the current filter criteria.")
        else:
            print("\nHere is a sample of the filtered data:")
            print("Applied Filters- " + "; ".join(applied_filters))
            display(filtered_df.sample(n=min(10, len(filtered_df)))[["Central Index Key", "Name of the Entity", "Address", "Address - Specification", "City", "State or Country - Physical Location", "State or Country - Full - Physical Location", "Zip Code / Postal Code", "Issuer Phone Number", "State or Country - Legal Jurisdiction", "Minimum Investment Amount Accepted", "Total Amount Offered", "Total Amount Sold So Far", "Total Amount Remaining to be sold", "Clarifications Regarding the Offering and Sales amounts", "Is the Entity Offering Equity", "Industry"]])
        
        # Ask if the user wants to apply/update another filter
        continue_filter = input("Would you like to apply/update another filter? (yes/no): ").strip().lower()
        while continue_filter not in ["yes", "no"]:
                continue_filter = input("Invalid input. Please enter 'yes' or 'no': ").strip().lower()
        if continue_filter != "yes":
            break

    print("\nDisplaying sample of the final filtered data:")
    return filtered_df.sample(n=min(10, len(filtered_df)))

versatile_filter(df)



Available filters:
  state             : Filter by state abbreviation
  fully_subscribed  : Filter by fully subscribed offerings
  amount            : Filter by offered funding amount range
  equity            : Filter by equity offerings
  industry          : Filter by industry
Invalid filter type. Please choose one of the available filters.

Available filters:
  state             : Filter by state abbreviation
  fully_subscribed  : Filter by fully subscribed offerings
  amount            : Filter by offered funding amount range
  equity            : Filter by equity offerings
  industry          : Filter by industry
State filter set to: CA, TX

Here is a sample of the filtered data:
Applied Filters- State: CA, TX


Unnamed: 0,Central Index Key,Name of the Entity,Address,Address - Specification,City,State or Country - Physical Location,State or Country - Full - Physical Location,Zip Code / Postal Code,Issuer Phone Number,State or Country - Legal Jurisdiction,Minimum Investment Amount Accepted,Total Amount Offered,Total Amount Sold So Far,Total Amount Remaining to be sold,Clarifications Regarding the Offering and Sales amounts,Is the Entity Offering Equity,Industry
8017,1933512,"Sonder SPV N, LP",611 EL CAMINO REAL #202,,SAN CARLOS,CA,CALIFORNIA,94070,(203) 981-6526,DELAWARE,0,4941802.0,4941802,0,,,Pooled Investment Fund
8221,1941188,"Sunbelt Portfolio Investors, LLC",8901 GAYLORD DRIVE,SUITE 100,HOUSTON,TX,TEXAS,77024,832-904-3142,TEXAS,25000,34574647.0,26754637,7820010,,True,Residential
2172,1946623,"Clusiv, Inc.",8911 N CAPITAL OF TEXAS HIGHWAY,SUITE 4200-018,AUSTIN,TX,TEXAS,78759,737-276-1779,DELAWARE,0,3070440.0,3070440,0,,True,Other Technology
420,1943242,"ARC MULTIFAMILY FUND II, LLC",4041 MACARTHUR BLVD,SUITE 400,NEWPORT BEACH,CA,CALIFORNIA,92660,949-439-3539,WYOMING,100000,100000000.0,2500000,97500000,,True,Residential
867,1934572,"Ariva Apts, LLC","C/O ARIVA APTS MGR, LLC",3044 OLD DENTON RD STE 111-222,CARROLLTON,TX,TEXAS,75007,949-296-7502,TEXAS,50000,7910000.0,0,7910000,,True,Other
6845,1938903,"Premier Fund I, a Series of Decile Capital, LP",1308 HARKER AVE,,PALO ALTO,CA,CALIFORNIA,94301,6503914649,DELAWARE,50000,5000000.0,2650000,2350000,,,Pooled Investment Fund
7741,1948772,SYN 17 LP,"5900 BALCONES DRIVE, SUITE 100",,AUSTIN,TX,TEXAS,78731,949-296-7502,TEXAS,100000,1650000.0,1650000,0,,True,Oil and Gas
8058,1939652,"Spatial Laser, Inc.","5830 GRANITE PKWY, STE 100-272",,PLANO,TX,TEXAS,75024,(469) 298-9704,DELAWARE,0,1568031.0,1568031,0,,True,Other Technology
4272,1779797,"Hover Energy, LLC",3811 TURTLE CREEK BOULEVARD,SUITE 560,DALLAS,TX,TEXAS,75219,214-470-3588,DELAWARE,25000,1500000.0,945000,555000,,,Other Energy
3376,1947117,Five Point Natural Gas Yield Fund I LP,825 TOWN AND COUNTRY LN ST 700,,HOUSTON,TX,TEXAS,77024,7133510704,DELAWARE,0,inf,0,Indefinite,,,Pooled Investment Fund



Available filters:
  state             : Filter by state abbreviation
  fully_subscribed  : Filter by fully subscribed offerings
  amount            : Filter by offered funding amount range
  equity            : Filter by equity offerings
  industry          : Filter by industry
Filtering for fully subscribed offerings only.

Here is a sample of the filtered data:
Applied Filters- State: CA, TX; Fully Subscribed: Yes


Unnamed: 0,Central Index Key,Name of the Entity,Address,Address - Specification,City,State or Country - Physical Location,State or Country - Full - Physical Location,Zip Code / Postal Code,Issuer Phone Number,State or Country - Legal Jurisdiction,Minimum Investment Amount Accepted,Total Amount Offered,Total Amount Sold So Far,Total Amount Remaining to be sold,Clarifications Regarding the Offering and Sales amounts,Is the Entity Offering Equity,Industry
9443,1935170,datacy Inc.,1043 Garland Ave,Unit C #914,SAN JOSE,CA,CALIFORNIA,95126,408-647-4412,DELAWARE,0,8294823.0,8294823,0,,True,Other Technology
2686,1753844,"Dubfrequency, Inc.",3343 LA CIENEGA PLACE,,LOS ANGELES,CA,CALIFORNIA,90016,219-228-1955,CALIFORNIA,7,20921961.0,20921961,0,,True,Other Technology
7004,1940922,Quotebeam Inc.,5264 ROMFORD DR.,,SAN JOSE,CA,CALIFORNIA,95124,747-302-4552,DELAWARE,0,3570000.0,3570000,0,"Total includes $450,000 in cancellation of ind...",True,Other Technology
3895,1882701,"Gradual,Inc.",74 WESTLINE DR.,,DALY CITY,CA,CALIFORNIA,94015,2066600628,DELAWARE,0,445000.0,445000,0,,,Computers
7508,1934768,"SEP Pearland SC, LP",3737 BUFFALO SPEEDWAY,SUITE 1850,HOUSTON,TX,TEXAS,77098,7132213751,TEXAS,0,2930000.0,2930000,0,,,Pooled Investment Fund
7960,1944305,Sloane Street Partners LLC,1405 El Camino Real #845,,Redwood City,CA,CALIFORNIA,94063,424-216-0194,DELAWARE,11550,672000.0,672000,0,,,Pooled Investment Fund
8810,1941609,"USRC Quebec Street, LLC",5851 LEGACY CIR,SUITE 900,PLANO,TX,TEXAS,75024,2147362700,DELAWARE,0,4728921.0,4728921,0,The Members may be required to make capital co...,True,Other Health Care
6492,1942947,PHCC LLC,1717 MAIN STREET,SUITE 3900,DALLAS,TX,TEXAS,75201,214-389-0832,DELAWARE,100000,143219142.0,143219142,0,,,Other Banking and Financial Services
4444,1947582,"Imaige, Inc.",6195 TAMILYNN STREET,,SAN DIEGO,CA,CALIFORNIA,92122,6197261725,DELAWARE,100000,1057000.0,1057000,0,,True,Other Technology
6432,1938073,PCIP LT Granite Mountain Series,15725 N DALLAS PARKWAY,SUITE 230,ADDISON,TX,TEXAS,75001,9728667577,DELAWARE,7500,15484891.0,15484891,0,,True,Commercial



Available filters:
  state             : Filter by state abbreviation
  fully_subscribed  : Filter by fully subscribed offerings
  amount            : Filter by offered funding amount range
  equity            : Filter by equity offerings
  industry          : Filter by industry
Filtering for funding amounts between 100000 and 1000000000 USD.

Here is a sample of the filtered data:
Applied Filters- State: CA, TX; Fully Subscribed: Yes; Amount: 100000 to 1000000000


Unnamed: 0,Central Index Key,Name of the Entity,Address,Address - Specification,City,State or Country - Physical Location,State or Country - Full - Physical Location,Zip Code / Postal Code,Issuer Phone Number,State or Country - Legal Jurisdiction,Minimum Investment Amount Accepted,Total Amount Offered,Total Amount Sold So Far,Total Amount Remaining to be sold,Clarifications Regarding the Offering and Sales amounts,Is the Entity Offering Equity,Industry
2937,1938636,"Enclave Firewheel III Preferred Investors, LLC","3110 W. SOUTHLAKE BLVD., SUITE 120",,SOUTHLAKE,TX,TEXAS,76092,(817) 837-0564,TEXAS,100000,1000000.0,1000000,0,,True,Other Real Estate
9064,1823583,Vizion Inc.,3445 Greer Road,,Palo Alto,CA,CALIFORNIA,94303,(510) 356-7811,DELAWARE,0,12912459.0,12912459,0,,True,Other Technology
8233,1941740,"Sunstone Diassess I, LLC",18881 VON KARMAN AVE.,SUITE 1000,IRVINE,CA,CALIFORNIA,92612,949-771-1764,DELAWARE,1,505050.0,505050,0,,,Pooled Investment Fund
6851,1743493,"Prevailion, Inc.",9950 Woodlock Forest Drive,Suite 1325,The Woodlands,TX,TEXAS,77380,8779109274,DELAWARE,117456,1802507.0,1802507,0,,,Other
7175,1865387,RX REDEFINED INC.,"116 W Branch Street, Suite A",,Arroyo Grande,CA,CALIFORNIA,93420,510-213-2261,DELAWARE,0,7999996.0,7999996,0,,True,Other Health Care
6154,1941044,"OP Green Tree, LLC","OP Group Manager, LLC","12277 Soaring Way, Suite 205",Truckee,CA,CALIFORNIA,96161,530.448.8364,COLORADO,50000,2350000.0,2350000,0,,True,Commercial
3943,1922260,Greentrail Private Opportunities III LLC,1420 THE STRAND,,MANHATTAN BEACH,CA,CALIFORNIA,90266,9173738327,DELAWARE,0,400000.0,400000,0,,,Pooled Investment Fund
9425,1653502,"Zesty. AI, Inc.",548 MARKET STREET,SUITE 75392,SAN FRANCISCO,CA,CALIFORNIA,94104,925-523-3747,DELAWARE,1,19999972.0,19999972,0,,True,Other Technology
7117,1946349,"RLV22, LLC",6115 OWENS ST,STE 201,DALLAS,TX,TEXAS,75235,214-264-5033,TEXAS,100000,4500000.0,4500000,0,,True,Other Real Estate
5950,1846636,"Neptune Topco Holdings, LLC","C/O NOVARIA HOLDINGS, LLC",6625 IRON HORSE BLVD,NORTH RICHLAND HILLS,TX,TEXAS,76180,817-381-3810,DELAWARE,0,241671.0,241671,0,,True,Other



Available filters:
  state             : Filter by state abbreviation
  fully_subscribed  : Filter by fully subscribed offerings
  amount            : Filter by offered funding amount range
  equity            : Filter by equity offerings
  industry          : Filter by industry
Filtering for offerings of equity only.

Here is a sample of the filtered data:
Applied Filters- State: CA, TX; Fully Subscribed: Yes; Amount: 100000 to 1000000000; Equity: Only Equity


Unnamed: 0,Central Index Key,Name of the Entity,Address,Address - Specification,City,State or Country - Physical Location,State or Country - Full - Physical Location,Zip Code / Postal Code,Issuer Phone Number,State or Country - Legal Jurisdiction,Minimum Investment Amount Accepted,Total Amount Offered,Total Amount Sold So Far,Total Amount Remaining to be sold,Clarifications Regarding the Offering and Sales amounts,Is the Entity Offering Equity,Industry
6506,1943167,PIMPS UP ENERGY DRINK,30025 COTTAGE LANE,,LAKE ELSINORE,CA,CALIFORNIA,92530,310-779-5464,CALIFORNIA,7500000,7500000.0,7500000,0,,True,Other Technology
7912,1933802,Sightcast Investment Series LLC - Rivalry Tech...,909 WIRT ROAD,,HOUSTON,TX,TEXAS,77024,979-676-2645,DELAWARE,5000,1395000.0,1395000,0,,True,Other
4740,1649481,"KITCHENS, INC.",5214F Diamond Hts Blvd #353,,San Francisco,CA,CALIFORNIA,94131,833-375-2253,DELAWARE,150000,150000.0,150000,0,,True,Other Technology
2590,1939650,"Devonshire Home Services Holdings, LLC",11755 WILSHIRE BLVD.,SUITE 1250,LOS ANGELES,CA,CALIFORNIA,90025,917-545-8591,DELAWARE,50000,450000.0,450000,0,,True,Other
6007,1540822,"Next One's On Me, Inc.",1321 Upland Drive,PMB 12331,Houston,TX,TEXAS,77043-4718,(512) 217-7070,DELAWARE,0,4080932.0,4080932,0,Total offering amount and amount sold consists...,True,Other Technology
2885,1947137,"El Camino Encinitas, LLC",750 B STREET,SUITE 3020,SAN DIEGO,CA,CALIFORNIA,92101,619-814-0565,DELAWARE,0,19191955.0,19191955,0,,True,Residential
5950,1846636,"Neptune Topco Holdings, LLC","C/O NOVARIA HOLDINGS, LLC",6625 IRON HORSE BLVD,NORTH RICHLAND HILLS,TX,TEXAS,76180,817-381-3810,DELAWARE,0,241671.0,241671,0,,True,Other
8633,1942726,Tollway Preston Crossing LP,826 MANGO CT,,COPPELL,TX,TEXAS,75019,9372194987,TEXAS,50000,25000000.0,25000000,0,,True,Other
5504,1937133,"MW - Landings 1, LLC","11100 SANTA MONICA BOULEVARD, SUITE 240",,LOS ANGELES,CA,CALIFORNIA,90025,310-773-8694,DELAWARE,20000,5305000.0,5305000,0,,True,Commercial
3640,1939604,GH Hoffman Bastrop LLC,3005 STRATFORD DRIVE,,AUSTIN,TX,TEXAS,78746,5128256140,TEXAS,40000,1200000.0,1200000,0,,True,Residential



Available filters:
  state             : Filter by state abbreviation
  fully_subscribed  : Filter by fully subscribed offerings
  amount            : Filter by offered funding amount range
  equity            : Filter by equity offerings
  industry          : Filter by industry
Industry filter set to: Other

Here is a sample of the filtered data:
Applied Filters- State: CA, TX; Fully Subscribed: Yes; Amount: 100000 to 1000000000; Equity: Only Equity; Industry: Other


Unnamed: 0,Central Index Key,Name of the Entity,Address,Address - Specification,City,State or Country - Physical Location,State or Country - Full - Physical Location,Zip Code / Postal Code,Issuer Phone Number,State or Country - Legal Jurisdiction,Minimum Investment Amount Accepted,Total Amount Offered,Total Amount Sold So Far,Total Amount Remaining to be sold,Clarifications Regarding the Offering and Sales amounts,Is the Entity Offering Equity,Industry
2792,1943254,"EP Streamline Capital Partners, LP",140 EAST BANDERA ROAD,,BOERNE,TX,TEXAS,78006,2106670942,DELAWARE,0,7750000.0,7750000,0,,True,Other
4663,1762649,Jennifer McKay Design Inc.,2233 BARRY AVENUE,,LOS ANGELES,CA,CALIFORNIA,90064,310-968-8052,CALIFORNIA,0,270000.0,270000,0,,True,Other
8091,1787871,"Spotter, Inc.",8950 W Olympic Boulevard,Suite 446,Beverly Hills,CA,CALIFORNIA,90211,(415) 434-9100,DELAWARE,0,9340800.0,9340800,0,,True,Other
6358,1947241,Outpost Technologies Corp,1601 COLORADO AVENUE,,SANTA MONICA,CA,CALIFORNIA,90404,650-735-2045,DELAWARE,0,7134709.0,7134709,0,,True,Other
5086,1679371,"Laxmi Therapeutic Devices, Inc.",163 AERO CAMINO,SUITE A,GOLETA,CA,CALIFORNIA,93117,415-773-4151,DELAWARE,0,8999998.0,8999998,0,,True,Other
652,1946803,All Hands Spirit Works Inc.,4201 MAIN STREET,SUITE 200,HOUSTON,TX,TEXAS,77002-4411,281-844-7103,DELAWARE,24999,2700000.0,2700000,0,,True,Other
4469,1572565,Indoor Harvest Corp,7401 W. SLAUGHTER LANE #5078,,AUSTIN,TX,TEXAS,78739,512-309-1776,TEXAS,25000,1000000.0,1000000,0,,True,Other
7288,1938253,"Regional Market Makers, Inc.","325 CHANNING AVE,",STE. 309,PALO ALTO,CA,CALIFORNIA,94301,52(55) 5000.4206,DELAWARE,0,3000000.0,3000000,0,,True,Other
5193,1934018,LiveWire Investments Series LLC - Marble IV Se...,3224 AMHERST STREET,,HOUSTON,TX,TEXAS,77005,713-276-7392,DELAWARE,5000,3000000.0,3000000,0,,True,Other
5932,1942743,NearU Investors HoldCo LLC,"C/O FREEMAN SPOGLI MANAGEMENT CO., L.P.","11100 SANTA MONICA BLVD., SUITE 1900",LOS ANGELES,CA,CALIFORNIA,90024,310-444-1822,DELAWARE,0,474750000.0,474750000,0,,True,Other



Displaying sample of the final filtered data:


Unnamed: 0,Central Index Key,Name of the Entity,Address,Address - Specification,City,State or Country - Physical Location,State or Country - Full - Physical Location,Zip Code / Postal Code,Issuer Phone Number,State or Country - Legal Jurisdiction,...,Associated Address - Person 63,Associated Address - Specification - Person 63,Associated City - Person 63,Associated State or Country - Person 63,Associated State or Country - Full - Person 63,Associated Zip Code - Person 63,Relationship 1 with the Entity - Person 63,Clarification of the relationship - Person 63,Relationship 2 with the Entity - Person 18,Relationship 2 with the Entity - Person 19
1853,1860608,"CROWDHEALTH, INC.",307 BRIARWOOD TRAIL,,AUSTIN,TX,TEXAS,78746,(312) 450.4850,DELAWARE,...,,,,,,,,,,
1223,1938515,"BRAVE TOPCO, INC.",C/O KIRKLAND & ELLIS LLP,"555 CALIFORNIA STREET, SUITE 3000",SAN FRANCISCO,CA,CALIFORNIA,94104,(415) 439-1400,DELAWARE,...,,,,,,,,,,
5550,1940390,"Maldives Holdings, LLC",7915 SEPULVIDA BLVD,,VAN NUYS,CA,CALIFORNIA,91405,4127627536,DELAWARE,...,,,,,,,,,,
8153,1942298,"Stewardship Partners, LLC","603 MUNGER AVE., SUITE 100-260",,DALLAS,TX,TEXAS,75202,636-489-9149,DELAWARE,...,,,,,,,,,,
8218,1890428,"Sun TopCo, LP","11111 Santa Monica Blvd., Ste. 2000",,Los Angeles,CA,CALIFORNIA,90025,(310) 354-0404,DELAWARE,...,,,,,,,,,,
5226,1947621,Longship Holding Co LLC,750 NORTH ST. PAUL STREET,SUITE 1200,DALLAS,TX,TEXAS,75201,850-443-8442,DELAWARE,...,,,,,,,,,,
5193,1934018,LiveWire Investments Series LLC - Marble IV Se...,3224 AMHERST STREET,,HOUSTON,TX,TEXAS,77005,713-276-7392,DELAWARE,...,,,,,,,,,,
3406,1939643,"Florida Home Services Holdings, LLC",11755 WILSHIRE BLVD.,SUITE 1250,LOS ANGELES,CA,CALIFORNIA,90025,971-545-8591,DELAWARE,...,,,,,,,,,,
2057,1941856,Celina Station LP,826 MANGO CT,,COPPELL,TX,TEXAS,75019,9372194987,TEXAS,...,,,,,,,,,,
9501,1942016,wasted P.B.C.,12030 DONNER PASS ROAD,SUITE 1-366,TRUCKEE,CA,CALIFORNIA,96161,5303864044,DELAWARE,...,,,,,,,,,,


In [15]:
filtered_df.columns = [col.lower() for col in filtered_df.columns]
#adding column customization
def modify_columns(df):
    default_columns = [
        "central index key", "name of the entity", "address", "address - specification", "city",
        "state or country - physical location", "state or country - full - physical location", "zip code / postal code",
        "issuer phone number", "state or country - legal jurisdiction", "minimum investment amount accepted",
        "total amount offered", "total amount sold so far", "total amount remaining to be sold",
        "clarifications regarding the offering and sales amounts", "is the entity offering equity", "industry"
    ]
    all_columns = [
        "central index key", "name of the entity", "address", "address - specification", "city",
        "state or country - physical location", "state or country - full - physical location", "zip code / postal code",
        "issuer phone number", "state or country - legal jurisdiction", "previous names used by the issuer",
        "previous names from edgar", "type of the entity", "type of the entity - specification", "industry",
        "revenue range of the entity", "federal exemption or exclusions claimed by the entity",
        "is the entry an amendment to another filing?", "date of the first sale under this filing",
        "is the offering duration intended to be more than a year?", "is the entity offering equity",
        "is the entity creating or managing a pooled fund", "is the offering part of a business combination transaction?",
        "is the offering part of a business combination transaction? - clarification",
        "minimum investment amount accepted", "total amount offered", "total amount sold so far",
        "total amount remaining to be sold", "clarifications regarding the offering and sales amounts",
        "are non-accredited investors participating in the offering?",
        "number of investors who already invested under current offering", "commision amount paid for sales",
        "finder's fees amount paid", "clarification regarding the finder's fees amount paid",
        "amount of the raised capital already used for a purpose",
        "clarification regarding the amount of the raised capital already used for a purpose",
        "name of the entity representative signing the filing", "name of the issuer entity",
        "name of the person signing", "name of the person signing - affirmation", "title of the person signing",
        "date the form was signed"
    ]
    # Ensure default_columns exist in the dataframe
    default_columns = [col for col in default_columns if col in df.columns]

    # Show the dataframe with the columns specified in default_columns to the user
    print("Current dataframe with default columns:")
    display(df[default_columns].head())

    # Ask the user if they would like to add/remove some of the columns
    while True:
        user_input = input("Would you like to add/remove some of the columns? (yes/no): ").strip().lower()
        if user_input in ["yes", "no"]:
            break
        else:
            print("Invalid input. Please enter 'yes' or 'no'.")

    if user_input == "yes":
        while True:
            action = input("Would you like to add or remove columns? (add/remove): ").strip().lower()
            if action in ["add", "remove"]:
                break
            else:
                print("Invalid input. Please enter 'add' or 'remove'.")
        
        while True:
            if action == "add":
                while True:
                    columns_to_add = input("What columns should be added? Type 'help' to display options, 'all' to add all columns, or 'back' to go back: ").strip().lower()
                    if columns_to_add == "help":
                        print("Available columns to add:")
                        for col in all_columns:
                            if col not in default_columns:
                                print(f"  {col}")
                        continue
                    elif columns_to_add == "all":
                        default_columns = [col for col in all_columns if col in df.columns]
                        print("All columns added.")
                        print("Updated dataframe preview:")
                        display(df[default_columns].head())
                        break
                    elif columns_to_add == "back":
                        break
                    columns_to_add = [col.strip() for col in columns_to_add.split(',') if col.strip() in all_columns and col.strip() not in default_columns]
                    if not columns_to_add:
                        print("Invalid columns. Please try again.")
                    else:
                        default_columns.extend(columns_to_add)
                        print(f"Columns added: {', '.join(columns_to_add)}")
                        print("Updated dataframe preview:")
                        display(df[default_columns].head())
                        break
            
            elif action == "remove":
                while True:
                    columns_to_remove = input("What columns should be removed? Type 'help' to display options or 'back' to go back: ").strip().lower()
                    if columns_to_remove == "help":
                        print("Available columns to remove:")
                        for col in default_columns:
                            print(f"  {col}")
                        continue
                    elif columns_to_remove == "back":
                        break
                    columns_to_remove = [col.strip() for col in columns_to_remove.split(',') if col.strip() in default_columns]
                    if not columns_to_remove:
                        print("Invalid columns. Please try again.")
                    else:
                        default_columns = [col for col in default_columns if col not in columns_to_remove]
                        print(f"Columns removed: {', '.join(columns_to_remove)}")
                        print("Updated dataframe preview:")
                        display(df[default_columns].head())
                        break

            while True:
                more_changes = input("Would you like to add/remove any other columns? (yes/no): ").strip().lower()
                if more_changes in ["yes", "no"]:
                    break
                else:
                    print("Invalid input. Please enter 'yes' or 'no'.")
            if more_changes == "no":
                break
            else:
                action = input("Would you like to add or remove columns? (add/remove): ").strip().lower()

    columns_df = df[default_columns]
    return columns_df

new_df = modify_columns(filtered_df)

Current dataframe with default columns:


Unnamed: 0,central index key,name of the entity,address,address - specification,city,state or country - physical location,state or country - full - physical location,zip code / postal code,issuer phone number,state or country - legal jurisdiction,minimum investment amount accepted,total amount offered,total amount sold so far,total amount remaining to be sold,clarifications regarding the offering and sales amounts,is the entity offering equity,industry
481,1939833,AUTOREX INC,800 ROOSEVELT AVE,STE 200,IRVINE,CA,CALIFORNIA,92620,7145058377,CALIFORNIA,1,3500000.0,3500000,0,,True,Other
652,1946803,All Hands Spirit Works Inc.,4201 MAIN STREET,SUITE 200,HOUSTON,TX,TEXAS,77002-4411,281-844-7103,DELAWARE,24999,2700000.0,2700000,0,,True,Other
1223,1938515,"BRAVE TOPCO, INC.",C/O KIRKLAND & ELLIS LLP,"555 CALIFORNIA STREET, SUITE 3000",SAN FRANCISCO,CA,CALIFORNIA,94104,(415) 439-1400,DELAWARE,0,690634552.0,690634552,0,,True,Other
1481,1825807,Boldt Runners Corp,4665 WEST END ROAD,,ARCATA,CA,CALIFORNIA,95521,707-825-1213,DELAWARE,0,150001.0,150001,0,,True,Other
1819,1872891,"CPC Rentalco, LLC",750 NORTH ST. PAUL STREET,SUITE 1200,DALLAS,TX,TEXAS,75201,214.763.8356,DELAWARE,0,1500000.0,1500000,0,,True,Other


All columns added.
Updated dataframe preview:


Unnamed: 0,central index key,name of the entity,address,address - specification,city,state or country - physical location,state or country - full - physical location,zip code / postal code,issuer phone number,state or country - legal jurisdiction,...,finder's fees amount paid,clarification regarding the finder's fees amount paid,amount of the raised capital already used for a purpose,clarification regarding the amount of the raised capital already used for a purpose,name of the entity representative signing the filing,name of the issuer entity,name of the person signing,name of the person signing - affirmation,title of the person signing,date the form was signed
481,1939833,AUTOREX INC,800 ROOSEVELT AVE,STE 200,IRVINE,CA,CALIFORNIA,92620,7145058377,CALIFORNIA,...,0,,0,,False,AUTOREX INC,SUNG KEUN YANG,"YANG, SUNG KEUN",CEO,2022-07-26
652,1946803,All Hands Spirit Works Inc.,4201 MAIN STREET,SUITE 200,HOUSTON,TX,TEXAS,77002-4411,281-844-7103,DELAWARE,...,0,,0,This amount estimated for Item 16 does not inc...,False,All Hands Spirit Works Inc.,Joshua Sanders,Joshua Sanders,Chief Executive Officer,2022-09-15
1223,1938515,"BRAVE TOPCO, INC.",C/O KIRKLAND & ELLIS LLP,"555 CALIFORNIA STREET, SUITE 3000",SAN FRANCISCO,CA,CALIFORNIA,94104,(415) 439-1400,DELAWARE,...,10161347,,0,,False,"BRAVE TOPCO, INC.",/S/ BRIAN LORING,BRIAN LORING,VICE PRESIDENT,2022-07-11
1481,1825807,Boldt Runners Corp,4665 WEST END ROAD,,ARCATA,CA,CALIFORNIA,95521,707-825-1213,DELAWARE,...,0,,0,,False,Boldt Runners Corp,Peter Diatelevi,Peter Diatelevi,Chief Executive Officer,2022-07-15
1819,1872891,"CPC Rentalco, LLC",750 NORTH ST. PAUL STREET,SUITE 1200,DALLAS,TX,TEXAS,75201,214.763.8356,DELAWARE,...,0,,0,,False,"CPC Rentalco, LLC",Brian F. Hegi,Brian F. Hegi,Authorized Person,2022-08-26


All columns added.
Updated dataframe preview:


Unnamed: 0,central index key,name of the entity,address,address - specification,city,state or country - physical location,state or country - full - physical location,zip code / postal code,issuer phone number,state or country - legal jurisdiction,...,finder's fees amount paid,clarification regarding the finder's fees amount paid,amount of the raised capital already used for a purpose,clarification regarding the amount of the raised capital already used for a purpose,name of the entity representative signing the filing,name of the issuer entity,name of the person signing,name of the person signing - affirmation,title of the person signing,date the form was signed
481,1939833,AUTOREX INC,800 ROOSEVELT AVE,STE 200,IRVINE,CA,CALIFORNIA,92620,7145058377,CALIFORNIA,...,0,,0,,False,AUTOREX INC,SUNG KEUN YANG,"YANG, SUNG KEUN",CEO,2022-07-26
652,1946803,All Hands Spirit Works Inc.,4201 MAIN STREET,SUITE 200,HOUSTON,TX,TEXAS,77002-4411,281-844-7103,DELAWARE,...,0,,0,This amount estimated for Item 16 does not inc...,False,All Hands Spirit Works Inc.,Joshua Sanders,Joshua Sanders,Chief Executive Officer,2022-09-15
1223,1938515,"BRAVE TOPCO, INC.",C/O KIRKLAND & ELLIS LLP,"555 CALIFORNIA STREET, SUITE 3000",SAN FRANCISCO,CA,CALIFORNIA,94104,(415) 439-1400,DELAWARE,...,10161347,,0,,False,"BRAVE TOPCO, INC.",/S/ BRIAN LORING,BRIAN LORING,VICE PRESIDENT,2022-07-11
1481,1825807,Boldt Runners Corp,4665 WEST END ROAD,,ARCATA,CA,CALIFORNIA,95521,707-825-1213,DELAWARE,...,0,,0,,False,Boldt Runners Corp,Peter Diatelevi,Peter Diatelevi,Chief Executive Officer,2022-07-15
1819,1872891,"CPC Rentalco, LLC",750 NORTH ST. PAUL STREET,SUITE 1200,DALLAS,TX,TEXAS,75201,214.763.8356,DELAWARE,...,0,,0,,False,"CPC Rentalco, LLC",Brian F. Hegi,Brian F. Hegi,Authorized Person,2022-08-26


Available columns to add:


In [1]:
# Parsing the XML file to extract the relevant information
from sec_edgar_downloader import Downloader
#needs to comply with sec privacy :(( )
dl = Downloader("test", "test@test.com")
import pandas as pd 
import time
import requests

#sec offers an api to get the daily or quartely indexes, metadata for all fillings
#there is no endpoint to specifically search by form type
#the actual data can only be downloaded with an accesion number and cik
#using cik fetched from daily index, fetch all fillings by company and select those with type D 
#download the actual file using url from the data line(the current data url returns the entire txt, we only need the actual xml file for the filing ) 


#used only for gathering data 
more_data = input("Are you interested in more data? (Y/N): ").strip().upper()

if more_data != 'Y':
    print("Exiting script.")
    import sys
    sys.exit()

year = int(input("Enter the year in which you are interested (between 2000 and 2013): "))

if year < 2000 or year > 2013:
    print("Invalid year. Exiting script.")
    import sys
    sys.exit()
quarter = int(input("Enter the quarter in which you are interested (e.g. 4). "))
month = int(input("Enter the month of interest in numerical format (e.g. 11)."))
day = int(input("Enter the day of interest in numerical format (e.g. 27)."))
date = f"{year}{month}{day}"
base_url = 'https://www.sec.gov/Archives/edgar/daily-index'
index_url = f'{base_url}/{year}/QTR{quarter}/company.{date}.idx'

base_full_index_url ='https://www.sec.gov/Archives/edgar/full-index'
#full_index_url =  f'{base_full_index_url}/{year}/QTR{quarter}/company.idx'

headers = {
    'User-Agent': 'Test (test@test.com)',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'www.sec.gov',
    'Connection': 'keep-alive',
}
response = requests.get(index_url, headers=headers)
#response = requests.get(full_index_url, headers=headers)

if response.status_code == 403:
    print("Access denied. Ensure you have a proper User-Agent header.")
    exit()

response.raise_for_status() 


content = response.text

print(content)
lines = content.splitlines()


form_d_filings = []



header = ["Company Name", "Form Type", "CIK", "Date Filed", "File Name"]
#data lines skip 3 for daily index, 7 for full index 
#daily index returns idx file with all the fillings for the day, fixed width format, parse it into pandas 
records = [] 
#daily index formatting 

data_lines= lines[3:]
for line in data_lines:
    if line.strip():  
        company_name = line[:60].strip()
        form_type = line[60:71].strip()
        cik = line[71:82].strip()
        date_filed = line[82:92].strip()
        file_name = line[92:].strip()
        records.append([company_name, form_type, cik, date_filed, file_name])


#quarter index formatting 
'''
data_lines=lines[8:]
for line in data_lines:
    if line.strip():  # Ignore empty lines
        company_name = line[0:59].strip()
        form_type = line[59:72].strip()
        cik = line[72:87].strip()
        date_filed = line[87:102].strip()
        file_name = line[102:].strip()
        
        records.append([company_name, form_type, cik, date_filed, file_name])
        '''
print(records[1])
df = pd.DataFrame(records, columns=header)
print(response.text)
print(len(records))

form_d_df = df[df["Form Type"] == "D"] 
row_dict = form_d_df.iloc[1].to_dict()  
print("Row at index 1 as a dictionary:")
for key, value in row_dict.items():
    print(f"{key}: {value}")
#actual data can be downloaded with CIK and an accession number for the filling
form_d_df['Acession_number'] = form_d_df['File Name'].str.split('/').str[-1].str.replace('-', '').str.replace('.txt', '', regex=False)


base_url = "https://www.sec.gov/Archives/edgar/data/"
headers = {
    'User-Agent': 'Test (test@test.com)',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'www.sec.gov',
    'Connection': 'keep-alive',
}

xml_data_list = []
for index, row in form_d_df.iterrows():
    print("working:" + row['CIK'])
    cik = row['CIK']
    accession_number = row['Acession_number']
    url = f"{base_url}{cik}/{accession_number}/primary_doc.xml"
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            xml_data_list.append(response.text)
        else:
            xml_data_list.append(f"Failed for {url} with status code {response.status_code}")
    except Exception as e:
        xml_data_list.append(f"Error for {url}: {str(e)}")
print(len(form_d_df))


import xml.etree.ElementTree as ET
import plotly.express as px
import plotly.graph_objects as go
# Definine parsing function
def xml_parse(root):
    def extract_text(element, path):
        found = element.find(path)
        return found.text if found is not None else None

    # Parse primary issuer data
    primary_issuer = root.find(".//primaryIssuer")
    if primary_issuer is None:
        return None
    data_primary_issuer = {
        "Central Index Key": extract_text(primary_issuer, ".//cik"),
        "Name of the Entity": extract_text(primary_issuer, ".//entityName"),
        "Address": extract_text(primary_issuer, ".//street1"),
        "Address - Specification": extract_text(primary_issuer, ".//street2"),
        "City": extract_text(primary_issuer, ".//city"),
        "State or Country - Physical Location": extract_text(primary_issuer, ".//stateOrCountry"),
        "State or Country - Full - Physical Location": extract_text(primary_issuer, ".//stateOrCountryDescription"),
        "Zip Code / Postal Code": extract_text(primary_issuer, ".//zipCode"),
        "Issuer Phone Number": extract_text(primary_issuer, ".//issuerPhoneNumber"),
        "State or Country - Legal Jurisdiction": extract_text(primary_issuer, ".//jurisdictionOfInc"),
        "Previous Names Used by the Issuer": extract_text(primary_issuer, ".//issuerPreviousNameList/value"),
        "Previous Names from EDGAR": extract_text(primary_issuer, ".//edgarPreviousNameList/value"),
        "Type of the Entity": extract_text(primary_issuer, ".//entityType"),
        "Type of the Entity - Specification": extract_text(primary_issuer, ".//entityTypeOtherDesc"),
    }

    # Parse related persons data
    related_persons = root.findall(".//relatedPersonsList/relatedPersonInfo")
    data_related_persons = {}
    for i, related_person in enumerate(related_persons, start=1):
        data_related_persons[f"First Name - Person {i}"] = extract_text(related_person, ".//relatedPersonName/firstName"),
        data_related_persons[f"Last Name - Person {i}"] = extract_text(related_person, ".//relatedPersonName/lastName"),
        data_related_persons[f"Associated Address - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/street1"),
        data_related_persons[f"Associated Address - Specification - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/street2"),
        data_related_persons[f"Associated City - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/city"),
        data_related_persons[f"Associated State or Country - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/stateOrCountry"),
        data_related_persons[f"Associated State or Country - Full - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/stateOrCountryDescription"),
        data_related_persons[f"Associated Zip Code - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/zipCode"),

        relationships = related_person.findall(".//relatedPersonRelationshipList/relationship")
        for j, relationship in enumerate(relationships, start=1):
            data_related_persons[f"Relationship {j} with the Entity - Person {i}"] = relationship.text if relationship is not None else None

        data_related_persons[f"Clarification of the relationship - Person {i}"] = extract_text(related_person, ".//relationshipClarification"),

    # Parse offering data
    offering_data = root.find(".//offeringData")
    data_offering_data = {
        "Industry": extract_text(offering_data, ".//industryGroup/industryGroupType"),
        "Revenue Range of the Entity": extract_text(offering_data, ".//issuerSize/revenueRange"),
        "Federal Exemption or Exclusions Claimed by the Entity": ", ".join([item.text for item in offering_data.findall(".//federalExemptionsExclusions/item") if item is not None]) if offering_data is not None else None,
        "Is the Entry an Amendment to Another Filing?": extract_text(offering_data, ".//typeOfFiling/newOrAmendment/isAmendment"),
        "Date of the First Sale Under this Filing": extract_text(offering_data, ".//typeOfFiling/dateOfFirstSale/value"),
        "Is the Offering Duration Intended to be More Than a Year?": extract_text(offering_data, ".//durationOfOffering/moreThanOneYear"),
        "Is the Entity Offering Equity": extract_text(offering_data, ".//typesOfSecuritiesOffered/isEquityType"),
        "Is the Entity Creating or Managing a Pooled Fund": extract_text(offering_data, ".//typesOfSecuritiesOffered/isPooledInvestmentFundType"),
        "Is the Offering Part of a Business Combination Transaction?": extract_text(offering_data, ".//isPooledInvestmentFundType/isBusinessCombinationTransaction"),
        "Is the Offering Part of a Business Combination Transaction? - Clarification": extract_text(offering_data, ".//businessCombinationTransaction/clarificationOfResponse"),
        "Minimum Investment Amount Accepted": extract_text(offering_data, ".//minimumInvestmentAccepted"),
        #"Entities Compensated for Selling the Securities": extract_text(offering_data, ".//salesCompensationList"),
        # The line above is commented out because we don't see much use in it
        "Total Amount Offered": extract_text(offering_data, ".//offeringSalesAmounts/totalOfferingAmount"),
        "Total Amount Sold So Far": extract_text(offering_data, ".//offeringSalesAmounts/totalAmountSold"),
        "Total Amount Remaining to be sold": extract_text(offering_data, ".//offeringSalesAmounts/totalRemaining"),
        "Clarifications Regarding the Offering and Sales amounts": extract_text(offering_data, ".//offeringSalesAmounts/clarificationOfResponse"),
        "Are Non-accredited Investors Participating in the Offering?": extract_text(offering_data, ".//investors/hasNonAccreditedInvestors"),
        "Number of Investors Who Already Invested Under Current Offering": extract_text(offering_data, ".//investors/totalNumberAlreadyInvested"),
        "Commision Amount Paid for Sales": extract_text(offering_data, ".//salesCommissionsFindersFees/salesCommissions/dollarAmount"),
        "Finder's Fees Amount Paid": extract_text(offering_data, ".//salesCommissionsFindersFees/findersFees/dollarAmount"),
        "Clarification Regarding the Finder's Fees Amount Paid": extract_text(offering_data, ".//salesCommissionsFindersFees/clarificationOfResponse"),
        "Amount of the Raised Capital Already Used for a Purpose": extract_text(offering_data, ".//useOfProceeds/grossProceedsUsed/dollarAmount"),
        "Clarification Regarding the Amount of the Raised Capital Already Used for a Purpose": extract_text(offering_data, ".//useOfProceeds/clarificationOfResponse"),
        "Name of the Entity Representative Signing the Filing": extract_text(offering_data, ".//signatureBlock/authorizedRepresentative"),
        "Name of the Issuer Entity": extract_text(offering_data, ".//signatureBlock/signature/issuerName"),
        "Name of the Person Signing": extract_text(offering_data, ".//signatureBlock/signature/signatureName"),
        "Name of the Person Signing - Affirmation": extract_text(offering_data, ".//signatureBlock/signature/nameOfSigner"),
        "Title of the Person Signing": extract_text(offering_data, ".//signatureBlock/signature/signatureTitle"),
        "Date the Form Was Signed": extract_text(offering_data, ".//signatureBlock/signature/signatureDate")
    }
    # Combine all data
    data = {**data_primary_issuer, **data_related_persons, **data_offering_data}
    return data

data_fin = []

for information in xml_data_list:
    print(information)
    root = ET.fromstring(information)
    data_fin.append(xml_parse(root))

df_main = pd.DataFrame(data_fin)
#pd.set_option('display.max_columns', 500)
# A very inefficient way to reorganize the columns
#df_main.reindex((["Central Index Key", "Name of the Entity", "Address", "Address - Specification", "City", "State or Country - Physical Location", "State or Country - Full - Physical Location", "Zip Code / Postal Code", "Issuer Phone Number", "State or Country - Legal Jurisdiction", "Previous Names Used by the Issuer", "Previous Names from EDGAR", "Type of the Entity", "Type of the Entity - Specification", "Industry", "Revenue Range of the Entity", "Federal Exemption or Exclusions Claimed by the Entity", "Is the Entry an Amendment to Another Filing?", "Date of the First Sale Under this Filing", "Is the Offering Duration Intended to be More Than a Year?", "Is the Entity Offering Equity", "Is the Entity Creating or Managing a Pooled Fund", "Is the Offering Part of a Business Combination Transaction?", "Is the Offering Part of a Business Combination Transaction? - Clarification", "Minimum Investment Amount Accepted", "Entities Compensated for Selling the Securities", "Total Amount Offered", "Total Amount Sold So Far", "Total Amount Remaining to be sold", "Clarifications Regarding the Offering and Sales amounts", "Are Non-accredited Investors Participating in the Offering?", "Number of Investors Who Already Invested Under Current Offering", "Commision Amount Paid for Sales", "Finder's Fees Amount Paid", "Clarification Regarding the Finder's Fees Amount Paid", "Amount of the Raised Capital Already Used for a Purpose", "Clarification Regarding the Amount of the Raised Capital Already Used for a Purpose", "Name of the Entity Representative Signing the Filing", "Name of the Issuer Entity", "Name of the Person Signing", "Name of the Person Signing - Affirmation", "Title of the Person Signing", "Date the Form Was Signed", "First Name - Person 1", "Last Name - Person 1", "Associated Address - Person 1", "Associated Address - Specification - Person 1", "Associated City - Person 1", "Associated State or Country - Person 1", "Associated State or Country - Full - Person 1", "Associated Zip Code - Person 1", "Relationship 1 with the Entity - Person 1", "Relationship 2 with the Entity - Person 1", "Relationship 3 with the Entity - Person 1", "Clarification of the relationship - Person 1", "First Name - Person 2", "Last Name - Person 2", "Associated Address - Person 2", "Associated Address - Specification - Person 2", "Associated City - Person 2", "Associated State or Country - Person 2", "Associated State or Country - Full - Person 2", "Associated Zip Code - Person 2", "Relationship 1 with the Entity - Person 2", "Relationship 2 with the Entity - Person 2", "Clarification of the relationship - Person 2", "First Name - Person 3", "Last Name - Person 3", "Associated Address - Person 3", "Associated Address - Specification - Person 3", "Associated City - Person 3", "Associated State or Country - Person 3", "Associated State or Country - Full - Person 3", "Associated Zip Code - Person 3", "Relationship 1 with the Entity - Person 3", "Relationship 2 with the Entity - Person 3", "Clarification of the relationship - Person 3", "First Name - Person 4", "Last Name - Person 4", "Associated Address - Person 4", "Associated Address - Specification - Person 4", "Associated City - Person 4", "Associated State or Country - Person 4", "Associated State or Country - Full - Person 4", "Associated Zip Code - Person 4", "Relationship 1 with the Entity - Person 4", "Clarification of the relationship - Person 4", "First Name - Person 5", "Last Name - Person 5", "Associated Address - Person 5", "Associated Address - Specification - Person 5", "Associated City - Person 5", "Associated State or Country - Person 5", "Associated State or Country - Full - Person 5", "Associated Zip Code - Person 5", "Relationship 1 with the Entity - Person 5", "Clarification of the relationship - Person 5"]), axis=1)

# Cleaning the data
df_main = df_main.fillna("N/A")
for column in df_main.columns:
    df_main[column] = df_main[column].apply(lambda x: x[0] if isinstance(x, tuple) else x)

df_main = df_main.replace("None", "N/A")

redundant_strings = ["/s/", "/bem/", "c/o", "n/a"]
for x in redundant_strings:
    df_main = df_main.replace(x, "", regex=True)

df_main = df_main.replace("", "N/A")
df_main = df_main.fillna("N/A")
df_main.to_csv("cleaned_data.csv", index=False)






Access denied. Ensure you have a proper User-Agent header.


HTTPError: 403 Client Error: Forbidden for url: https://www.sec.gov/Archives/edgar/daily-index/2012/QTR3/company.2012111.idx

: 