In [None]:
import zipfile;
import os;
import pandas as pd;
import plotly.express as px;
from jupyter_dash import JupyterDash;
import dash;
from dash import dcc, html, dash_table;
from dash.dependencies import Input, Output, State;
import dash_bootstrap_components as dbc;

from IPython.core.interactiveshell import InteractiveShell;

# Disable auto-display of last returned values
InteractiveShell.ast_node_interactivity = "none";
import sys

sys.displayhook = lambda x: None if x is not None else None

zip_file_path = "Data.zip";
extract_path = "extracted_data";
os.makedirs(extract_path, exist_ok=True);

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path);

years = list(range(2014, 2025));
quarters = ["QTR1", "QTR2", "QTR3", "QTR4"];

all_data = [];

for year in years:
    for quarter in quarters:
        file_path = os.path.join(extract_path, f"{year}_{quarter}.csv");
        if os.path.exists(file_path):
            df = pd.read_csv(file_path);
            df["Year"] = year;
            df["Quarter"] = quarter;
            all_data.append(df);

if not all_data:
    raise ValueError("No data was found. Ensure the ZIP file contains valid CSV files.");
df_filings = pd.concat(all_data, ignore_index=True);

df_full_data = df_filings.copy();

required_columns = ["Central Index Key", "Year", "Quarter", "State or Country - Full - Physical Location", "Total Amount Offered"];
df_filings = df_filings[required_columns].dropna();

df_filings = df_filings.rename(columns={"State or Country - Full - Physical Location": "State"});

df_filings["Total Amount Offered"] = pd.to_numeric(df_filings["Total Amount Offered"], errors="coerce");
df_filings["Total Amount Offered"] = df_filings["Total Amount Offered"].fillna(0);

app = JupyterDash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP]);

app.layout = dbc.Container([
    html.H1("SEC D-Type Filings Filter", className="text-center mt-4"),

    dbc.Row([
        dbc.Col([
            dcc.Dropdown(
                id="year-dropdown",
                options=[{"label": str(y), "value": y} for y in sorted(df_filings["Year"].unique())],
                placeholder="Select Year",
                multi=True
            ),
        ], width=3),

        dbc.Col([
            dcc.Dropdown(
                id="quarter-dropdown",
                options=[{"label": q, "value": q} for q in sorted(df_filings["Quarter"].unique())],
                placeholder="Select Quarter",
                multi=True
            ),
        ], width=3),

        dbc.Col([
            dcc.Dropdown(
                id="state-dropdown",
                options=[{"label": s, "value": s} for s in sorted(df_filings["State"].dropna().unique())],
                placeholder="Select State",
                multi=True
            ),
        ], width=3),
    ], className="mb-3"),

    dbc.Row([
        dbc.Col([
            html.Label("Total Amount Offered (USD)", className="font-weight-bold"), 
            dcc.RangeSlider(
                id="amount-slider",
                min=df_filings["Total Amount Offered"].min(),
                max=df_filings["Total Amount Offered"].max(),
                step=10000,
                marks={int(x): f"${x:,}" for x in range(0, int(df_filings["Total Amount Offered"].max()), 50000000)},
                value=[df_filings["Total Amount Offered"].min(), df_filings["Total Amount Offered"].max()],
                tooltip={"placement": "bottom", "always_visible": True}  
            ),
            html.Div(id="amount-slider-output", className="text-center mt-2 font-italic"), 
        ], width=9),
    ], className="mb-3"),

    html.Div(id="data-count", className="text-center mt-3 mb-3 font-weight-bold"),

    dash_table.DataTable(
        id="table",
        columns=[{"name": col, "id": col} for col in df_filings.columns],
        page_size=0,
        style_table={"overflowX": "auto"}
    ),

    html.Button("Export to CSV", id="export-btn", n_clicks=0, className="btn btn-primary mt-3"),
    dcc.Download(id="download-dataframe-csv")
]);

@app.callback(
    Output("table", "data"),
    Output("data-count", "children"),
    Input("year-dropdown", "value"),
    Input("quarter-dropdown", "value"),
    Input("state-dropdown", "value"),
    Input("amount-slider", "value"),
)
def update_table(selected_years, selected_quarters, selected_states, amount_range):
    filtered_df = df_filings;

    if selected_years:
        filtered_df = filtered_df[filtered_df["Year"].isin(selected_years)];
    if selected_quarters:
        filtered_df = filtered_df[filtered_df["Quarter"].isin(selected_quarters)];
    if selected_states:
        filtered_df = filtered_df[filtered_df["State"].isin(selected_states)];
    if amount_range:
        filtered_df = filtered_df[
            (filtered_df["Total Amount Offered"] >= amount_range[0]) &
            (filtered_df["Total Amount Offered"] <= amount_range[1])
        ];

    count_text = f"Total Filings Found: {len(filtered_df)}";
    return filtered_df.to_dict("records"), count_text;

@app.callback(
    Output("download-dataframe-csv", "data"),
    Input("export-btn", "n_clicks"), 
    State("year-dropdown", "value"),
    State("quarter-dropdown", "value"),
    State("state-dropdown", "value"),
    State("amount-slider", "value"),
    prevent_initial_call=True
)
def export_csv(n_clicks, selected_years, selected_quarters, selected_states, amount_range):
    filtered_df = df_filings;

    if selected_years:
        filtered_df = filtered_df[filtered_df["Year"].isin(selected_years)];
    if selected_quarters:
        filtered_df = filtered_df[filtered_df["Quarter"].isin(selected_quarters)];
    if selected_states:
        filtered_df = filtered_df[filtered_df["State"].isin(selected_states)];
    if amount_range:
        filtered_df = filtered_df[
            (filtered_df["Total Amount Offered"] >= amount_range[0]) &
            (filtered_df["Total Amount Offered"] <= amount_range[1])
        ];

    matching_ids = filtered_df["Central Index Key"].unique(); 
    full_filtered_df = df_full_data[df_full_data["Central Index Key"].isin(matching_ids)];  

    if full_filtered_df.empty:
        return dash.no_update; 

    return dcc.send_data_frame(full_filtered_df.to_csv, "filtered_sec_filings.csv");

app.run_server(mode="inline");


In [2]:
import zipfile
import pandas as pd
import plotly.express as px
import os
import sys

sys.displayhook = lambda x: None if x is not None else None

years = list(range(2014, 2025))  # From 2014 to 2024
quarters = ["QTR1", "QTR2", "QTR3", "QTR4"]

all_states = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", 
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
]
state_full_names = [
    "ALABAMA", "ALASKA", "ARIZONA", "ARKANSAS", "CALIFORNIA", "COLORADO", "CONNECTICUT", 
    "DELAWARE", "FLORIDA", "GEORGIA", "HAWAII", "IDAHO", "ILLINOIS", "INDIANA", "IOWA", 
    "KANSAS", "KENTUCKY", "LOUISIANA", "MAINE", "MARYLAND", "MASSACHUSETTS", "MICHIGAN", 
    "MINNESOTA", "MISSISSIPPI", "MISSOURI", "MONTANA", "NEBRASKA", "NEVADA", "NEW HAMPSHIRE", 
    "NEW JERSEY", "NEW MEXICO", "NEW YORK", "NORTH CAROLINA", "NORTH DAKOTA", "OHIO", 
    "OKLAHOMA", "OREGON", "PENNSYLVANIA", "RHODE ISLAND", "SOUTH CAROLINA", "SOUTH DAKOTA", 
    "TENNESSEE", "TEXAS", "UTAH", "VERMONT", "VIRGINIA", "WASHINGTON", "WEST VIRGINIA", 
    "WISCONSIN", "WYOMING"
]

state_name_to_abbreviation = dict(zip(state_full_names, all_states))

zip_file_path = "Data.zip"
extract_path = "extracted_data"

os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

all_data = []

for year in years:
    for quarter in quarters:
        csv_file_name = f"{year}_{quarter}.csv"
        file_path = os.path.join(extract_path, csv_file_name)
        
        if os.path.exists(file_path):
            
            df = pd.read_csv(file_path);
            _ = None
            None
            jurisdiction_counts = (
                df["State or Country - Legal Jurisdiction"]
                .map(state_name_to_abbreviation)
                .value_counts()
                .reset_index()
            )
            jurisdiction_counts.columns = ["State", "JurisdictionCount"]

            state_counts_simple = (
                df["State or Country - Physical Location"]
                .value_counts()
                .reset_index()
            )
            state_counts_simple.columns = ["State", "TotalCount"]
            state_counts_simple["TotalCount"] = state_counts_simple["TotalCount"].fillna(0).astype(int)

            merged_counts = pd.merge(
                jurisdiction_counts, state_counts_simple, on="State", how="outer"
            ).fillna(0)

            merged_counts["Percentage"] = (
                (merged_counts["JurisdictionCount"] / merged_counts["TotalCount"]) * 100
            ).fillna(0)

            merged_counts["Year"] = year
            merged_counts["Quarter"] = quarter
            
            all_data.append(merged_counts)
final_df = pd.concat(all_data, ignore_index=True)

final_df = final_df.sort_values(by=["State", "Year"])

final_df["PercentageGrowth"] = (
    (final_df["Percentage"] - final_df.groupby("State")["Percentage"].shift(4)) /
    final_df.groupby("State")["Percentage"].shift(4)
) * 100

final_df["PercentageGrowth"] = final_df["PercentageGrowth"].fillna(0)
selected_state = input("Enter the state abbreviation (e.g., CA, TX, NY): ").upper()

state_data = final_df[final_df["State"] == selected_state]

if state_data.empty:
    print(f"No data found for state: {selected_state}")
else:
    
    fig_growth = px.line(
        state_data,
        x="Year",
        y="PercentageGrowth",
        title=f"Yearly Jurisdiction Percentage Growth in {selected_state} (2014-2024)",
        labels={"PercentageGrowth": "Percentage Growth (%)"},
    )


fig_growth.show()


  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = pd.read_csv(file_path);
  df = p

No data found for state: 


NameError: name 'fig_growth' is not defined

In [2]:
import zipfile;
import pandas as pd;
import plotly.express as px;
import os;

years = list(range(2014, 2025));  # From 2014 to 2024
quarters = ["QTR1", "QTR2", "QTR3", "QTR4"];

all_states = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", 
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
];

state_full_names = [
    "ALABAMA", "ALASKA", "ARIZONA", "ARKANSAS", "CALIFORNIA", "COLORADO", "CONNECTICUT", 
    "DELAWARE", "FLORIDA", "GEORGIA", "HAWAII", "IDAHO", "ILLINOIS", "INDIANA", "IOWA", 
    "KANSAS", "KENTUCKY", "LOUISIANA", "MAINE", "MARYLAND", "MASSACHUSETTS", "MICHIGAN", 
    "MINNESOTA", "MISSISSIPPI", "MISSOURI", "MONTANA", "NEBRASKA", "NEVADA", "NEW HAMPSHIRE", 
    "NEW JERSEY", "NEW MEXICO", "NEW YORK", "NORTH CAROLINA", "NORTH DAKOTA", "OHIO", 
    "OKLAHOMA", "OREGON", "PENNSYLVANIA", "RHODE ISLAND", "SOUTH CAROLINA", "SOUTH DAKOTA", 
    "TENNESSEE", "TEXAS", "UTAH", "VERMONT", "VIRGINIA", "WASHINGTON", "WEST VIRGINIA", 
    "WISCONSIN", "WYOMING"
];

state_name_to_abbreviation = dict(zip(state_full_names, all_states));

zip_file_path = "Data.zip";
extract_path = "extracted_data";

os.makedirs(extract_path, exist_ok=True);

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path);

all_data = [];

for year in years:
    for quarter in quarters:
        csv_file_name = f"{year}_{quarter}.csv";
        file_path = os.path.join(extract_path, csv_file_name);
        
        if os.path.exists(file_path):
            df = pd.read_csv(file_path);
            _ = None;
            jurisdiction_counts = (
                df["State or Country - Legal Jurisdiction"]
                .map(state_name_to_abbreviation)
                .value_counts()
                .reset_index()
            );
            jurisdiction_counts.columns = ["State", "JurisdictionCount"];

            state_counts_simple = (
                df["State or Country - Physical Location"]
                .value_counts()
                .reset_index()
            );
            state_counts_simple.columns = ["State", "TotalCount"];
            state_counts_simple["TotalCount"] = state_counts_simple["TotalCount"].fillna(0).astype(int);

            merged_counts = pd.merge(
                jurisdiction_counts, state_counts_simple, on="State", how="outer"
            ).fillna(0);

            merged_counts["Percentage"] = (
                (merged_counts["JurisdictionCount"] / merged_counts["TotalCount"]) * 100
            ).fillna(0);

            merged_counts["Year"] = year;
            merged_counts["Quarter"] = quarter;
            
            all_data.append(merged_counts);

final_df = pd.concat(all_data, ignore_index=True);

final_df = final_df.sort_values(by=["State", "Year"]);

final_df["PercentageGrowth"] = (
    (final_df["Percentage"] - final_df.groupby("State")["Percentage"].shift(4)) /
    final_df.groupby("State")["Percentage"].shift(4)
) * 100;

final_df["PercentageGrowth"] = final_df["PercentageGrowth"].fillna(0);

selected_state = input("Enter the state abbreviation (e.g., CA, TX, NY): ").upper();

state_data = final_df[final_df["State"] == selected_state];

if state_data.empty:
    print(f"No data found for state: {selected_state}");
else:
    fig_growth = px.line(
        state_data,
        x="Year",
        y="PercentageGrowth",
        title=f"Yearly Jurisdiction Percentage Growth in {selected_state} (2014-2024)",
        labels={"PercentageGrowth": "Percentage Growth (%)"},
    );

fig_growth.show();


In [None]:
import pandas as pd 
import zipfile
import  plotly.express as px
import plotly.graph_objects as go
zip_file_path = "Data.zip"
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall("extracted_data") 


csv_file_path = "extracted_data/2022_QTR3.csv"
df = pd.read_csv(csv_file_path)
total_income_by_state = (
    df.groupby(df["State or Country - Physical Location"])["Total Amount Sold So Far"]
    .sum()
    .reset_index()
)

total_income_by_state.columns = ["State", "TotalIncome" ]

fig_income = px.choropleth(
    total_income_by_state,
    locations="State",
    locationmode="USA-states",
    color="TotalIncome",
    scope="usa",
    color_continuous_scale="Viridis",
    title="Total Income Raised by State",
    labels={"TotalIncome": "Total Income ($)"}
)

fig_income.show()

In [None]:
industries = ["Agriculture", "Banking & Financial Services", "Commercial Banking", "Insurance", "Investing", "Investment Banking", "Pooled Investment Fund", "Hedge Fund", "Private Equity Fund", "Venture Capital Fund", "Other Investment Fund", "Other Banking & Financial Services", "Business Services", "Energy", "Coal Mining", "Electric Utilities", "Energy Conservation", "Environmental Services", "Oil & Gas", "Other Energy", "Health Care", "Biotechnology", "Health Insurance", "Hospitals & Physicians", "Pharmaceuticals", "Other Health Care", "Manufacturing", "Real Estate", "Commercial", "Construction", "REITS & Finance", "Residential", "Other Real Estate", "Retailing", "Restaurants", "Technology", "Computers", "Telecommunications", "Other Technology", "Travel", "Airlines & Airports", "Lodging & Conventions", "Tourism & Travel Services", "Other Travel", "Other"]
df["Total Amount Remaining to be sold"] = df["Total Amount Remaining to be sold"].astype(str)
df["Total Amount Offered"] = df["Total Amount Offered"].replace("Indefinite", float('inf')).astype(float)

def versatile_filter(df):
    """
    Filters a DataFrame based on user-specified criteria.
    Parameters:
    df (pd.DataFrame): The DataFrame to be filtered.
    Returns:
    pd.DataFrame: A sample of the filtered DataFrame based on the applied filters.
    The function allows the user to apply multiple filters to the DataFrame, including:
    - State or Country - Physical Location
    - Fully subscribed offerings
    - Funding amount range
    - Equity offerings
    - Industry
    The user can choose to apply or delete each filter and can continue to update filters until satisfied.
    The function displays a sample of the filtered data after each filter is applied and returns a sample of the final filtered data.
    """
    global filtered_df
    mask_state = pd.Series(True, index=df.index)
    mask_subed = pd.Series(True, index=df.index)
    mask_amount = pd.Series(True, index=df.index)
    mask_equity = pd.Series(True, index=df.index)
    mask_industry = pd.Series(True, index=df.index)
    
    applied_filters = []

    while True:
        user_filter = input("Would you like to filter the data? (yes/no): ").strip().lower()
        if user_filter in ["yes", "no"]:
            break
        else:
            print("Invalid input. Please enter 'yes' or 'no'.")
    if user_filter != "yes":
        print("Displaying sample of the full data:")
        filtered_df = df
        return filtered_df.sample(n=min(10, len(df)))[["Central Index Key", "Name of the Entity", "Address", "Address - Specification", "City", "State or Country - Physical Location", "State or Country - Full - Physical Location", "Zip Code / Postal Code", "Issuer Phone Number", "State or Country - Legal Jurisdiction", "Minimum Investment Amount Accepted", "Total Amount Offered", "Total Amount Sold So Far", "Total Amount Remaining to be sold", "Clarifications Regarding the Offering and Sales amounts", "Is the Entity Offering Equity", "Industry"]]

    available_filters = ["state", "fully_subscribed", "amount", "equity", "industry"]

    while True:
        print("\nAvailable filters:")
        print("  state             : Filter by state abbreviation")
        print("  fully_subscribed  : Filter by fully subscribed offerings")
        print("  amount            : Filter by offered funding amount range")
        print("  equity            : Filter by equity offerings")
        print("  industry          : Filter by industry")
        chosen_filter = input("Which filter would you like to apply/update? (state/fully_subscribed/amount/equity/industry): ").strip().lower()
        
        if chosen_filter not in available_filters:
            print("Invalid filter type. Please choose one of the available filters.")
            continue

        if chosen_filter == "state":
            action = input("Would you like to apply or delete the state filter? (apply/delete): ").strip().lower()
            while action not in ["apply", "delete"]:
                action = input("Invalid input. Please enter 'apply' or 'delete': ").strip().lower()
            if action == "delete":
                mask_state = pd.Series(True, index=df.index)
                applied_filters = [f for f in applied_filters if not f.startswith("State:")]
                print("State filter deleted.")
            else:
                state_abbreviations = input("Enter one or more state abbreviations separated by commas (e.g., CA,TX,NY): ").upper().split(',')
                state_abbreviations = [abbr.strip() for abbr in state_abbreviations if abbr.strip() in all_states]
                while not state_abbreviations:
                    state_abbreviations = input("Invalid state abbreviations. Please try again (e.g., CA,TX,NY): ").upper().split(',')
                    state_abbreviations = [abbr.strip() for abbr in state_abbreviations if abbr.strip() in all_states]
                mask_state = df["State or Country - Physical Location"].isin(state_abbreviations)
                applied_filters = [f for f in applied_filters if not f.startswith("State:")]
                applied_filters.append(f"State: {', '.join(state_abbreviations)}")
                print(f"State filter set to: {', '.join(state_abbreviations)}")
        
        elif chosen_filter == "fully_subscribed":
            action = input("Would you like to apply or delete the fully subscribed filter? (apply/delete): ").strip().lower()
            while action not in ["apply", "delete"]:
                action = input("Invalid input. Please enter 'apply' or 'delete': ").strip().lower()
            if action == "delete":
                mask_subed = pd.Series(True, index=df.index)
                applied_filters = [f for f in applied_filters if not f.startswith("Fully Subscribed:")]
                print("Fully subscribed filter deleted.")
            else:
                subed = input("Are you interested only in fully subscribed offerings? (yes/no): ").strip().lower()
                while subed not in ["yes", "no"]:
                    subed = input("Invalid input. Please enter 'yes' or 'no': ").strip().lower()
                if subed == "yes":
                    mask_subed = (df["Total Amount Remaining to be sold"] == "0")
                    applied_filters = [f for f in applied_filters if not f.startswith("Fully Subscribed:")]
                    applied_filters.append("Fully Subscribed: Yes")
                    print("Filtering for fully subscribed offerings only.")
                else:
                    mask_subed = (df["Total Amount Remaining to be sold"] != "0")
                    applied_filters = [f for f in applied_filters if not f.startswith("Fully Subscribed:")]
                    applied_filters.append("Fully Subscribed: No")
                    print("Filtering for offerings that are not fully subscribed.")
    
        elif chosen_filter == "amount":
            action = input("Would you like to apply or delete the amount filter? (apply/delete): ").strip().lower()
            while action not in ["apply", "delete"]:
                action = input("Invalid input. Please enter 'apply' or 'delete': ").strip().lower()
            if action == "delete":
                mask_amount = pd.Series(True, index=df.index)
                applied_filters = [f for f in applied_filters if not f.startswith("Amount:")]
                print("Amount filter deleted.")
            else:
                while True:
                    try:
                        min_val = int(input("Enter the minimum funding amount in USD: "))
                        break
                    except ValueError:
                        print("Invalid input. Please enter an integer value for the minimum funding amount.")
                while True:
                    max_input = input("Enter the maximum funding amount in USD (or type 'infinite' for no upper limit): ").strip().lower()
                    if max_input == "infinite":
                        max_val = float('inf')
                        break
                    try:
                        max_val = int(max_input)
                        break
                    except ValueError:
                        print("Invalid input. Please enter an integer value for the maximum funding amount or 'infinite'.")
                if min_val > max_val:
                    print("The minimum funding amount cannot be greater than the maximum. Amount filter not applied.")
                else:
                    mask_amount = (df["Total Amount Offered"] >= min_val) & (df["Total Amount Offered"] <= max_val)
                    applied_filters = [f for f in applied_filters if not f.startswith("Amount:")]
                    applied_filters.append(f"Amount: {min_val} to {max_val}")
                    print(f"Filtering for funding amounts between {min_val} and {max_val} USD.")

        elif chosen_filter == "equity":
            action = input("Would you like to apply or delete the equity filter? (apply/delete): ").strip().lower()
            while action not in ["apply", "delete"]:
                action = input("Invalid input. Please enter 'apply' or 'delete': ").strip().lower()
            if action == "delete":
                mask_equity = pd.Series(True, index=df.index)
                applied_filters = [f for f in applied_filters if not f.startswith("Equity:")]
                print("Equity filter deleted.")
            else:
                equity = input("Are you interested only in offerings of equity or offerings excluding equity? (only_equity/excluding_equity): ").strip().lower()
                while equity not in ["only_equity", "excluding_equity"]:
                    equity = input("Invalid input. Please enter 'only_equity' or 'excluding_equity': ").strip().lower()
                if equity == "only_equity":
                    mask_equity = df["Is the Entity Offering Equity"] == True
                    applied_filters = [f for f in applied_filters if not f.startswith("Equity:")]
                    applied_filters.append("Equity: Only Equity")
                    print("Filtering for offerings of equity only.")
                else:
                    mask_equity = df["Is the Entity Offering Equity"] != True
                    applied_filters = [f for f in applied_filters if not f.startswith("Equity:")]
                    applied_filters.append("Equity: Excluding Equity")
                    print("Filtering for offerings excluding equity.")
        
        elif chosen_filter == "industry":
            action = input("Would you like to apply or delete the industry filter? (apply/delete): ").strip().lower()
            while action not in ["apply", "delete"]:
                action = input("Invalid input. Please enter 'apply' or 'delete': ").strip().lower()
            if action == "delete":
                mask_industry = pd.Series(True, index=df.index)
                applied_filters = [f for f in applied_filters if not f.startswith("Industry:")]
                print("Industry filter deleted.")
            else:
                while True:
                    industry_input = input("Enter one or more industries separated by commas (or type 'list' to see all industries) - be case sensitive: ").strip()
                    if industry_input.lower() == "list":
                        print("Available industries:")
                        for industry in industries:
                            print(f"  {industry}")
                        continue
                    industry_list = [ind.strip() for ind in industry_input.split(',') if ind.strip() in industries]
                    if not industry_list:
                        print("Invalid industries. Please try again.")
                    else:
                        break
                mask_industry = df["Industry"].isin(industry_list)
                applied_filters = [f for f in applied_filters if not f.startswith("Industry:")]
                applied_filters.append(f"Industry: {', '.join(industry_list)}")
                print(f"Industry filter set to: {', '.join(industry_list)}")

        overall_mask = mask_state & mask_subed & mask_amount & mask_equity & mask_industry

        filtered_df = df[overall_mask]
        if filtered_df.empty:
            print("\nNo data matches the current filter criteria.")
        else:
            print("\nHere is a sample of the filtered data:")
            print("Applied Filters- " + "; ".join(applied_filters))
            display(filtered_df.sample(n=min(10, len(filtered_df)))[["Central Index Key", "Name of the Entity", "Address", "Address - Specification", "City", "State or Country - Physical Location", "State or Country - Full - Physical Location", "Zip Code / Postal Code", "Issuer Phone Number", "State or Country - Legal Jurisdiction", "Minimum Investment Amount Accepted", "Total Amount Offered", "Total Amount Sold So Far", "Total Amount Remaining to be sold", "Clarifications Regarding the Offering and Sales amounts", "Is the Entity Offering Equity", "Industry"]])
        
        continue_filter = input("Would you like to apply/update another filter? (yes/no): ").strip().lower()
        while continue_filter not in ["yes", "no"]:
                continue_filter = input("Invalid input. Please enter 'yes' or 'no': ").strip().lower()
        if continue_filter != "yes":
            break

    print("\nDisplaying sample of the final filtered data:")
    return filtered_df.sample(n=min(10, len(filtered_df)))

versatile_filter(df)


In [None]:
filtered_df.columns = [col.lower() for col in filtered_df.columns]
def modify_columns(df):
    """
    Modify the columns of a DataFrame based on user input.
    This function allows the user to interactively add or remove columns from a DataFrame.
    It starts with a predefined set of default columns and provides options to modify this set.
    Parameters:
    df (pandas.DataFrame): The input DataFrame whose columns are to be modified.
    Returns:
    pandas.DataFrame: A DataFrame containing the selected columns.
    The function performs the following steps:
    1. Displays the DataFrame with the default columns.
    2. Asks the user if they want to add or remove columns.
    3. Based on user input, allows adding or removing columns from the default set.
    4. Displays the updated DataFrame after each modification.
    5. Returns the DataFrame with the final set of selected columns.
    """
    default_columns = [
        "central index key", "name of the entity", "address", "address - specification", "city",
        "state or country - physical location", "state or country - full - physical location", "zip code / postal code",
        "issuer phone number", "state or country - legal jurisdiction", "minimum investment amount accepted",
        "total amount offered", "total amount sold so far", "total amount remaining to be sold",
        "clarifications regarding the offering and sales amounts", "is the entity offering equity", "industry"
    ]
    all_columns = [
        "central index key", "name of the entity", "address", "address - specification", "city",
        "state or country - physical location", "state or country - full - physical location", "zip code / postal code",
        "issuer phone number", "state or country - legal jurisdiction", "previous names used by the issuer",
        "previous names from edgar", "type of the entity", "type of the entity - specification", "industry",
        "revenue range of the entity", "federal exemption or exclusions claimed by the entity",
        "is the entry an amendment to another filing?", "date of the first sale under this filing",
        "is the offering duration intended to be more than a year?", "is the entity offering equity",
        "is the entity creating or managing a pooled fund", "is the offering part of a business combination transaction?",
        "is the offering part of a business combination transaction? - clarification",
        "minimum investment amount accepted", "total amount offered", "total amount sold so far",
        "total amount remaining to be sold", "clarifications regarding the offering and sales amounts",
        "are non-accredited investors participating in the offering?",
        "number of investors who already invested under current offering", "commision amount paid for sales",
        "finder's fees amount paid", "clarification regarding the finder's fees amount paid",
        "amount of the raised capital already used for a purpose",
        "clarification regarding the amount of the raised capital already used for a purpose",
        "name of the entity representative signing the filing", "name of the issuer entity",
        "name of the person signing", "name of the person signing - affirmation", "title of the person signing",
        "date the form was signed"
    ]
    default_columns = [col for col in default_columns if col in df.columns]

    print("Current dataframe with default columns:")
    display(df[default_columns].head())

    while True:
        user_input = input("Would you like to add/remove some of the columns? (yes/no): ").strip().lower()
        if user_input in ["yes", "no"]:
            break
        else:
            print("Invalid input. Please enter 'yes' or 'no'.")

    if user_input == "yes":
        while True:
            action = input("Would you like to add or remove columns? (add/remove): ").strip().lower()
            if action in ["add", "remove"]:
                break
            else:
                print("Invalid input. Please enter 'add' or 'remove'.")
        
        while True:
            if action == "add":
                while True:
                    columns_to_add = input("What columns should be added? Type 'help' to display options, 'all' to add all columns, or 'back' to go back: ").strip().lower()
                    if columns_to_add == "help":
                        print("Available columns to add:")
                        for col in all_columns:
                            if col not in default_columns:
                                print(f"  {col}")
                        continue
                    elif columns_to_add == "all":
                        default_columns = [col for col in all_columns if col in df.columns]
                        print("All columns added.")
                        print("Updated dataframe preview:")
                        display(df[default_columns].head())
                        break
                    elif columns_to_add == "back":
                        break
                    columns_to_add = [col.strip() for col in columns_to_add.split(',') if col.strip() in all_columns and col.strip() not in default_columns]
                    if not columns_to_add:
                        print("Invalid columns. Please try again.")
                    else:
                        default_columns.extend(columns_to_add)
                        print(f"Columns added: {', '.join(columns_to_add)}")
                        print("Updated dataframe preview:")
                        display(df[default_columns].head())
                        break
            
            elif action == "remove":
                while True:
                    columns_to_remove = input("What columns should be removed? Type 'help' to display options or 'back' to go back: ").strip().lower()
                    if columns_to_remove == "help":
                        print("Available columns to remove:")
                        for col in default_columns:
                            print(f"  {col}")
                        continue
                    elif columns_to_remove == "back":
                        break
                    columns_to_remove = [col.strip() for col in columns_to_remove.split(',') if col.strip() in default_columns]
                    if not columns_to_remove:
                        print("Invalid columns. Please try again.")
                    else:
                        default_columns = [col for col in default_columns if col not in columns_to_remove]
                        print(f"Columns removed: {', '.join(columns_to_remove)}")
                        print("Updated dataframe preview:")
                        display(df[default_columns].head())
                        break

            while True:
                more_changes = input("Would you like to add/remove any other columns? (yes/no): ").strip().lower()
                if more_changes in ["yes", "no"]:
                    break
                else:
                    print("Invalid input. Please enter 'yes' or 'no'.")
            if more_changes == "no":
                break
            else:
                action = input("Would you like to add or remove columns? (add/remove): ").strip().lower()

    columns_df = df[default_columns]
    return columns_df

new_df = modify_columns(filtered_df)

In [None]:
# Parsing the XML file to extract the relevant information
from sec_edgar_downloader import Downloader
#needs to comply with sec privacy :(( )
dl = Downloader("test", "test@test.com")
import pandas as pd 
import time
import requests

#sec offers an api to get the daily or quartely indexes, metadata for all fillings
#there is no endpoint to specifically search by form type
#the actual data can only be downloaded with an accesion number and cik
#using cik fetched from daily index, fetch all fillings by company and select those with type D 
#download the actual file using url from the data line(the current data url returns the entire txt, we only need the actual xml file for the filing ) 


more_data = input("Are you interested in more data? (Y/N): ").strip().upper()

if more_data != 'Y':
    print("Exiting script.")
    import sys
    sys.exit()

year = int(input("Enter the year in which you are interested (between 2000 and 2013): "))

if year < 2000 or year > 2013:
    print("Invalid year. Exiting script.")
    import sys
    sys.exit()
quarter = int(input("Enter the quarter in which you are interested (e.g. 4). "))
#month = int(input("Enter the month of interest in numerical format (e.g. 11)."))
#day = int(input("Enter the day of interest in numerical format (e.g. 27)."))
#date = f"{year}{month}{day}"
#base_url = 'https://www.sec.gov/Archives/edgar/daily-index'
#index_url = f'{base_url}/{year}/QTR{quarter}/company.{date}.idx'

base_full_index_url ='https://www.sec.gov/Archives/edgar/full-index'
full_index_url =  f'{base_full_index_url}/{year}/QTR{quarter}/company.idx'

headers = {
    'User-Agent': 'Test (test@test.com)',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'www.sec.gov',
    'Connection': 'keep-alive',
}
#response = requests.get(index_url, headers=headers)
response = requests.get(full_index_url, headers=headers)

if response.status_code == 403:
    print("Access denied. Ensure you have a proper User-Agent header.")
    exit()

response.raise_for_status() 


content = response.text

lines = content.splitlines()


form_d_filings = []



header = ["Company Name", "Form Type", "CIK", "Date Filed", "File Name"]
#data lines skip 3 for daily index, 7 for full index 
#daily index returns idx file with all the fillings for the day, fixed width format, parse it into pandas 
records = [] 
#daily index formatting 
'''
data_lines= lines[3:]
for line in data_lines:
    if line.strip():  
        company_name = line[:60].strip()
        form_type = line[60:71].strip()
        cik = line[71:82].strip()
        date_filed = line[82:92].strip()
        file_name = line[92:].strip()
        records.append([company_name, form_type, cik, date_filed, file_name])

'''
#quarter index formatting 

data_lines=lines[8:]
for line in data_lines:
    if line.strip():  
        company_name = line[0:59].strip()
        form_type = line[59:72].strip()
        cik = line[72:87].strip()
        date_filed = line[87:102].strip()
        file_name = line[102:].strip()
        
        records.append([company_name, form_type, cik, date_filed, file_name])
        
df = pd.DataFrame(records, columns=header)


form_d_df = df[df["Form Type"] == "D"] 
row_dict = form_d_df.iloc[1].to_dict()  
for key, value in row_dict.items():

    form_d_df['Acession_number'] = form_d_df['File Name'].str.split('/').str[-1].str.replace('-', '').str.replace('.txt', '', regex=False)


base_url = "https://www.sec.gov/Archives/edgar/data/"
headers = {
    'User-Agent': 'Test (test@test.com)',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'www.sec.gov',
    'Connection': 'keep-alive',
}

xml_data_list = []
for index, row in form_d_df.iterrows():
    cik = row['CIK']
    accession_number = row['Acession_number']
    url = f"{base_url}{cik}/{accession_number}/primary_doc.xml"
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            xml_data_list.append(response.text)
        else:
            xml_data_list.append(f"Failed for {url} with status code {response.status_code}")
    except Exception as e:
        xml_data_list.append(f"Error for {url}: {str(e)}")


import xml.etree.ElementTree as ET
import plotly.express as px
import plotly.graph_objects as go
def xml_parse(root):
    def extract_text(element, path):
        found = element.find(path)
        return found.text if found is not None else None

    # Parse primary issuer data
    primary_issuer = root.find(".//primaryIssuer")
    if primary_issuer is None:
        return None
    data_primary_issuer = {
        "Central Index Key": extract_text(primary_issuer, ".//cik"),
        "Name of the Entity": extract_text(primary_issuer, ".//entityName"),
        "Address": extract_text(primary_issuer, ".//street1"),
        "Address - Specification": extract_text(primary_issuer, ".//street2"),
        "City": extract_text(primary_issuer, ".//city"),
        "State or Country - Physical Location": extract_text(primary_issuer, ".//stateOrCountry"),
        "State or Country - Full - Physical Location": extract_text(primary_issuer, ".//stateOrCountryDescription"),
        "Zip Code / Postal Code": extract_text(primary_issuer, ".//zipCode"),
        "Issuer Phone Number": extract_text(primary_issuer, ".//issuerPhoneNumber"),
        "State or Country - Legal Jurisdiction": extract_text(primary_issuer, ".//jurisdictionOfInc"),
        "Previous Names Used by the Issuer": extract_text(primary_issuer, ".//issuerPreviousNameList/value"),
        "Previous Names from EDGAR": extract_text(primary_issuer, ".//edgarPreviousNameList/value"),
        "Type of the Entity": extract_text(primary_issuer, ".//entityType"),
        "Type of the Entity - Specification": extract_text(primary_issuer, ".//entityTypeOtherDesc"),
    }

    related_persons = root.findall(".//relatedPersonsList/relatedPersonInfo")
    data_related_persons = {}
    for i, related_person in enumerate(related_persons, start=1):
        data_related_persons[f"First Name - Person {i}"] = extract_text(related_person, ".//relatedPersonName/firstName"),
        data_related_persons[f"Last Name - Person {i}"] = extract_text(related_person, ".//relatedPersonName/lastName"),
        data_related_persons[f"Associated Address - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/street1"),
        data_related_persons[f"Associated Address - Specification - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/street2"),
        data_related_persons[f"Associated City - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/city"),
        data_related_persons[f"Associated State or Country - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/stateOrCountry"),
        data_related_persons[f"Associated State or Country - Full - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/stateOrCountryDescription"),
        data_related_persons[f"Associated Zip Code - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/zipCode"),

        relationships = related_person.findall(".//relatedPersonRelationshipList/relationship")
        for j, relationship in enumerate(relationships, start=1):
            data_related_persons[f"Relationship {j} with the Entity - Person {i}"] = relationship.text if relationship is not None else None

        data_related_persons[f"Clarification of the relationship - Person {i}"] = extract_text(related_person, ".//relationshipClarification"),

    offering_data = root.find(".//offeringData")
    data_offering_data = {
        "Industry": extract_text(offering_data, ".//industryGroup/industryGroupType"),
        "Revenue Range of the Entity": extract_text(offering_data, ".//issuerSize/revenueRange"),
        "Federal Exemption or Exclusions Claimed by the Entity": ", ".join([item.text for item in offering_data.findall(".//federalExemptionsExclusions/item") if item is not None]) if offering_data is not None else None,
        "Is the Entry an Amendment to Another Filing?": extract_text(offering_data, ".//typeOfFiling/newOrAmendment/isAmendment"),
        "Date of the First Sale Under this Filing": extract_text(offering_data, ".//typeOfFiling/dateOfFirstSale/value"),
        "Is the Offering Duration Intended to be More Than a Year?": extract_text(offering_data, ".//durationOfOffering/moreThanOneYear"),
        "Is the Entity Offering Equity": extract_text(offering_data, ".//typesOfSecuritiesOffered/isEquityType"),
        "Is the Entity Creating or Managing a Pooled Fund": extract_text(offering_data, ".//typesOfSecuritiesOffered/isPooledInvestmentFundType"),
        "Is the Offering Part of a Business Combination Transaction?": extract_text(offering_data, ".//isPooledInvestmentFundType/isBusinessCombinationTransaction"),
        "Is the Offering Part of a Business Combination Transaction? - Clarification": extract_text(offering_data, ".//businessCombinationTransaction/clarificationOfResponse"),
        "Minimum Investment Amount Accepted": extract_text(offering_data, ".//minimumInvestmentAccepted"),
        #"Entities Compensated for Selling the Securities": extract_text(offering_data, ".//salesCompensationList"),
        # The line above is commented out because we don't see much use in it
        "Total Amount Offered": extract_text(offering_data, ".//offeringSalesAmounts/totalOfferingAmount"),
        "Total Amount Sold So Far": extract_text(offering_data, ".//offeringSalesAmounts/totalAmountSold"),
        "Total Amount Remaining to be sold": extract_text(offering_data, ".//offeringSalesAmounts/totalRemaining"),
        "Clarifications Regarding the Offering and Sales amounts": extract_text(offering_data, ".//offeringSalesAmounts/clarificationOfResponse"),
        "Are Non-accredited Investors Participating in the Offering?": extract_text(offering_data, ".//investors/hasNonAccreditedInvestors"),
        "Number of Investors Who Already Invested Under Current Offering": extract_text(offering_data, ".//investors/totalNumberAlreadyInvested"),
        "Commision Amount Paid for Sales": extract_text(offering_data, ".//salesCommissionsFindersFees/salesCommissions/dollarAmount"),
        "Finder's Fees Amount Paid": extract_text(offering_data, ".//salesCommissionsFindersFees/findersFees/dollarAmount"),
        "Clarification Regarding the Finder's Fees Amount Paid": extract_text(offering_data, ".//salesCommissionsFindersFees/clarificationOfResponse"),
        "Amount of the Raised Capital Already Used for a Purpose": extract_text(offering_data, ".//useOfProceeds/grossProceedsUsed/dollarAmount"),
        "Clarification Regarding the Amount of the Raised Capital Already Used for a Purpose": extract_text(offering_data, ".//useOfProceeds/clarificationOfResponse"),
        "Name of the Entity Representative Signing the Filing": extract_text(offering_data, ".//signatureBlock/authorizedRepresentative"),
        "Name of the Issuer Entity": extract_text(offering_data, ".//signatureBlock/signature/issuerName"),
        "Name of the Person Signing": extract_text(offering_data, ".//signatureBlock/signature/signatureName"),
        "Name of the Person Signing - Affirmation": extract_text(offering_data, ".//signatureBlock/signature/nameOfSigner"),
        "Title of the Person Signing": extract_text(offering_data, ".//signatureBlock/signature/signatureTitle"),
        "Date the Form Was Signed": extract_text(offering_data, ".//signatureBlock/signature/signatureDate")
    }
    # Combine all data
    data = {**data_primary_issuer, **data_related_persons, **data_offering_data}
    return data

data_fin = []

for information in xml_data_list:
    root = ET.fromstring(information)
    data_fin.append(xml_parse(root))

df_main = pd.DataFrame(data_fin)

df_main = df_main.fillna("N/A")
for column in df_main.columns:
    df_main[column] = df_main[column].apply(lambda x: x[0] if isinstance(x, tuple) else x)

df_main = df_main.replace("None", "N/A")

redundant_strings = ["/s/", "/bem/", "c/o", "n/a"]
for x in redundant_strings:
    df_main = df_main.replace(x, "", regex=True)

df_main = df_main.replace("", "N/A")
df_main = df_main.fillna("N/A")
df_main.to_csv("cleaned_data.csv", index=False)






In [None]:
import os
import sys
import zipfile
import pandas as pd
import requests
import xml.etree.ElementTree as ET
from datetime import datetime
from io import StringIO
import plotly.express as px


def extract_local_zip_data():
    extract_path = "extracted_data"
    if not os.path.exists(extract_path):
        zip_path = "Data.zip"
        os.makedirs(extract_path, exist_ok=True)
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_path)
            print(f"Extracted local data from {zip_path} into {extract_path}.")
        except Exception as e:
            print(f"Error extracting {zip_path}: {e}")
            sys.exit(1)
    else:
        print(f"Data already extracted in {extract_path}.")

extract_local_zip_data()

def greet_user():
    print("Welcome to the SEC Filing D Analysis Tool!")
    print("This project retrieves SEC filing D data and equips the user to explore,")
    print("and analyze the start-up/private capital landscape in the given time period.")
    print("Let's get started.\n")

def get_date_input():
    current_year = datetime.now().year

    while True:
        try:
            sel_year = int(input("Enter the year in which you are interested (years 2014-2024 come imported with the tool, years outside of this range have to be downloaded which can take around 20 minutes): "))
            if sel_year < 2001 or sel_year > current_year:
                raise ValueError("Year must be between 2001 and the current year.")
            break
        except ValueError as e:
            print(f"Invalid input: {e}. Please try again.")
    
    while True:
        try:
            sel_quarter = int(input("Enter the quarter of interest (1-4): "))
            if sel_quarter < 1 or sel_quarter > 4:
                raise ValueError("Quarter must be between 1 and 4.")
            break
        except ValueError as e:
            print(f"Invalid input: {e}. Please try again.")
    
    return sel_year, sel_quarter

sel_year, sel_quarter = get_date_input()


def load_data(sel_year, sel_quarter):
    csv_file_name = f"{sel_year}_QTR{sel_quarter}.csv"
    file_path = os.path.join("extracted_data", csv_file_name)
    
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        return df
    else:
        while True:
            user_input = input("The requested date doesn't exist locally, do you want to download it? (Yes/No): ").strip().lower()
            if user_input in ['yes', 'no']:
                break
            else:
                print("Invalid input. Please enter 'Yes' or 'No'.")
        
        if user_input == 'no':
            print("Exiting the program as per user request.")
            import sys
            sys.exit(0)
        
        print("Let's download the data.")
        base_full_index_url = 'https://www.sec.gov/Archives/edgar/full-index'
        full_index_url = f'{base_full_index_url}/{sel_year}/QTR{sel_quarter}/company.idx'

        headers = {
            'User-Agent': 'Test (test@test.com)',
            'Accept-Encoding': 'gzip, deflate',
            'Host': 'www.sec.gov',
            'Connection': 'keep-alive',
        }
        response = requests.get(full_index_url, headers=headers)

        if response.status_code == 403:
            print("Access denied. Ensure you have a proper User-Agent header.")
            import sys
            sys.exit(0)

        response.raise_for_status()
        content = response.text
        lines = content.splitlines()

        header_cols = ["Company Name", "Form Type", "CIK", "Date Filed", "File Name"]
        records = []
        data_lines = lines[8:]
        for line in data_lines:
            if line.strip(): 
                company_name = line[0:59].strip()
                form_type = line[59:72].strip()
                cik = line[72:87].strip()
                date_filed = line[87:102].strip()
                file_name = line[102:].strip()
                records.append([company_name, form_type, cik, date_filed, file_name])
        df_fillings = pd.DataFrame(records, columns=header_cols)
        form_d_df = df_fillings[df_fillings["Form Type"] == "D"]
        
        form_d_df['Acession_number'] = form_d_df['File Name'].str.split('/').str[-1] \
            .str.replace('-', '').str.replace('.txt', '', regex=False)
        base_url = "https://www.sec.gov/Archives/edgar/data/"
        headers = {
            'User-Agent': 'Test (test@test.com)',
            'Accept-Encoding': 'gzip, deflate',
            'Host': 'www.sec.gov',
            'Connection': 'keep-alive',
        }
        xml_data_list = []
        for index, row in form_d_df.iterrows():
            cik = row['CIK']
            accession_number = row['Acession_number']
            url = f"{base_url}{cik}/{accession_number}/primary_doc.xml"
            try:
                resp = requests.get(url, headers=headers)
                if resp.status_code == 200:
                    xml_data_list.append(resp.text)
                else:
                    xml_data_list.append(f"Failed for {url} with status code {resp.status_code}")
            except Exception as e:
                xml_data_list.append(f"Error for {url}: {str(e)}")
        
        def xml_parse(root):
            def extract_text(element, path):
                found = element.find(path)
                return found.text if found is not None else None

            primary_issuer = root.find(".//primaryIssuer")
            if primary_issuer is None:
                return None
            data_primary_issuer = {
                "Central Index Key": extract_text(primary_issuer, ".//cik"),
                "Name of the Entity": extract_text(primary_issuer, ".//entityName"),
                "Address": extract_text(primary_issuer, ".//street1"),
                "Address - Specification": extract_text(primary_issuer, ".//street2"),
                "City": extract_text(primary_issuer, ".//city"),
                "State or Country - Physical Location": extract_text(primary_issuer, ".//stateOrCountry"),
                "State or Country - Full - Physical Location": extract_text(primary_issuer, ".//stateOrCountryDescription"),
                "Zip Code / Postal Code": extract_text(primary_issuer, ".//zipCode"),
                "Issuer Phone Number": extract_text(primary_issuer, ".//issuerPhoneNumber"),
                "State or Country - Legal Jurisdiction": extract_text(primary_issuer, ".//jurisdictionOfInc"),
                "Previous Names Used by the Issuer": extract_text(primary_issuer, ".//issuerPreviousNameList/value"),
                "Previous Names from EDGAR": extract_text(primary_issuer, ".//edgarPreviousNameList/value"),
                "Type of the Entity": extract_text(primary_issuer, ".//entityType"),
                "Type of the Entity - Specification": extract_text(primary_issuer, ".//entityTypeOtherDesc"),
            }
            related_persons = root.findall(".//relatedPersonsList/relatedPersonInfo")
            data_related_persons = {}
            for i, related_person in enumerate(related_persons, start=1):
                data_related_persons[f"First Name - Person {i}"] = extract_text(related_person, ".//relatedPersonName/firstName")
                data_related_persons[f"Last Name - Person {i}"] = extract_text(related_person, ".//relatedPersonName/lastName")
                data_related_persons[f"Associated Address - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/street1")
                data_related_persons[f"Associated Address - Specification - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/street2")
                data_related_persons[f"Associated City - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/city")
                data_related_persons[f"Associated State or Country - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/stateOrCountry")
                data_related_persons[f"Associated State or Country - Full - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/stateOrCountryDescription")
                data_related_persons[f"Associated Zip Code - Person {i}"] = extract_text(related_person, ".//relatedPersonAddress/zipCode")
                relationships = related_person.findall(".//relatedPersonRelationshipList/relationship")
                for j, relationship in enumerate(relationships, start=1):
                    data_related_persons[f"Relationship {j} with the Entity - Person {i}"] = relationship.text if relationship is not None else None
                data_related_persons[f"Clarification of the relationship - Person {i}"] = extract_text(related_person, ".//relationshipClarification")
            offering_data = root.find(".//offeringData")
            data_offering_data = {
                "Industry": extract_text(offering_data, ".//industryGroup/industryGroupType"),
                "Revenue Range of the Entity": extract_text(offering_data, ".//issuerSize/revenueRange"),
                "Federal Exemption or Exclusions Claimed by the Entity": ", ".join([item.text for item in offering_data.findall(".//federalExemptionsExclusions/item") if item is not None]) if offering_data is not None else None,
                "Is the Entry an Amendment to Another Filing?": extract_text(offering_data, ".//typeOfFiling/newOrAmendment/isAmendment"),
                "Date of the First Sale Under this Filing": extract_text(offering_data, ".//typeOfFiling/dateOfFirstSale/value"),
                "Is the Offering Duration Intended to be More Than a Year?": extract_text(offering_data, ".//durationOfOffering/moreThanOneYear"),
                "Is the Entity Offering Equity": extract_text(offering_data, ".//typesOfSecuritiesOffered/isEquityType"),
                "Is the Entity Creating or Managing a Pooled Fund": extract_text(offering_data, ".//typesOfSecuritiesOffered/isPooledInvestmentFundType"),
                "Is the Offering Part of a Business Combination Transaction?": extract_text(offering_data, ".//isPooledInvestmentFundType/isBusinessCombinationTransaction"),
                "Is the Offering Part of a Business Combination Transaction? - Clarification": extract_text(offering_data, ".//businessCombinationTransaction/clarificationOfResponse"),
                "Minimum Investment Amount Accepted": extract_text(offering_data, ".//minimumInvestmentAccepted"),
                "Total Amount Offered": extract_text(offering_data, ".//offeringSalesAmounts/totalOfferingAmount"),
                "Total Amount Sold So Far": extract_text(offering_data, ".//offeringSalesAmounts/totalAmountSold"),
                "Total Amount Remaining to be sold": extract_text(offering_data, ".//offeringSalesAmounts/totalRemaining"),
                "Clarifications Regarding the Offering and Sales amounts": extract_text(offering_data, ".//offeringSalesAmounts/clarificationOfResponse"),
                "Are Non-accredited Investors Participating in the Offering?": extract_text(offering_data, ".//investors/hasNonAccreditedInvestors"),
                "Number of Investors Who Already Invested Under Current Offering": extract_text(offering_data, ".//investors/totalNumberAlreadyInvested"),
                "Commision Amount Paid for Sales": extract_text(offering_data, ".//salesCompensationsFindersFees/salesCommissions/dollarAmount"),
                "Finder's Fees Amount Paid": extract_text(offering_data, ".//salesCompensationsFindersFees/findersFees/dollarAmount"),
                "Clarification Regarding the Finder's Fees Amount Paid": extract_text(offering_data, ".//salesCompensationsFindersFees/clarificationOfResponse"),
                "Amount of the Raised Capital Already Used for a Purpose": extract_text(offering_data, ".//useOfProceeds/grossProceedsUsed/dollarAmount"),
                "Clarification Regarding the Amount of the Raised Capital Already Used for a Purpose": extract_text(offering_data, ".//useOfProceeds/clarificationOfResponse"),
                "Name of the Entity Representative Signing the Filing": extract_text(offering_data, ".//signatureBlock/authorizedRepresentative"),
                "Name of the Issuer Entity": extract_text(offering_data, ".//signatureBlock/signature/issuerName"),
                "Name of the Person Signing": extract_text(offering_data, ".//signatureBlock/signature/signatureName"),
                "Name of the Person Signing - Affirmation": extract_text(offering_data, ".//signatureBlock/signature/nameOfSigner"),
                "Title of the Person Signing": extract_text(offering_data, ".//signatureBlock/signature/signatureTitle"),
                "Date the Form Was Signed": extract_text(offering_data, ".//signatureBlock/signature/signatureDate")
            }
            data = {**data_primary_issuer, **data_related_persons, **data_offering_data}
            return data


        parsed_data = []
        for information in xml_data_list:
            try:
                root = ET.fromstring(information)
                parsed = xml_parse(root)
                if parsed is not None:
                    parsed_data.append(parsed)
            except Exception as e:
                print("Error parsing XML:", e)
        df = pd.DataFrame(parsed_data)
        df = df.fillna("N/A")
        for column in df.columns:
            df[column] = df[column].apply(lambda x: x[0] if isinstance(x, tuple) else x)
        df = df.replace("None", "N/A")
        redundant_strings = ["/s/", "/bem/", "c/o", "n/a"]
        for x in redundant_strings:
            df = df.replace(x, "", regex=True)
        df = df.replace("", "N/A")
        df = df.fillna("N/A")
        
        df.to_csv(file_path, index=False)
        print(f"New data for quarter {sel_quarter}, year {sel_year} saved to {file_path}")
        return df
df = load_data(sel_year, sel_quarter)

