In [1]:
import pandas as pd
import polars as pl
from data import *
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import pyarrow 
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import os
import numpy as np
from scipy import stats
from typing import cast
from utils import *
import wrds

In [2]:
db = wrds.Connection(
    wrds_username='jayati'
)


Loading library list...
Done


In [3]:
full_data_imputed = pl.read_parquet("data/3_2_imputed_dataset.parquet")

In [4]:
full_data_imputed.height

5001719

In [7]:
crsp_monthly_query = """
SELECT msf.permno, msf.date, ssih.siccd
FROM crsp.msf AS msf
LEFT JOIN crsp.stksecurityinfohist AS ssih
ON msf.permno = ssih.permno
AND ssih.secinfostartdt <= msf.date AND msf.date <= ssih.secinfoenddt
WHERE msf.date BETWEEN '1925-12-01' AND '2023-12-01'
"""
crsp_monthly_data = db.raw_sql(crsp_monthly_query)

crsp_monthly_df = pl.from_pandas(crsp_monthly_data)

crsp_monthly_df.select(['permno', 'date', 'siccd']).write_csv('data/crsp_monthly_with_industries.csv')


In [8]:
full_data_imputed.head()

permno,yyyymm,date,date_right,AM,Accruals,AnnouncementReturn,AssetGrowth,BM,BMdec,BPEBM,Beta,BetaFP,BetaLiquidityPS,BidAskSpread,BookLeverage,CF,CashProd,ChEQ,ChInv,ChInvIA,ChNNCOA,ChNWC,ChTax,ConvDebt,CoskewACX,Coskewness,DebtIssuance,DelCOA,DelCOL,DelEqu,DelFINL,DelLTI,DelNetFin,DivInit,DivOmit,DolVol,…,NOA,NetDebtFinance,NetEquityFinance,NumEarnIncrease,OPLeverage,PctAcc,PriceDelayRsq,PriceDelaySlope,PriceDelayTstat,RDIPO,RealizedVol,ResidualMomentum,ReturnSkew,ReturnSkew3F,RoE,SP,ShareIss1Y,ShareIss5Y,ShareRepurchase,Spinoff,Tax,TotalAccruals,VolMkt,VolSD,VolumeTrend,XFIN,betaVIX,cfp,dNoa,hire,zerotrade,zerotradeAlt1,zerotradeAlt12,STreversal,Price,Size,ret
i64,i64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
10000,198601,"""1986-01-01""","""1986-01-31""",16.202734,0.001839,-0.035723,-0.083313,0.21025,1.404658,0.225389,1.025208,0.680467,0.121024,0.019758,-2.326463,0.155227,26.920122,-1.14218,-0.009633,0.135743,-0.005331,-0.00193,-0.00254,-0.1,0.031704,0.208924,-0.7,-0.025725,-0.004444,-0.069119,-0.026367,-0.0165,-0.021856,0.0,0.0,2.203245,…,-0.618638,-0.010772,-0.007077,0.0,0.404598,0.178823,0.843958,-2.272196,-0.249299,0.0,-0.065278,-0.026406,-2.23832,-2.103353,0.059981,2.454645,-0.040535,-0.148483,0.3,0.0,0.760982,-0.017584,-0.027994,-1.680641,0.023985,-0.03372,-0.005234,0.069862,-0.047715,0.0,6.591505,6.951966,5.942311,0.0,-1.475907,-9.686575,0.707317
10000,198602,"""1986-02-01""","""1986-02-28""",6.419575,0.023401,-0.024668,-0.053147,-0.578258,1.615003,-0.074305,1.575849,0.897772,0.237806,0.026047,-5.2407,-0.029837,59.399372,-1.196283,0.027234,1.103777,-0.010218,0.003149,-0.019136,-0.1,0.138626,0.213954,-0.7,-0.024757,-0.003946,-0.047217,-0.007605,-0.019091,-0.000286,0.0,0.0,0.619522,…,-0.551703,-0.010474,-0.02755,0.0,0.408773,1.44153,0.874593,-0.377262,0.373869,0.0,-0.031004,0.240515,0.729555,0.810187,-0.269592,2.172496,-0.35988,-1.540913,0.2,0.0,0.9,-0.000392,-0.080506,-0.652343,-0.016094,-0.046509,-0.003488,0.16898,-0.063516,0.0,0.69714,4.7852e-08,0.967149,25.7143,-1.178655,-9.389323,-0.257143
10000,198603,"""1986-03-01""","""1986-03-31""",9.672034,0.01986,0.028711,-0.298639,-0.365597,0.83168,-0.471459,1.121976,0.7307,0.379325,0.017515,-3.163787,0.103764,40.948347,-1.24629,0.00244,1.388678,0.011832,-0.005521,-0.002551,0.0,0.065951,0.13381,-0.5,-0.025243,-0.018863,-0.069383,-0.035775,-0.016708,-0.04417,0.0,0.0,0.255134,…,-0.504815,-0.021901,-0.086281,0.0,0.166429,3.065052,0.832632,5.132098,3.2198343,0.0,-0.044548,0.30884,-1.6879,-1.038325,-0.012496,1.240049,-0.142092,-1.21123,0.2,0.0,0.82922,-0.004646,-0.048295,-0.203063,-0.010171,-0.168674,-0.002715,0.256035,-0.089197,0.0,0.522273,1.0234e-07,1.036358,-36.5385,-1.490091,-9.700759,0.365385
10000,198604,"""1986-04-01""","""1986-04-30""",34.015608,0.005994,-0.02554,-0.086369,0.401779,3.266376,-2.110798,1.148184,0.708417,-0.03177,0.01237,-2.37108,0.264075,17.657581,-1.178537,-0.005092,0.852608,-0.004871,-0.00011,-0.006269,0.0,0.149604,0.326355,-0.5,-0.029283,-0.006735,-0.070258,-0.030067,-0.017173,-0.035524,0.0,0.0,1.3126723,…,-0.6425,-0.023357,0.000215,0.0,0.413942,0.188181,0.811953,-2.28961,-0.412962,0.0,-0.011246,0.097471,0.276167,0.068097,0.144804,6.422166,0.088618,-0.37541,0.2,0.0,0.853708,-0.015898,-0.039136,-2.814366,0.00728,-0.011929,0.000877,0.084696,-0.051597,0.0,2.197636,7.4675e-08,2.679387,9.8592,-1.386294,-9.627207,-0.098592
10000,198605,"""1986-05-01""","""1986-05-30""",8.585498,0.012495,-0.045491,-0.016131,-0.351601,2.188701,-0.689932,1.625661,1.082514,0.110916,0.023352,-4.313698,0.00277,31.042189,-1.03832,0.030501,-0.817816,-0.013207,-0.006578,-0.004206,-0.1,0.201706,0.488893,-0.7,-0.017447,0.000681,-0.038604,-0.020003,-0.003672,-0.01622,0.0,0.0,0.737386,…,-0.457639,-0.020878,-0.007652,0.0,0.269973,4.040911,0.829054,1.22417,1.401425,0.0,-0.038863,0.135452,-1.289032,-0.738724,-0.033431,7.921766,-0.005041,0.052538,0.0,0.0,0.810389,-0.033585,-0.069511,-0.230249,-0.002305,-0.030054,0.008818,0.363375,-0.053943,0.0,0.515208,7.6496e-08,0.367705,22.2656,-1.134423,-9.375336,-0.222656


In [9]:
crsp_monthly_df.head()

permno,date,siccd
i64,date,f64
10000,1985-12-31,
10000,1986-01-31,3990.0
10000,1986-02-28,3990.0
10000,1986-03-31,3990.0
10000,1986-04-30,3990.0


In [10]:
crsp_monthly_df.height

5027863

In [11]:
full_data_imputed = full_data_imputed.with_columns([
    pl.col('permno').cast(pl.Int64),
    pl.col('date').cast(pl.Date)
])

crsp_monthly_df = crsp_monthly_df.with_columns([
    pl.col('permno').cast(pl.Int64),
    pl.col('date').cast(pl.Date)
])

# Extract year and month from the date columns
full_data_imputed = full_data_imputed.with_columns([
    pl.col('date').dt.year().alias('year'),
    pl.col('date').dt.month().alias('month')
])

crsp_monthly_df = crsp_monthly_df.with_columns([
    pl.col('date').dt.year().alias('year'),
    pl.col('date').dt.month().alias('month')
])

# Perform the join operation on permno, year, and month
full_data_imputed = full_data_imputed.join(
    crsp_monthly_df.select(['permno', 'year', 'month', 'siccd']),
    on=['permno', 'year', 'month'],
    how='left'
)

print(full_data_imputed.head())

shape: (5, 103)
┌────────┬────────┬────────────┬────────────┬───┬───────────┬──────┬───────┬────────┐
│ permno ┆ yyyymm ┆ date       ┆ date_right ┆ … ┆ ret       ┆ year ┆ month ┆ siccd  │
│ ---    ┆ ---    ┆ ---        ┆ ---        ┆   ┆ ---       ┆ ---  ┆ ---   ┆ ---    │
│ i64    ┆ i64    ┆ date       ┆ str        ┆   ┆ f64       ┆ i32  ┆ i8    ┆ f64    │
╞════════╪════════╪════════════╪════════════╪═══╪═══════════╪══════╪═══════╪════════╡
│ 10000  ┆ 198601 ┆ 1986-01-01 ┆ 1986-01-31 ┆ … ┆ 0.707317  ┆ 1986 ┆ 1     ┆ 3990.0 │
│ 10000  ┆ 198602 ┆ 1986-02-01 ┆ 1986-02-28 ┆ … ┆ -0.257143 ┆ 1986 ┆ 2     ┆ 3990.0 │
│ 10000  ┆ 198603 ┆ 1986-03-01 ┆ 1986-03-31 ┆ … ┆ 0.365385  ┆ 1986 ┆ 3     ┆ 3990.0 │
│ 10000  ┆ 198604 ┆ 1986-04-01 ┆ 1986-04-30 ┆ … ┆ -0.098592 ┆ 1986 ┆ 4     ┆ 3990.0 │
│ 10000  ┆ 198605 ┆ 1986-05-01 ┆ 1986-05-30 ┆ … ┆ -0.222656 ┆ 1986 ┆ 5     ┆ 3990.0 │
└────────┴────────┴────────────┴────────────┴───┴───────────┴──────┴───────┴────────┘


In [12]:
sic_groups = [
    ('Agriculture, Forestry, and Fishing', range(100, 1000)),
    ('Mining', range(1000, 1500)),
    ('Construction', range(1500, 1800)),
    ('Manufacturing', range(2000, 4000)),
    ('Transportation, Communications, Electric, Gas, and Sanitary Services', range(4000, 5000)),
    ('Wholesale Trade', range(5000, 5200)),
    ('Retail Trade', range(5200, 6000)),
    ('Finance, Insurance, and Real Estate', range(6000, 6800)),
    ('Services', range(7000, 9000)),
    ('Public Administration', range(9000, 10000))
]

# Flatten the SIC groups into a dictionary for fast lookup
sic_dict = {}
for industry, sic_range in sic_groups:
    for sic in sic_range:
        sic_dict[sic] = industry

# Create a Polars expression to map the SIC codes to industries
def map_sic_to_industry(sic):
    return sic_dict.get(sic, 'Unknown')

# Apply the mapping and count the number of stocks in each industry group
industry_counts = (full_data_imputed
                   .with_columns(pl.col('siccd').map_elements(map_sic_to_industry, return_dtype=pl.Utf8).alias('industry'))
                   .group_by('industry')
                   .agg(pl.count('permno').alias('count')))

print(industry_counts)


shape: (12, 2)
┌─────────────────────────────────┬─────────┐
│ industry                        ┆ count   │
│ ---                             ┆ ---     │
│ str                             ┆ u32     │
╞═════════════════════════════════╪═════════╡
│ Services                        ┆ 512483  │
│ Retail Trade                    ┆ 250349  │
│ Mining                          ┆ 261277  │
│ Agriculture, Forestry, and Fis… ┆ 13619   │
│ Finance, Insurance, and Real E… ┆ 1348191 │
│ …                               ┆ …       │
│ Transportation, Communications… ┆ 386362  │
│ Construction                    ┆ 43816   │
│ Public Administration           ┆ 165450  │
│ Wholesale Trade                 ┆ 136772  │
│ Manufacturing                   ┆ 1745782 │
└─────────────────────────────────┴─────────┘


In [18]:
full_data_industries = full_data_imputed.with_columns(
    pl.col('siccd').map_elements(map_sic_to_industry).alias('industry')
)

  full_data_industries = full_data_imputed.with_columns(


In [19]:
full_data_imputed.height

5001719

In [24]:
# Ensure the date column is of the correct type
full_data_reduced = full_data_industries.with_columns([
    pl.col('date').cast(pl.Date)
])

# Filter out rows with dates before 1990
full_data_reduced = full_data_reduced.filter(pl.col('date').dt.year() >= 2000)


print(full_data_reduced.head())

shape: (5, 104)
┌────────┬────────┬────────────┬────────────┬───┬──────┬───────┬────────┬─────────────────┐
│ permno ┆ yyyymm ┆ date       ┆ date_right ┆ … ┆ year ┆ month ┆ siccd  ┆ industry        │
│ ---    ┆ ---    ┆ ---        ┆ ---        ┆   ┆ ---  ┆ ---   ┆ ---    ┆ ---             │
│ i64    ┆ i64    ┆ date       ┆ str        ┆   ┆ i32  ┆ i8    ┆ f64    ┆ str             │
╞════════╪════════╪════════════╪════════════╪═══╪══════╪═══════╪════════╪═════════════════╡
│ 10001  ┆ 200001 ┆ 2000-01-01 ┆ 2000-01-31 ┆ … ┆ 2000 ┆ 1     ┆ 4920.0 ┆ Transportation, │
│        ┆        ┆            ┆            ┆   ┆      ┆       ┆        ┆ Communications… │
│ 10001  ┆ 200002 ┆ 2000-02-01 ┆ 2000-02-29 ┆ … ┆ 2000 ┆ 2     ┆ 4920.0 ┆ Transportation, │
│        ┆        ┆            ┆            ┆   ┆      ┆       ┆        ┆ Communications… │
│ 10001  ┆ 200003 ┆ 2000-03-01 ┆ 2000-03-31 ┆ … ┆ 2000 ┆ 3     ┆ 4920.0 ┆ Transportation, │
│        ┆        ┆            ┆            ┆   ┆      ┆       ┆

In [27]:
full_data_reduced.drop("year", "month")

permno,yyyymm,date,date_right,AM,Accruals,AnnouncementReturn,AssetGrowth,BM,BMdec,BPEBM,Beta,BetaFP,BetaLiquidityPS,BidAskSpread,BookLeverage,CF,CashProd,ChEQ,ChInv,ChInvIA,ChNNCOA,ChNWC,ChTax,ConvDebt,CoskewACX,Coskewness,DebtIssuance,DelCOA,DelCOL,DelEqu,DelFINL,DelLTI,DelNetFin,DivInit,DivOmit,DolVol,…,NetEquityFinance,NumEarnIncrease,OPLeverage,PctAcc,PriceDelayRsq,PriceDelaySlope,PriceDelayTstat,RDIPO,RealizedVol,ResidualMomentum,ReturnSkew,ReturnSkew3F,RoE,SP,ShareIss1Y,ShareIss5Y,ShareRepurchase,Spinoff,Tax,TotalAccruals,VolMkt,VolSD,VolumeTrend,XFIN,betaVIX,cfp,dNoa,hire,zerotrade,zerotradeAlt1,zerotradeAlt12,STreversal,Price,Size,ret,siccd,industry
i64,i64,date,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
10001,200001,2000-01-01,"""2000-01-31""",2.1733375,0.108981,-0.033873,-0.019392,-0.445696,0.752592,-1.907572,0.103002,0.091242,0.016507,0.031007,-2.466956,0.164873,103.34845,-1.05628,0.074093,0.6538,0.014184,0.072073,0.000448,0.0,0.123057,0.073287,0.0,0.024853,-0.044573,-0.016826,0.043476,0.000187,0.043289,0.0,0.0,0.938377,…,0.02357,2.0,1.1216513,2.953371,0.772187,-4.625865,3.7699442,0.0,-0.025284,-0.293568,-0.049701,-0.007985,0.117278,2.6856389,-0.019559,-0.101619,0.0,0.0,0.478891,-0.009637,-0.023017,-0.025291,-0.013213,0.066246,-0.000971,0.315177,0.030867,-0.048,2.7886364,3.818182,3.1109962,4.4118,-2.094946,-9.898789,-0.044118,4920.0,"""Transportation, Communications…"
10001,200002,2000-02-01,"""2000-02-29""",2.1404083,0.108981,0.013153,-0.019392,-0.445696,0.752592,-1.778563,0.104569,0.086462,0.017932,0.017571,-2.466956,0.162375,101.99336,-1.05628,0.074093,0.648363,0.014184,0.072073,0.000448,0.0,0.07828,0.085667,0.0,0.024853,-0.044573,-0.016826,0.043476,0.000187,0.043289,0.0,0.0,1.2864491,…,0.02357,2.0,1.1216513,2.953371,0.772187,-4.625865,3.7699442,0.0,-0.012606,-0.18632,-0.645254,-0.593679,0.117278,2.6449475,-0.019559,-0.101619,0.0,0.0,0.478891,-0.009637,-0.022175,-0.024616,-0.011518,0.066246,-0.00774,0.310402,0.030867,-0.048,3.1863637,1.0500002,2.8300753,-1.5385,-2.110213,-9.914057,0.015385,4920.0,"""Transportation, Communications…"
10001,200003,2000-03-01,"""2000-03-31""",2.1947544,0.108981,0.013153,-0.019392,-0.445696,0.752592,-2.003696,0.107111,0.087634,0.009077,0.024509,-2.466956,0.166498,104.20796,-1.05628,0.074093,0.652208,0.014184,0.072073,-0.003487,0.0,0.089577,0.089791,0.0,0.024853,-0.044573,-0.016826,0.043476,0.000187,0.043289,0.0,0.0,1.1164581,…,0.02357,0.0,1.1216513,2.953371,0.772187,-4.625865,3.7699442,0.0,-0.026537,-0.120605,-0.232345,-0.22865,0.117278,2.7121043,-0.017865,-0.098039,0.0,0.0,0.478891,-0.009637,-0.02242,-0.024579,-0.010919,0.066246,-0.006041,0.318283,0.030867,-0.048,3.5994072,5.2500005,2.8070488,1.5758,-2.079442,-9.888983,-0.015288,4920.0,"""Transportation, Communications…"
10001,200004,2000-04-01,"""2000-04-28""",2.1693325,0.108981,0.013153,-0.019392,-0.445696,0.752592,-1.890749,0.096826,0.07906,-0.008956,0.024083,-2.466956,0.164569,103.18584,-1.05628,0.074093,0.665905,0.014184,0.072073,-0.003487,0.0,0.248881,0.060906,0.0,0.024853,-0.044573,-0.016826,0.043476,0.000187,0.043289,0.0,0.0,1.6974498,…,0.02357,0.0,1.1216513,2.953371,0.772187,-4.625865,3.7699442,0.0,-0.022609,-0.216381,-0.28195,-0.408123,0.117278,2.6806898,-0.017865,-0.098039,0.0,0.0,0.478891,-0.009637,-0.019742,-0.024804,-0.009217,0.066246,0.003357,0.314597,0.030867,-0.048,3.8362494,5.478261,3.0353098,-1.1719,-2.091092,-9.900633,0.011719,4920.0,"""Transportation, Communications…"
10001,200005,2000-05-01,"""2000-05-31""",2.2207792,0.108981,0.067251,-0.019392,-0.445696,0.752592,-2.136743,0.105629,0.068718,-0.011836,0.028469,-2.466956,0.168472,105.23009,-1.05628,0.074093,0.668498,0.014184,0.072073,-0.003487,0.0,0.272532,0.03478,0.0,0.024853,-0.044573,-0.016826,0.043476,0.000187,0.043289,0.0,0.0,0.54749,…,0.02357,0.0,1.1216513,2.953371,0.772187,-4.625865,3.7699442,0.0,-0.016365,-0.086257,-0.411569,-0.228533,0.117278,2.7442634,-0.017865,-0.098039,0.0,0.0,0.478891,-0.009637,-0.018795,-0.025181,-0.00742,0.066246,-0.002448,0.322057,0.030867,-0.048,5.2453403,4.4210529,3.3203974,2.3166,-2.067654,-9.877195,-0.023166,4920.0,"""Transportation, Communications…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
93436,202308,2023-08-01,"""2023-08-31""",3.018393,0.016862,-0.011255,-0.325232,-0.23777,15.468266,4.57964,0.945858,0.937161,0.252522,0.008235,-1.838476,0.198568,73.605676,-1.480804,-0.098042,-0.35115,0.097266,-0.063707,-0.001368,0.0,0.14591,0.147393,-0.5,-0.129993,-0.096547,-0.200943,0.043262,-0.019852,0.142716,0.4,0.0,-2.02351,…,-0.007489,0.9,0.775863,0.172666,0.886973,0.042267,0.994829,0.0,-0.0219,0.081985,0.120696,-0.035047,0.28087,2.388049,-5.028949,-48.825246,0.0,0.1,0.288081,-0.223286,-0.249337,-8.234653,-0.001069,0.040382,0.000198,0.209852,-0.109511,-0.251513,7.378857,5.330302,6.284016,3.4962,-5.55327,-20.523771,-0.034962,9999.0,"""Public Administration"""
93436,202309,2023-09-01,"""2023-09-29""",2.761402,0.016862,-0.013155,-0.325232,-0.22747,5.53513,1.512429,1.039226,0.795781,0.391505,0.010096,-1.838476,0.194366,51.069172,-1.480804,-0.098042,-0.46235,0.097266,-0.063707,0.001899,0.0,0.153057,0.188822,-0.5,-0.129993,-0.096547,-0.200943,0.043262,-0.019852,0.142716,0.2,0.0,-1.88502,…,-0.007489,0.8,0.775863,0.172666,0.889389,0.264247,1.075372,0.0,-0.021682,0.229201,-0.049516,-0.116717,0.28087,2.457718,-5.409777,-77.23009,0.0,0.1,0.288081,-0.223286,-0.212493,-7.724635,-0.013673,0.040382,0.00046,0.22744,-0.109511,-0.251513,6.029153,5.273759,5.027825,3.0456,-5.522341,-20.494418,-0.030456,9999.0,"""Public Administration"""
93436,202310,2023-10-01,"""2023-10-31""",3.311023,0.016862,-0.017331,-0.325232,-0.929634,1.40785,0.411488,1.401577,0.852444,0.247096,0.01109,-1.838476,0.184978,34.67259,-1.480804,-0.098042,0.009358,0.097266,-0.063707,0.001899,0.0,0.034526,0.080106,-0.5,-0.129993,-0.096547,-0.200943,0.043262,-0.019852,0.142716,0.2,0.0,-2.915823,…,-0.007489,1.2,0.775863,0.172666,0.663017,1.767951,2.640223,0.0,-0.040466,0.090539,0.345609,0.031017,0.28087,2.806346,-4.899305,-13.637891,0.0,0.0,0.288081,-0.223286,-0.509929,-58.974584,-0.012511,0.040382,-0.003676,0.089645,-0.109511,-0.251513,0.705781,0.2,0.698125,19.7346,-5.302509,-20.274561,-0.197346,9999.0,"""Public Administration"""
93436,202311,2023-11-01,"""2023-11-30""",2.891101,0.016862,0.043047,-0.325232,-0.470768,5.577071,0.493668,1.283723,1.522885,0.07037,0.024136,-1.838476,0.222555,21.811001,-1.480804,-0.098042,0.225128,0.097266,-0.063707,0.001899,0.0,0.12171,0.321142,-0.6,-0.129993,-0.096547,-0.200943,0.043262,-0.019852,0.142716,0.2,0.0,-2.935834,…,-0.007489,2.2,0.775863,0.172666,0.736757,2.37049,2.091708,0.0,-0.057991,0.375082,-0.121852,-0.152184,0.28087,2.460464,-4.791239,-83.272601,0.0,0.0,0.288081,-0.223286,-0.130286,-5.426617,-0.017593,0.040382,-0.002853,0.299735,-0.109511,-0.251513,2.502209,2.301306,3.123486,-19.5379,-5.480972,-20.453025,0.195379,9999.0,"""Public Administration"""


In [28]:
full_data_reduced.write_csv('data/Industries_all.csv')

In [29]:
industries = full_data_reduced.select(pl.col('industry')).unique().to_series().to_list()

os.makedirs('data/industry_datasets', exist_ok=True)

for industry in industries:
    if industry is not None:
        industry_data = full_data_reduced.filter(pl.col('industry') == industry)
        industry_filename = f"data/industry_datasets/{industry.replace(' ', '_')}_data.csv"
        industry_data.write_csv(industry_filename)
        print(f"Data for {industry} saved to {industry_filename}")


Data for Wholesale Trade saved to data/industry_datasets/Wholesale_Trade_data.csv
Data for Manufacturing saved to data/industry_datasets/Manufacturing_data.csv
Data for Construction saved to data/industry_datasets/Construction_data.csv
Data for Unknown saved to data/industry_datasets/Unknown_data.csv
Data for Public Administration saved to data/industry_datasets/Public_Administration_data.csv
Data for Retail Trade saved to data/industry_datasets/Retail_Trade_data.csv
Data for Agriculture, Forestry, and Fishing saved to data/industry_datasets/Agriculture,_Forestry,_and_Fishing_data.csv
Data for Services saved to data/industry_datasets/Services_data.csv
Data for Transportation, Communications, Electric, Gas, and Sanitary Services saved to data/industry_datasets/Transportation,_Communications,_Electric,_Gas,_and_Sanitary_Services_data.csv
Data for Finance, Insurance, and Real Estate saved to data/industry_datasets/Finance,_Insurance,_and_Real_Estate_data.csv
Data for Mining saved to data/