In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler


from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import (r2_score)

import warnings
warnings.filterwarnings("ignore")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Fucntions 

In [2]:
def basic_info(df):

    """
    Takes a DataFrame as input, and gives the basic info like shape, missing values count, duplicated rows, unique values and dtypes of the features
    
    Args:
        df (pandas.DataFrame): The DataFrame for which you want the details of

    Returns: 
        None 
    """
    print(f"shape of the date : \n\trows = {df.shape[0]}, columns = {df.shape[1]}\n")
    missing_val_count = df.isna().sum().sum()
    print(f"missing values: \n\tcount = {missing_val_count}")
    if missing_val_count != 0:
        missing_data = df.isna().sum().reset_index().rename({"index" : "feature", 0 : "missing_val_count"}, axis = 1)
        missing_data =  missing_data[missing_data.missing_val_count > 0]
        missing_data["missing_val_percentage"] = np.round((missing_data["missing_val_count"] / df.shape[0]) * 100, 2)
        missing_data = missing_data.sort_values(by = "missing_val_count", ascending = False)
        display(missing_data)

    print(f"duplicated records: \n\tcount = {df.duplicated().sum()}\n")
    print(f"Unique Values : ")
    nunique_vals = df.nunique().reset_index().rename({"index" : "feature", 0 : "nunique_vals"}, axis = 1)
    display(nunique_vals)

    display(df.dtypes.reset_index().rename(columns = {"index" : "fetaure", 0 : "data type"}))

    return missing_data

In [3]:
def num_fillna(df, given_info, feature_median, col):
    required_data = given_info[(given_info.industry == df.industry) & (given_info.sector == df.sector)][col]
    if required_data.shape[0] == 0:
      required_data = given_info[(given_info.industry == df.industry)][col]
      if required_data.shape[0] > 0:
         return required_data.iloc[0]
      required_data = given_info[(given_info.sector == df.sector)][col]
      if required_data.shape[0] > 0:
         return required_data.iloc[0]
      else:
         return feature_median
    return required_data.iloc[0]   

## reading data and finding basic info

In [5]:
train = pd.read_csv("A:\\project\\Synnax Technologies\\train.csv")
test = pd.read_csv("A:\\project\\Synnax Technologies\\test.csv")

train["is_train"] = True
test["is_train"] = False

df = pd.concat([train, test], axis = 0, ignore_index = True)

In [6]:
train.head()

Unnamed: 0,Id,industry,sector,fullTimeEmployees,auditRisk,boardRisk,compensationRisk,shareHolderRightsRisk,overallRisk,trailingPE,...,Q10_NET_INCOME,Q10_GROSS_PROFIT,Q10_COST_OF_REVENUES,Q10_REVENUES,Q10_OPERATING_INCOME,Q10_OPERATING_EXPENSES,Q10_EBITDA,Q10_DEPRECIATION_AND_AMORTIZATION,Q10_fiscal_year_end,is_train
0,196,Personal Services,Consumer Cyclical,1174.0,5.0,10.0,9.0,4.0,8.0,12.549223,...,,23171000.0,54319000.0,40732000.0,2150000.0,5946000.0,17225000.0,15075000.0,0.0,True
1,1568,Building Products & Equipment,Industrials,3600.0,4.0,4.0,3.0,4.0,3.0,1222.0,...,-71929000.0,410574000.0,692688000.0,1103262000.0,-39287000.0,333229000.0,77345000.0,116632000.0,0.0,True
2,1218,,Unknown,,,,,,,,...,603000000.0,1649000000.0,4587000000.0,6236000000.0,663000000.0,214000000.0,1435000000.0,772000000.0,0.0,True
3,23,Scientific & Technical Instruments,Technology,143.0,,,,,,,...,-9715.0,0.0,,,-9683.0,9683.0,-9683.0,0.0,0.0,True
4,783,Drug Manufacturers - Specialty & Generic,Healthcare,36.0,,,,,,1.231544,...,-4676000.0,12050000.0,0.0,12050000.0,-1594000.0,13644000.0,-1594000.0,0.0,0.0,True


In [7]:
test.head()

Unnamed: 0,Id,industry,sector,fullTimeEmployees,auditRisk,boardRisk,compensationRisk,shareHolderRightsRisk,overallRisk,trailingPE,...,Q10_NET_INCOME,Q10_GROSS_PROFIT,Q10_COST_OF_REVENUES,Q10_REVENUES,Q10_OPERATING_INCOME,Q10_OPERATING_EXPENSES,Q10_EBITDA,Q10_DEPRECIATION_AND_AMORTIZATION,Q10_fiscal_year_end,is_train
0,480,Drug Manufacturers - Specialty & Generic,Healthcare,,,,,,,,...,,,,,,,,,0.0,False
1,573,Drug Manufacturers - Specialty & Generic,Healthcare,7600.0,6.0,7.0,6.0,4.0,5.0,90.5,...,,191562000.0,301543000.0,493105000.0,45907000.0,90726000.0,100836000.0,54929000.0,0.0,False
2,1967,Software - Application,Technology,,,,,,,,...,,,,,,,,,0.0,False
3,57,Marine Shipping,Industrials,5200.0,1.0,5.0,4.0,5.0,3.0,23.500002,...,-3375000.0,72793000.0,228158000.0,300951000.0,3164000.0,69629000.0,3164000.0,0.0,0.0,False
4,131,Metal Fabrication,Industrials,6700.0,1.0,4.0,4.0,6.0,3.0,20.748793,...,-7900000.0,85800000.0,606700000.0,692500000.0,31800000.0,54000000.0,31800000.0,0.0,0.0,False


In [8]:
missing_vals_train = basic_info(train)

shape of the date : 
	rows = 1624, columns = 213

missing values: 
	count = 19283


Unnamed: 0,feature,missing_val_count,missing_val_percentage
9,trailingPE,902,55.54
4,auditRisk,493,30.36
5,boardRisk,493,30.36
6,compensationRisk,493,30.36
7,shareHolderRightsRisk,493,30.36
...,...,...,...
59,Q2_TOTAL_ASSETS,1,0.06
65,Q2_TOTAL_LIABILITIES_AND_EQUITY,1,0.06
66,Q2_TOTAL_STOCKHOLDERS_EQUITY,1,0.06
96,Q4_TOTAL_LIABILITIES,1,0.06


duplicated records: 
	count = 0

Unique Values : 


Unnamed: 0,feature,nunique_vals
0,Id,1624
1,industry,113
2,sector,10
3,fullTimeEmployees,971
4,auditRisk,10
...,...,...
208,Q10_OPERATING_EXPENSES,1402
209,Q10_EBITDA,1398
210,Q10_DEPRECIATION_AND_AMORTIZATION,559
211,Q10_fiscal_year_end,2


Unnamed: 0,fetaure,data type
0,Id,int64
1,industry,object
2,sector,object
3,fullTimeEmployees,float64
4,auditRisk,float64
...,...,...
208,Q10_OPERATING_EXPENSES,float64
209,Q10_EBITDA,float64
210,Q10_DEPRECIATION_AND_AMORTIZATION,float64
211,Q10_fiscal_year_end,float64


In [9]:
missing_vals_test = basic_info(test)

shape of the date : 
	rows = 406, columns = 204

missing values: 
	count = 5791


Unnamed: 0,feature,missing_val_count,missing_val_percentage
9,trailingPE,217,53.45
4,auditRisk,132,32.51
5,boardRisk,132,32.51
6,compensationRisk,132,32.51
7,shareHolderRightsRisk,132,32.51
...,...,...,...
107,Q5_TOTAL_LIABILITIES_AND_EQUITY,1,0.25
91,Q4_TOTAL_STOCKHOLDERS_EQUITY,1,0.25
74,Q3_TOTAL_STOCKHOLDERS_EQUITY,1,0.25
125,Q6_TOTAL_STOCKHOLDERS_EQUITY,1,0.25


duplicated records: 
	count = 0

Unique Values : 


Unnamed: 0,feature,nunique_vals
0,Id,406
1,industry,88
2,sector,10
3,fullTimeEmployees,311
4,auditRisk,10
...,...,...
199,Q10_OPERATING_EXPENSES,334
200,Q10_EBITDA,330
201,Q10_DEPRECIATION_AND_AMORTIZATION,140
202,Q10_fiscal_year_end,2


Unnamed: 0,fetaure,data type
0,Id,int64
1,industry,object
2,sector,object
3,fullTimeEmployees,float64
4,auditRisk,float64
...,...,...
199,Q10_OPERATING_EXPENSES,float64
200,Q10_EBITDA,float64
201,Q10_DEPRECIATION_AND_AMORTIZATION,float64
202,Q10_fiscal_year_end,float64


In [10]:
train.describe()

Unnamed: 0,Id,fullTimeEmployees,auditRisk,boardRisk,compensationRisk,shareHolderRightsRisk,overallRisk,trailingPE,forwardPE,floatShares,...,Q10_TOTAL_STOCKHOLDERS_EQUITY,Q10_NET_INCOME,Q10_GROSS_PROFIT,Q10_COST_OF_REVENUES,Q10_REVENUES,Q10_OPERATING_INCOME,Q10_OPERATING_EXPENSES,Q10_EBITDA,Q10_DEPRECIATION_AND_AMORTIZATION,Q10_fiscal_year_end
count,1624.0,1544.0,1131.0,1131.0,1131.0,1131.0,1131.0,722.0,1436.0,1598.0,...,1449.0,1305.0,1423.0,1321.0,1325.0,1427.0,1430.0,1421.0,1419.0,1624.0
mean,1012.267857,9345.583549,5.564103,5.654288,5.797524,5.58267,5.733864,inf,inf,226069400.0,...,2165070000.0,74836230.0,454767600.0,892565300.0,1355588000.0,119598900.0,383976500.0,68864420.0,-51369510.0,0.004926
std,585.242103,34273.510554,2.85026,2.857969,2.839455,2.769783,2.879112,,,3579918000.0,...,8504373000.0,517313400.0,2242473000.0,4225508000.0,5776379000.0,678874900.0,1677897000.0,1775192000.0,1728854000.0,0.070035
min,0.0,1.0,1.0,1.0,1.0,1.0,1.0,8.6e-05,-2085.0,103469.0,...,-18058000000.0,-3944000000.0,-21708000000.0,-1880579000.0,-4500000.0,-4475000000.0,-1439000.0,-50905000000.0,-54182000000.0,0.0
25%,511.75,141.0,3.0,3.0,3.0,3.0,3.0,12.575714,-2.843448,15007460.0,...,22063000.0,-10888000.0,1615874.0,1202000.0,9223996.0,-7866176.0,7696162.0,-10612000.0,0.0,0.0
50%,1009.5,878.0,6.0,6.0,6.0,6.0,6.0,21.599812,9.378077,41034220.0,...,236537000.0,-203624.0,34371000.0,29869000.0,105079000.0,-54815.0,38214000.0,-286224.0,0.0,0.0
75%,1519.25,5040.75,8.0,8.0,8.0,8.0,8.0,37.157093,19.998548,106294900.0,...,997618000.0,24113000.0,210978500.0,311639000.0,603531000.0,42511500.0,174040200.0,37600000.0,0.0,0.0
max,2029.0,500000.0,10.0,10.0,10.0,10.0,10.0,inf,inf,142135700000.0,...,162282900000.0,9497000000.0,29572000000.0,66177000000.0,92400000000.0,11378000000.0,29197000000.0,26568000000.0,25514000000.0,1.0


In [11]:
test.describe()

Unnamed: 0,Id,fullTimeEmployees,auditRisk,boardRisk,compensationRisk,shareHolderRightsRisk,overallRisk,trailingPE,forwardPE,floatShares,...,Q10_TOTAL_STOCKHOLDERS_EQUITY,Q10_NET_INCOME,Q10_GROSS_PROFIT,Q10_COST_OF_REVENUES,Q10_REVENUES,Q10_OPERATING_INCOME,Q10_OPERATING_EXPENSES,Q10_EBITDA,Q10_DEPRECIATION_AND_AMORTIZATION,Q10_fiscal_year_end
count,406.0,384.0,274.0,274.0,274.0,274.0,274.0,189.0,352.0,402.0,...,350.0,318.0,338.0,321.0,321.0,336.0,339.0,336.0,332.0,406.0
mean,1023.428571,8560.729167,5.540146,5.624088,5.923358,5.518248,5.759124,inf,inf,163002000.0,...,3278283000.0,78592570.0,651039200.0,852732000.0,1530937000.0,138191800.0,365195300.0,286412300.0,144951600.0,0.012315
std,590.432542,27004.049445,2.837516,2.835077,2.890803,2.820911,2.884912,,,559985600.0,...,11633430000.0,472926600.0,3871916000.0,3293652000.0,6249743000.0,636069900.0,1096210000.0,3092154000.0,2686686000.0,0.110425
min,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1e-05,-844.5,658244.0,...,-2309400000.0,-1973000000.0,-1181000000.0,-907000000.0,0.0,-1558000000.0,1100.0,-3024000000.0,-4564000000.0,0.0
25%,482.75,114.0,3.0,3.0,3.25,3.0,3.0,11.153847,-2.025302,15648190.0,...,13209930.0,-10989830.0,1317500.0,800000.0,5049000.0,-8700250.0,7633714.0,-12539500.0,0.0,0.0
50%,1045.0,838.0,6.0,6.0,6.0,6.0,6.0,20.783356,9.615789,44199330.0,...,271299000.0,-221000.0,32335000.0,25040000.0,87710000.0,275515.5,46392000.0,-230693.0,0.0,0.0
75%,1538.75,5550.0,8.0,8.0,8.0,8.0,8.0,36.75806,17.707155,111594000.0,...,1357392000.0,32536750.0,281015000.0,384200000.0,699054000.0,62493500.0,181520500.0,50553250.0,0.0,0.0
max,2026.0,339341.0,10.0,10.0,10.0,10.0,10.0,inf,inf,8945318000.0,...,127487100000.0,5105000000.0,62624000000.0,40894000000.0,70196000000.0,6739000000.0,10223000000.0,52401000000.0,45662000000.0,1.0


- there are some outlers,
- and there are some inf values

In [12]:
target_cols = [col for col in train.columns if col not in test.columns]
target_cols

['Q0_TOTAL_ASSETS',
 'Q0_TOTAL_LIABILITIES',
 'Q0_TOTAL_STOCKHOLDERS_EQUITY',
 'Q0_GROSS_PROFIT',
 'Q0_COST_OF_REVENUES',
 'Q0_REVENUES',
 'Q0_OPERATING_INCOME',
 'Q0_OPERATING_EXPENSES',
 'Q0_EBITDA']

In [13]:
train.replace([np.inf, -np.inf], np.nan, inplace=True)
test.replace([np.inf, -np.inf], np.nan, inplace=True)

## Data Preprocessing

In [14]:
missing_cat_cols = []
for col in missing_vals_train.feature:
    if train[col].dtype == "O":
        missing_cat_cols.append(col)
        display(missing_vals_train[missing_vals_train.feature == col])

Unnamed: 0,feature,missing_val_count,missing_val_percentage
1,industry,22,1.35


Unnamed: 0,feature,missing_val_count,missing_val_percentage
20,recommendationKey,20,1.23


Unnamed: 0,feature,missing_val_count,missing_val_percentage
31,financialCurrency,20,1.23


In [15]:
train["financialCurrency"] = train.financialCurrency.fillna("other")
train["sector"] = train.sector.fillna("other")
train["recommendationKey"] = train.recommendationKey.fillna("other")
train["industry"] = train.industry.fillna("other")

test["financialCurrency"] = test.financialCurrency.fillna("other")
test["sector"] = test.sector.fillna("other")
test["recommendationKey"] = test.recommendationKey.fillna("other")
test["industry"] = test.industry.fillna("other")

In [16]:
cat_cols = [col for col in df.columns if df[col].dtype == "O"]
cat_cols = [col for col in df.columns if df[col].dtype == "O"]

for col in cat_cols:
    print(f"number of categories in {col} = {df[col].nunique()}")

number of categories in industry = 113
number of categories in sector = 10
number of categories in recommendationKey = 6
number of categories in financialCurrency = 2


In [17]:
num_cols = train.columns[train.isna().sum() > 0]
given_info = train.groupby(by = ["industry", "sector"])[num_cols].apply("median").reset_index()
given_info.head(10)

Unnamed: 0,industry,sector,fullTimeEmployees,auditRisk,boardRisk,compensationRisk,shareHolderRightsRisk,overallRisk,trailingPE,forwardPE,...,Q10_TOTAL_LIABILITIES_AND_EQUITY,Q10_TOTAL_STOCKHOLDERS_EQUITY,Q10_NET_INCOME,Q10_GROSS_PROFIT,Q10_COST_OF_REVENUES,Q10_REVENUES,Q10_OPERATING_INCOME,Q10_OPERATING_EXPENSES,Q10_EBITDA,Q10_DEPRECIATION_AND_AMORTIZATION
0,Advertising Agencies,Communication Services,891.0,8.5,5.0,8.0,6.0,7.0,14.725728,7.103896,...,822552100.0,240862400.0,-5798578.0,38410000.0,46057500.0,96920000.0,335611.0,48959500.0,-4254098.0,0.0
1,Aerospace & Defense,Industrials,2400.0,6.0,7.0,5.0,6.0,6.0,32.695867,22.926315,...,1056534000.0,364384500.0,4987500.0,40356000.0,189089500.0,417902000.0,5508000.0,23823000.0,7830000.0,0.0
2,Agricultural Inputs,Basic Materials,561.0,7.5,3.5,2.5,2.5,3.0,8.091139,4.637053,...,870953000.0,485594400.0,-2561000.0,22005500.0,69994000.0,88538000.0,551000.0,23667500.0,567500.0,52500.0
3,Airlines,Industrials,9492.0,5.0,4.0,2.5,2.5,3.5,11.150175,8.318671,...,6033868000.0,804696000.0,-101660500.0,418683500.0,-9972000.0,413211500.0,-97265000.0,463404500.0,-97265000.0,0.0
4,Airports & Air Services,Industrials,1508.5,10.0,10.0,9.0,6.0,10.0,4.736842,-0.566667,...,4791118.0,3650664.0,-265666.0,243961.0,565135.0,809096.0,-255583.0,499544.0,-255583.0,0.0
5,Aluminum,Basic Materials,2878.0,1.5,3.5,2.5,5.0,2.5,82.4125,20.993381,...,1758100000.0,571650000.0,-67750000.0,20400000.0,363600000.0,384000000.0,-10000000.0,161600000.0,-141200000.0,-131200000.0
6,Apparel Manufacturing,Consumer Cyclical,1550.0,5.0,7.0,1.0,5.0,4.0,15.678192,9.251945,...,865634000.0,134264000.0,-28192500.0,300580000.0,333626000.0,651762000.0,12493000.0,207404000.0,12493000.0,0.0
7,Apparel Retail,Consumer Cyclical,3300.0,6.0,5.0,5.0,5.0,4.0,13.505515,9.488372,...,998364000.0,433534000.0,13577000.0,350015000.0,640637000.0,976765000.0,21865000.0,307311000.0,24215000.0,0.0
8,Auto & Truck Dealerships,Consumer Cyclical,6508.5,6.0,4.0,5.0,6.0,6.0,8.946026,6.942917,...,1666707000.0,811900000.0,22361000.0,153295000.0,206875000.0,270993000.0,25791000.0,136748000.0,25791000.0,599240.0
9,Auto Manufacturers,Consumer Cyclical,7200.0,9.0,8.0,8.0,7.0,9.0,7.20915,-0.299435,...,1426849000.0,656796500.0,-67866885.5,0.0,6617000000.0,3407000000.0,-51356454.0,80916500.0,-56761892.0,0.0


In [18]:
for col in num_cols:
    feature_median = train[col].median()
    train.loc[train[col].isna(), col] = train[train[col].isna()].apply(num_fillna, given_info = given_info, feature_median = feature_median,col = col, axis = 1)
    test.loc[test[col].isna(), col] = test[test[col].isna()].apply(num_fillna, given_info = given_info, feature_median = feature_median,col = col, axis = 1)

In [19]:
print(f"misisng values in train data = {train.columns[train.isna().sum() > 0]}")
print(f"misisng values in test data = {test.columns[test.isna().sum() > 0]}")

misisng values in train data = Index(['auditRisk', 'boardRisk', 'compensationRisk', 'shareHolderRightsRisk',
       'overallRisk', 'trailingPE', 'targetHighPrice', 'targetLowPrice',
       'targetMeanPrice', 'targetMedianPrice', 'recommendationMean',
       'numberOfAnalystOpinions', 'Q1_NET_INCOME', 'Q2_NET_INCOME',
       'Q3_NET_INCOME', 'Q4_NET_INCOME', 'Q5_NET_INCOME', 'Q6_NET_INCOME',
       'Q7_NET_INCOME', 'Q8_NET_INCOME', 'Q9_NET_INCOME', 'Q10_NET_INCOME'],
      dtype='object')
misisng values in test data = Index(['auditRisk', 'boardRisk', 'compensationRisk', 'shareHolderRightsRisk',
       'overallRisk', 'trailingPE', 'Q1_TOTAL_LIABILITIES',
       'Q1_TOTAL_LIABILITIES_AND_EQUITY', 'Q1_TOTAL_STOCKHOLDERS_EQUITY'],
      dtype='object')


In [20]:
for col in train.columns[train.isna().sum() > 0]:
    feature_median = train[col].median()
    train[col] = train[col].fillna(feature_median)
    test[col] = test[col].fillna(feature_median)

In [21]:
for col in test.columns[test.isna().sum() > 0]:
    feature_median = train[col].median()
    # train[col] = train[col].fillna(feature_median)
    test[col] = test[col].fillna(feature_median)

In [22]:
print(f"misisng values in train data = {train.columns[train.isna().sum() > 0]}")
print(f"misisng values in test data = {test.columns[test.isna().sum() > 0]}")

misisng values in train data = Index([], dtype='object')
misisng values in test data = Index([], dtype='object')


## Model

In [23]:
X = train.drop(target_cols, axis = 1).copy()
y = train[target_cols].copy()

In [24]:
from sklearn.model_selection import cross_val_score as cvs
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

xgb = XGBRegressor(n_jobs = -1)
rf = RandomForestRegressor(n_jobs= -1)
train_preds = {}
test_preds = {}
print('Cross validation R2 scores for each target:\n')
cross_val_score_results = {}
for target in y:

    encoder = TargetEncoder(cols = cat_cols)
    encoder.fit(X, y[target])
    X_target = encoder.transform(X)
    test_target = encoder.transform(test)

    scaler = StandardScaler()
    scaler.fit(X_target)

    X_train_sca = scaler.transform(X_target)
    test_target = scaler.transform(test_target)

    if target in ["Q0_TOTAL_LIABILITIES", "Q0_EBITDA" ,"Q0_GROSS_PROFIT", "Q0_COST_OF_REVENUES", "Q0_OPERATING_EXPENSES"]:
        score = np.round(np.mean(cvs(rf, X_train_sca, y[target], cv=3, scoring='r2')),2)
    else:
        score = np.round(np.mean(cvs(xgb, X_train_sca, y[target], cv=3, scoring='r2')),2)
    cross_val_score_results[target] = score
    
    if target in ["Q0_TOTAL_LIABILITIES", "Q0_EBITDA" ,"Q0_GROSS_PROFIT", "Q0_COST_OF_REVENUES", "Q0_OPERATING_EXPENSES"]:
        rf = RandomForestRegressor(n_jobs = -1)
        rf.fit(X_train_sca, y[target])
        train_preds[target] = rf.predict(X_train_sca)
        test_preds[target] = rf.predict(test_target)
    else:
        xgb = XGBRegressor(n_jobs = -1)
        xgb.fit(X_train_sca, y[target])
        train_preds[target] = xgb.predict(X_train_sca)
        test_preds[target] = xgb.predict(test_target)
    print(f'{target} -> {score}')
print(f'\nMean R2 score across all targets: {np.mean(list(cross_val_score_results.values()))}')

Cross validation R2 scores for each target:

Q0_TOTAL_ASSETS -> 0.75
Q0_TOTAL_LIABILITIES -> 0.7
Q0_TOTAL_STOCKHOLDERS_EQUITY -> 0.72
Q0_GROSS_PROFIT -> 0.86
Q0_COST_OF_REVENUES -> 0.92
Q0_REVENUES -> 0.93
Q0_OPERATING_INCOME -> 0.77
Q0_OPERATING_EXPENSES -> 0.82
Q0_EBITDA -> 0.47

Mean R2 score across all targets: 0.7711111111111112


In [25]:
sample_submission = pd.read_csv("A:\\project\\Synnax Technologies\\sample_submission.csv")

for target in test_preds:
    sample_submission[target] = test_preds[target]    
sample_submission.head()

sample_submission.to_csv('result.csv', index = False)