In [4]:
import datetime as dt
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn import metrics
from sklearn.metrics import confusion_matrix

from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

from sklearn.cluster import KMeans

import missingno as msno

from fancyimpute import IterativeImputer as MICE
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense



In [7]:
# Location of the source workbook. The original absolute path remains the
# default so the existing setup keeps working unchanged; set the
# FINANCIAL_DISTRESS_XLSX environment variable to load the file from a
# different location without editing the notebook.
DATA_PATH = os.environ.get(
    "FINANCIAL_DISTRESS_XLSX",
    "C:\\Users\\dev\\Desktop\\financial distress\\financial_distress_python.xlsx",
)

# Read the workbook into the frame every later cell works from.
df = pd.read_excel(DATA_PATH)

In [8]:
# Print a structural summary of the loaded frame: index range, column names,
# non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23834 entries, 0 to 23833
Data columns (total 79 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   ID       23834 non-null  int64         
 1   QUARTER  23834 non-null  datetime64[ns]
 2   A13      23834 non-null  int64         
 3   A14      23834 non-null  int64         
 4   A15      23834 non-null  int64         
 5   A16      23834 non-null  int64         
 6   A17      23834 non-null  int64         
 7   A18      23834 non-null  int64         
 8   A19      23834 non-null  float64       
 9   A20      23834 non-null  int64         
 10  A22      23834 non-null  float64       
 11  A23      23834 non-null  float64       
 12  A24      23834 non-null  float64       
 13  A25      23834 non-null  float64       
 14  A26      23834 non-null  float64       
 15  A27      23834 non-null  float64       
 16  A28      23834 non-null  float64       
 17  A29      23834 non-null  float6

In [9]:
# Per-column value recoding, applied in place on `df`.
# Values not listed in a mapping are left untouched by `Series.replace`.
# NOTE(review): replacing ID 0 with the string "changed" looks like leftover
# experimentation — confirm it is intentional.
value_recodes = {
    "LABEL": {0: "Normal", 1: "Distressed"},
    "ID": {0: "changed"},
}
for column_name, recode in value_recodes.items():
    df[column_name] = df[column_name].replace(recode)

In [10]:
# Restrict the frame to the ratio columns A36 .. A84.
# `.loc` label slicing is inclusive of both endpoints and follows the
# positional order of the columns in `df`.
selected_columns = df.loc[:, "A36":"A84"]

# Keep exactly the columns whose label appears in the slice above; every
# other column is discarded.
in_selected_range = df.columns.isin(selected_columns.columns)
df_selected = df.loc[:, in_selected_range].copy()

# From here on `df` refers to the ratio-only frame.
df = df_selected.copy()

In [11]:
# Number of exactly-zero cells in each row of `df`.
zeros_count = df.eq(0).sum(axis=1)

# One candidate dataset per tolerance level: each keeps the rows that contain
# at most `max_zeros` zero-valued cells (2, 3, 4, 5, 8 and 10 respectively).
(
    filtered_df2,
    filtered_df3,
    filtered_df4,
    filtered_df5,
    filtered_df8,
    filtered_df10,
) = (df[zeros_count <= max_zeros] for max_zeros in (2, 3, 4, 5, 8, 10))

In [12]:
# Share of zero-valued cells per column in the unfiltered frame `df`.
# Only exact zeros are counted; NaN cells do not compare equal to 0.

total_rows = df.shape[0]
zeros_count_per_column = df.eq(0).sum(axis=0)

# Convert the counts to a percentage of all rows, ordered by column name.
percentage_zeros_count_per_column = (
    zeros_count_per_column.div(total_rows).mul(100).sort_index()
)

print("percentage of zeros:")
for column, pct in percentage_zeros_count_per_column.items():
    # Percentages are rounded to whole numbers for display only.
    print(f"{column} : {round(pct)} %")
    

percentage of zeros:
A36 : 2 %
A37 : 2 %
A38 : 11 %
A39 : 14 %
A40 : 1 %
A41 : 0 %
A42 : 1 %
A43 : 7 %
A44 : 1 %
A45 : 28 %
A46 : 0 %
A47 : 0 %
A48 : 2 %
A49 : 28 %
A50 : 38 %
A53 : 34 %
A54 : 27 %
A56 : 28 %
A57 : 2 %
A58 : 2 %
A59 : 2 %
A61 : 2 %
A62 : 2 %
A63 : 35 %
A64 : 28 %
A65 : 28 %
A66 : 34 %
A67 : 0 %
A68 : 3 %
A69 : 28 %
A70 : 1 %
A71 : 2 %
A72 : 29 %
A73 : 29 %
A74 : 3 %
A75 : 0 %
A76 : 1 %
A77 : 3 %
A78 : 3 %
A79 : 3 %
A80 : 28 %
A81 : 3 %
A82 : 29 %
A83 : 28 %
A84 : 28 %


In [13]:
# Share of zero-valued cells per column in `filtered_df2`
# (the rows of `df` that contain at most 2 zeros).
# Only exact zeros are counted; NaN cells do not compare equal to 0.

total_rows = filtered_df2.shape[0]
zeros_count_per_column = filtered_df2.eq(0).sum(axis=0)

# Convert the counts to a percentage of the filtered rows, ordered by column name.
percentage_zeros_count_per_column = (
    zeros_count_per_column.div(total_rows).mul(100).sort_index()
)

for column, pct in percentage_zeros_count_per_column.items():
    # Percentages are rounded to whole numbers for display only.
    print(f"{column} : {round(pct)} %")

A36 : 0 %
A37 : 0 %
A38 : 1 %
A39 : 3 %
A40 : 0 %
A41 : 0 %
A42 : 0 %
A43 : 0 %
A44 : 0 %
A45 : 0 %
A46 : 0 %
A47 : 0 %
A48 : 0 %
A49 : 0 %
A50 : 0 %
A53 : 7 %
A54 : 0 %
A56 : 0 %
A57 : 0 %
A58 : 0 %
A59 : 0 %
A61 : 0 %
A62 : 0 %
A63 : 7 %
A64 : 0 %
A65 : 0 %
A66 : 3 %
A67 : 0 %
A68 : 0 %
A69 : 0 %
A70 : 0 %
A71 : 0 %
A72 : 0 %
A73 : 0 %
A74 : 0 %
A75 : 0 %
A76 : 0 %
A77 : 0 %
A78 : 0 %
A79 : 0 %
A80 : 0 %
A81 : 0 %
A82 : 0 %
A83 : 0 %
A84 : 0 %


We use only the rows that contain at most 2 zeros, because the datasets filtered with higher zero thresholds still contain more than 25% null (zero) values.

From here on, `filtered_df2` is used for all further processing.