In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings

# Suppress all warnings
# NOTE(review): no category or module is passed, so this "ignore" filter is not
# limited to a particular kind of warning -- deprecation and numerical warnings
# from the imported libraries will presumably be hidden as well; confirm that
# this is intended before relying on a silent run.
warnings.filterwarnings("ignore")

In [2]:
# Load the workbook into df1 (absolute, machine-specific path)
df1 = pd.read_excel(r"C:\Users\user\OneDrive - Universiti Sains Malaysia\FYP\Dataset.xlsx")

# Display configuration:
#   - max_rows / max_columns set to None so rendered frames are not truncated
#   - floats formatted in fixed-point with six decimals (no scientific notation)
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)
pd.set_option('display.float_format', lambda value: '%.6f' % value)

# Summary statistics, transposed so that each column of df1 becomes one row
df1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
"CFI-Accumulated depreciation on property, plant and equipment",72.0,30.5625,259.331412,0.0,0.0,0.0,0.0,2200.5
CFI-Interest earned,72.0,1790.441667,3397.198665,0.0,38.97,191.5725,1221.530625,13347.3825
CFI-Investments - Other,72.0,1.035729,51055.09818,-336750.0,0.0,0.0,0.0,262500.0
CFI-Leasehold Improvements,72.0,1966.936875,14276.696097,0.0,0.0,0.0,0.0,119400.0
CFI-Loss on disposal of assets,72.0,-9.364583,79.461125,-674.25,0.0,0.0,0.0,0.0
CFI-Other current assets,72.0,89.190208,767.178774,-2778.9,0.0,0.0,0.0,3078.93
CFI-Other fixed assets,72.0,2848.035104,4606.104852,-2201.25,0.0,0.0,4303.125,17091.0
CFI-Other Miscellaneous Income,72.0,60.729167,344.73848,0.0,0.0,0.0,0.0,2700.0
CFO-Accrued liabilities,72.0,52471.810208,14955.816802,1051.8,45157.455,48124.4025,58520.83875,123298.4625
CFO-Advertising/Promotional,72.0,3350.0825,3520.049062,-6672.4875,968.4675,2663.1375,4517.175,17666.13


In [3]:
# Accounts that are removed from the dataset before any filtering
columns_to_drop = [
    'CA-Cash and cash equivalents',
    'SE-Ordinary shares',
    'SE-Retained Earnings',
]

# Rebind df1 to a frame without those accounts.
# NOTE(review): df1 is overwritten here, so running this cell a second time
# would look for columns that are already gone -- confirm whether that matters.
df1 = df1.drop(columns_to_drop, axis="columns")

In [4]:
# Flag every column whose values all lie strictly inside (-2000, 2000):
# the column maximum AND the magnitude of the column minimum must both be
# below the cutoff for the column to be selected.
cutoff = 2000
columns_to_drop = [
    name
    for name in df1.columns
    if df1[name].max() < cutoff and abs(df1[name].min()) < cutoff
]

# Remove the flagged columns from df1 in place (skipped when nothing matched)
if len(columns_to_drop) > 0:
    df1.drop(columns=columns_to_drop, inplace=True)

# Report how many columns are left, then show their summary statistics
print('Number of remaning variables',df1.shape[1])
df1.describe().T

Number of remaning variables 74


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
"CFI-Accumulated depreciation on property, plant and equipment",72.0,30.5625,259.331412,0.0,0.0,0.0,0.0,2200.5
CFI-Interest earned,72.0,1790.441667,3397.198665,0.0,38.97,191.5725,1221.530625,13347.3825
CFI-Investments - Other,72.0,1.035729,51055.09818,-336750.0,0.0,0.0,0.0,262500.0
CFI-Leasehold Improvements,72.0,1966.936875,14276.696097,0.0,0.0,0.0,0.0,119400.0
CFI-Other current assets,72.0,89.190208,767.178774,-2778.9,0.0,0.0,0.0,3078.93
CFI-Other fixed assets,72.0,2848.035104,4606.104852,-2201.25,0.0,0.0,4303.125,17091.0
CFI-Other Miscellaneous Income,72.0,60.729167,344.73848,0.0,0.0,0.0,0.0,2700.0
CFO-Accrued liabilities,72.0,52471.810208,14955.816802,1051.8,45157.455,48124.4025,58520.83875,123298.4625
CFO-Advertising/Promotional,72.0,3350.0825,3520.049062,-6672.4875,968.4675,2663.1375,4517.175,17666.13
CFO-Auto,72.0,101.786563,319.596516,0.0,0.0,0.0,18.32625,2496.6


In [5]:
# Window of the final 24 rows of df1
last_24 = df1.tail(24)

# Per-column mask: True when every value inside the window equals zero
is_all_zero = last_24.eq(0).all()
all_zero_columns = last_24.columns[is_all_zero]

# Display the flagged columns over the same 24-row window
df1[all_zero_columns].tail(24)

Unnamed: 0,"CFI-Accumulated depreciation on property, plant and equipment",CFI-Investments - Other,CFO-Auto,CFO-Other selling expenses,CA-Loans To Officers_eb,Auto,Other selling expenses
48,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# df2 = df1 without the columns that were entirely zero in the last 24 rows
# (presumably the most recent two years of monthly records -- confirm the row
# frequency). df1 itself is left untouched.
df2 = df1.drop(columns=all_zero_columns)
df2.tail(24)

Unnamed: 0,CFI-Interest earned,CFI-Leasehold Improvements,CFI-Other current assets,CFI-Other fixed assets,CFI-Other Miscellaneous Income,CFO-Accrued liabilities,CFO-Advertising/Promotional,CFO-Client Trust Accounts - Liabilities,CFO-Cost of Labour,CFO-Cost of Labour - COS,CFO-Dues and Subscriptions,CFO-Exchange Gain or Loss,CFO-GST Payable,CFO-Legal and professional fees,CFO-Loans to Others,CFO-Office/General Administrative Expenses,CFO-Other costs of sales - COS,CFO-Other current assets,CFO-Other current liabilities,CFO-Other Miscellaneous Service Cost,CFO-Payroll Expenses,CFO-Prepaid Expenses,CFO-Rent or Lease of Buildings,CFO-Repair and maintenance,CFO-Service/Fee Income,CFO-Supplies and materials - COS,CFO-Travel expenses - selling expense,CFO-Utilities,CFO-Trade and other receivables,CA-Investments - Other_eb,CA-Loans to Others_eb,CA-Other current assets_eb,CA-Prepaid Expenses_eb,CA-Trade and other receivables_eb,CL-Accrued liabilities_eb,CL-Client Trust Accounts - Liabilities_eb,CL-GST Payable_eb,CL-Other current liabilities_eb,CL-Trade and other payables_eb,"NCA-Accumulated depreciation on property, plant and equipment_eb",NCA-Deferred tax_eb,NCA-Furniture and Fixtures_eb,NCA-Leasehold Improvements_eb,NCA-Long-term investments_eb,NCA-Other fixed assets_eb,NCA-Other non-current assets_eb,Cost of Labour - COS,Other costs of sales - COS,Supplies and materials - COS,Advertising/Promotional,Bad debts,Cost of Labour,Dues and Subscriptions,Legal and professional fees,Office/General Administrative Expenses,Other Miscellaneous Service Cost,Payroll Expenses,Rent or Lease of Buildings,Repair and maintenance,Travel expenses - selling expense,Utilities,Sales - wholesale,Service/Fee Income,Depreciation,Exchange Gain or Loss,Interest earned,Other Miscellaneous Income
48,52.425,0.0,0.0,5175.0,750.0,48698.4225,2648.7,8194.875,129595.8075,5131.6125,0.0,0.0,31903.6575,3387.5625,0.0,5329.275,1772.2875,0.0,0.0,2925.0,0.0,0.0,2697.0,0.0,0.0,15670.005,0.0,923.85,289983.9,750.0,53990.07,14392.5,4278.8025,864505.92,106814.3025,62250.0,48668.4375,439814.265,11895.0075,304982.415,354803.25,42593.715,89649.75,285.0,242374.695,0.0,94633.155,2721.5475,15670.005,2648.7,0.0,4837.4775,0.0,3791.0625,5329.275,2925.0,74052.165,2697.0,0.0,0.0,923.85,87.0,260517.72,-633.4275,0.0,52.425,750.0
49,52.155,0.0,0.0,0.0,0.0,47702.34,2430.9,17792.8875,153089.9775,10968.8175,675.0,0.0,0.0,1828.5,0.0,11371.0725,4715.1375,0.0,-27.825,3300.0,1087.5,0.0,2697.0,0.0,0.0,21375.0675,0.0,1187.175,296331.0375,750.0,53990.07,14392.5,2141.3025,888247.41,114878.505,62250.0,65004.78,428984.0625,11895.0075,306565.1025,354803.25,42593.715,89649.75,285.0,242374.695,0.0,109402.815,5664.3975,21375.0675,2430.9,0.0,11232.48,675.0,2235.0,11371.0725,3300.0,90492.18,2697.0,0.0,0.0,1187.175,2731.95,282552.99,-633.4275,0.0225,52.155,0.0
50,383.97,0.0,0.0,0.0,0.0,55360.05,892.5,17955.675,159670.74,38815.7025,0.0,-4845.0525,30282.9975,12215.85,-46328.97,6858.555,3469.2,1434.0,-72.345,7470.0,825.0,0.0,2697.0,0.0,0.0,1114.8675,0.0,1329.9,451135.515,750.0,7661.1,15826.5,53927.865,814824.525,119339.865,62250.0,54685.875,482437.905,11895.0075,308147.79,354803.25,42593.715,89649.75,750640.2675,242374.695,0.0,116948.0025,4373.2575,-4729.695,1045.5,0.0,2159.1825,0.0,10237.35,4235.055,7470.0,92375.8425,2697.0,0.0,0.0,1329.9,526.44,279728.1,-678.63,4845.0525,383.97,0.0
51,1354.0575,0.0,0.0,12900.0,0.0,59414.91,397.5,18205.125,183875.52,7249.425,413.4975,-9.1725,0.0,0.0,0.0,10779.945,2918.3625,0.0,0.0,7102.5,-6315.0,60301.8375,2697.0,3270.0,0.0,11672.04,0.0,1427.6025,288205.485,750.0,7661.1,15826.5,104308.8375,844909.6875,82105.44,62250.0,71234.34,471518.2575,11895.0075,311439.69,354803.25,42593.715,89649.75,751991.4825,255274.695,0.0,119035.3425,4798.77,12243.7275,444.0,0.0,10022.76,413.4975,804.0,11018.445,7102.5,79949.88,2697.0,3270.0,0.0,1427.6025,4664.7,283549.38,-1411.4925,9.1725,1354.0575,0.0
52,1286.5875,0.0,0.0,0.0,0.0,72773.985,422.25,18510.6,148111.5,15230.37,2936.415,124.2825,31475.1225,1840.5,0.0,4398.495,4994.1375,826.545,-27.825,1875.0,-6180.0,5398.5,2697.0,0.0,0.0,15903.3225,0.0,1382.0325,371839.275,750.0,7661.1,16653.045,99789.525,788731.95,64484.4525,62250.0,56023.5375,445776.33,11895.0075,314084.715,354803.25,42593.715,89649.75,753273.615,255274.695,0.0,123791.34,6445.785,16475.01,429.75,0.0,12451.8675,2936.415,2644.5,4636.995,1917.0,77988.96,2697.0,0.0,0.0,1382.0325,508.05,299811.495,-1193.3775,-124.2825,1286.5875,0.0
53,1217.3175,0.0,0.0,8097.75,0.0,54746.49,814.4175,12982.5,149696.5875,12552.765,750.0,15.63,0.0,22657.5,0.0,12701.925,2151.3,2544.0,-27.825,5169.0,-720.0,52500.0,2697.0,553.7475,0.0,20359.4325,0.0,1298.9325,420882.24,750.0,7661.1,19197.045,136973.2125,795633.87,64599.03,67905.825,78702.45,496976.22,11895.0075,317288.34,354803.25,42593.715,89649.75,754486.5525,268770.945,0.0,122462.1375,4309.2825,23026.32,814.4175,0.0,15699.4725,750.0,23461.5,10845.225,5169.0,80106.51,2697.0,553.7475,0.0,1298.9325,1333.8,327732.3825,-1242.8925,-15.63,1217.3175,0.0
54,1198.56,0.0,0.0,2699.25,0.0,59329.5675,1211.7375,18821.3925,153838.38,10787.0625,0.0,-15.705,42414.3525,1806.525,1653.6,8659.425,12203.8875,0.0,0.0,14226.9,2175.0,-52500.0,2697.0,346.2525,0.0,12304.215,0.0,1361.6925,359689.74,750.0,9314.7,19197.045,74555.4,1045986.0225,63436.1775,67447.2825,68946.0225,709636.29,11895.0075,320271.9225,354803.25,42593.715,89649.75,755666.1,271470.195,0.0,125985.915,14049.3,12875.9025,1211.7375,0.0,11950.5075,0.0,2610.525,8897.925,14226.9,88947.96,2697.0,346.2525,0.0,1361.6925,142.8,339851.205,-1157.52,15.6975,1198.56,0.0
55,1118.0025,0.0,0.0,5398.5,922.5,57760.2075,0.0,23768.0325,154104.9,22967.25,0.0,-64.185,0.0,428.25,428.25,8451.7425,3066.7125,0.0,-27.825,8439.0,-8565.0,0.0,2697.0,1448.3625,0.0,14256.8025,0.0,1804.8375,304126.14,750.0,9742.95,19197.045,64637.5875,1202437.41,64333.7325,62250.0,93549.3975,668150.9625,11895.0075,323986.5975,354803.25,42593.715,89649.75,756774.42,276868.695,0.0,138492.5925,5317.7325,14828.49,0.0,0.0,11258.2275,0.0,1232.25,8690.2425,8439.0,79277.4675,2697.0,1448.3625,0.0,1804.8375,367.5,453241.2975,-1471.155,64.1925,1118.0025,922.5
56,1269.8475,0.0,0.0,3243.0,0.0,58251.2625,4770.0,18512.8875,158719.59,4544.0625,0.0,-84.0375,34185.1275,-1559.0625,0.0,6722.235,3201.8475,0.0,-27.825,170.475,6466.875,2053.86,2697.0,0.0,0.0,15166.44,0.0,1528.9725,675097.65,750.0,9742.95,19197.045,55746.6975,878533.7025,65924.7525,62250.0,77768.505,648203.1825,11895.0075,327667.68,354803.25,42593.715,89649.75,758025.7725,280111.695,0.0,121832.835,5480.2875,15738.1275,4770.0,0.0,11442.69,0.0,-755.0625,6960.735,170.475,99241.485,2697.0,0.0,0.0,1528.9725,459.0,327698.4225,-1690.695,84.0525,1269.8475,0.0
57,858.57,0.0,0.0,0.0,0.0,59435.7825,850.65,18556.0875,161371.1775,29426.1675,0.0,-108.2025,0.0,397.5,0.0,20779.4475,2489.2425,0.0,0.0,4419.0,-8415.0,2189.7525,2697.0,386.25,0.0,15704.3625,0.0,1953.1875,407695.23,750.0,9742.95,19197.045,46968.405,847505.9625,67430.2875,62250.0,97592.4675,636325.8975,11895.0075,337908.195,354803.25,42593.715,89649.75,758848.665,280111.695,0.0,152422.08,8838.6525,16276.05,850.65,0.0,10506.7575,0.0,804.0,21017.9475,4419.0,83698.77,2697.0,386.25,0.0,1953.1875,766.3125,343046.2575,-3986.355,108.195,858.57,0.0


In [7]:
# Check the number of zeroes in each account
def check_zeroes(df,num):
    """Return the columns of ``df`` that hold more than ``num`` zero values.

    Parameters
    ----------
    df : DataFrame whose columns are inspected one by one.
    num : threshold; a column is reported only when its count of entries
        equal to 0 is strictly greater than this number.

    Returns
    -------
    list
        Column labels, in the order they appear in ``df.columns``.
    """
    return [name for name in df.columns if (df[name] == 0).sum() > num]

# Flag the accounts that contain more than `num_of_zero` zero entries
num_of_zero = 60
columns_with_many_zeroes = check_zeroes(df2, num_of_zero)

# Report the flagged accounts (or state that there are none)
if not columns_with_many_zeroes:
    print("No columns have more than",num_of_zero, "zeroes.")
else:
    print("Columns with more than", num_of_zero,"zeroes:", columns_with_many_zeroes)

# Print the zero count of every account, in column order
zero_counts = (df2 == 0).sum()
for column in df2.columns:
    print(column,": ",zero_counts[column])

Columns with more than 60 zeroes: ['CFI-Leasehold Improvements', 'CFI-Other Miscellaneous Income', 'CFO-Service/Fee Income', 'Bad debts', 'Other Miscellaneous Income']
CFI-Interest earned :  4
CFI-Leasehold Improvements :  70
CFI-Other current assets :  60
CFI-Other fixed assets :  44
CFI-Other Miscellaneous Income :  69
CFO-Accrued liabilities :  0
CFO-Advertising/Promotional :  1
CFO-Client Trust Accounts - Liabilities :  47
CFO-Cost of Labour :  0
CFO-Cost of Labour - COS :  0
CFO-Dues and Subscriptions :  41
CFO-Exchange Gain or Loss :  46
CFO-GST Payable :  29
CFO-Legal and professional fees :  18
CFO-Loans to Others :  56
CFO-Office/General Administrative Expenses :  0
CFO-Other costs of sales - COS :  0
CFO-Other current assets :  53
CFO-Other current liabilities :  42
CFO-Other Miscellaneous Service Cost :  6
CFO-Payroll Expenses :  31
CFO-Prepaid Expenses :  47
CFO-Rent or Lease of Buildings :  3
CFO-Repair and maintenance :  50
CFO-Service/Fee Income :  65
CFO-Supplies and ma

In [8]:
# df3 = df2 without the accounts flagged as having more than 60 zeroes;
# df2 itself is left untouched.
df3 = df2.drop(columns=columns_with_many_zeroes)

# Report how many columns survive the filter
print('Number of remaning variables',df3.shape[1])

Number of remaning variables 62


In [None]:
# Write the filtered frame (df3) to CSV without the row index.
# The destination is an absolute, machine-specific path.
file_path = r"C:\Users\user\OneDrive - Universiti Sains Malaysia\FYP\after_filtering_data.csv"
df3.to_csv(path_or_buf=file_path, index=False)