In [25]:
import os

import pandas as pd

excel_data: list[pd.DataFrame] = []
csv_data: list[pd.DataFrame] = []

for root, dirs, files in os.walk('./pre_test'):
    for file in files:
        path = os.path.join(root, file)
        if file.endswith('.xlsx'):
            excel_data.append(pd.read_excel(path))
        elif file.endswith('.csv'):
            csv_data.append(pd.read_csv(path))

In [26]:
for data_frame in [*excel_data, *csv_data]:
    data_frame.drop(data_frame[-1:0].index, inplace=True)

In [27]:
data = pd.concat([*excel_data, *csv_data])

In [28]:
# print(data.dtypes)

for c in data.columns:
    if c == 'Grade/100.00' or c.startswith('Q.'):
        data[c] = data[c].map(lambda x: float(x) if x != '-' else 0)

In [29]:
data = data.drop_duplicates()

In [30]:
data = data.assign(Label='Failed')

data['Label'] = data['Grade/100.00'].map(
    lambda x: 'Passed' if x >= 60 else 'Failed'
)

data[:3]

Unnamed: 0,Surname,First name,Email address,State,Started on,Completed,Time taken,Grade/100.00,Q. 1 /1.25,Q. 2 /1.25,...,Q. 72 /1.25,Q. 73 /1.25,Q. 74 /1.25,Q. 75 /1.25,Q. 76 /1.25,Q. 77 /1.25,Q. 78 /1.25,Q. 79 /1.25,Q. 80 /1.25,Label
0,ម៉ាលីស,វ៉ាន់,vmalispg1n31@era.gov.kh,Finished,22 July 2022 6:03 AM,22 July 2022 7:23 AM,1 hour 20 mins,41.52,1.25,0.0,...,0.21,0.83,1.25,0.63,0.75,0.42,0.42,0.0,0.0,Failed
1,គឹមតៃ,ហៃ,hkimtaip22n72@era.gov.kh,Finished,22 July 2022 6:04 AM,22 July 2022 6:38 AM,33 mins 29 secs,14.54,0.0,1.25,...,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Failed
2,ប៊ុនលី,ទូច,tbunlypg2n84@era.gov.kh,Finished,22 July 2022 6:08 AM,22 July 2022 7:22 AM,1 hour 13 mins,45.25,1.25,0.0,...,0.0,1.04,0.0,0.63,0.5,0.83,1.25,1.25,0.0,Failed


In [31]:
data = data.rename(columns={'Grade/100.00': 'Total Score'})

data[:3]

Unnamed: 0,Surname,First name,Email address,State,Started on,Completed,Time taken,Total Score,Q. 1 /1.25,Q. 2 /1.25,...,Q. 72 /1.25,Q. 73 /1.25,Q. 74 /1.25,Q. 75 /1.25,Q. 76 /1.25,Q. 77 /1.25,Q. 78 /1.25,Q. 79 /1.25,Q. 80 /1.25,Label
0,ម៉ាលីស,វ៉ាន់,vmalispg1n31@era.gov.kh,Finished,22 July 2022 6:03 AM,22 July 2022 7:23 AM,1 hour 20 mins,41.52,1.25,0.0,...,0.21,0.83,1.25,0.63,0.75,0.42,0.42,0.0,0.0,Failed
1,គឹមតៃ,ហៃ,hkimtaip22n72@era.gov.kh,Finished,22 July 2022 6:04 AM,22 July 2022 6:38 AM,33 mins 29 secs,14.54,0.0,1.25,...,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Failed
2,ប៊ុនលី,ទូច,tbunlypg2n84@era.gov.kh,Finished,22 July 2022 6:08 AM,22 July 2022 7:22 AM,1 hour 13 mins,45.25,1.25,0.0,...,0.0,1.04,0.0,0.63,0.5,0.83,1.25,1.25,0.0,Failed


In [36]:
q1 = data['Total Score'].quantile(0.25)
q3 = data['Total Score'].quantile(0.75)

iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outliers = data[
    (data['Total Score'] < lower_bound) | (data['Total Score'] > upper_bound)
]

outliers

Unnamed: 0,Surname,First name,Email address,State,Started on,Completed,Time taken,Total Score,Q. 1 /1.25,Q. 2 /1.25,...,Q. 72 /1.25,Q. 73 /1.25,Q. 74 /1.25,Q. 75 /1.25,Q. 76 /1.25,Q. 77 /1.25,Q. 78 /1.25,Q. 79 /1.25,Q. 80 /1.25,Label
382,ច័ន្ទវឌ្ឍនា,នាក់,nchanvathnah12n19@era.gov.kh,Finished,22 July 2022 4:31 PM,22 July 2022 5:48 PM,1 hour 17 mins,89.42,1.25,1.25,...,0.83,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,Passed
265,Seavmeng,Hun,hunseavmeng1@gmail.com,Finished,27 July 2022 1:32 PM,27 July 2022 2:33 PM,1 hour,85.42,1.25,1.25,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,Passed
742,SEIHA,SEAK,seiha@mptc.gov.kh,Finished,15 August 2022 10:04 AM,15 August 2022 11:24 AM,1 hour 20 mins,88.63,1.25,1.25,...,1.25,1.25,0.0,1.25,1.0,1.25,1.25,1.25,1.25,Passed
743,WINNA,PHAL,winna-phal@mptc.gov.kh,Finished,15 August 2022 10:12 AM,15 August 2022 11:30 AM,1 hour 17 mins,87.0,1.25,1.25,...,1.25,1.25,0.0,1.25,1.0,1.25,1.25,1.25,1.25,Passed
747,DARA,KANIKA,darakanikax@gmail.com,Finished,15 August 2022 12:49 PM,15 August 2022 2:04 PM,1 hour 14 mins,87.21,1.25,1.25,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,Passed
749,SAMBAT RATHANA,CHHORN,Chhornsambatrathana21@gmail.com,Finished,15 August 2022 1:31 PM,15 August 2022 2:46 PM,1 hour 15 mins,85.75,1.25,1.25,...,1.25,1.25,1.25,0.63,1.25,1.25,1.25,1.25,1.25,Passed
751,CHAMROEURN,SORN,chamroeurnsorn789@gmail.com,Finished,15 August 2022 2:10 PM,15 August 2022 3:24 PM,1 hour 13 mins,85.25,0.83,1.25,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,Passed
756,DINE,SEN,sendine1998@gmail.com,Finished,15 August 2022 3:03 PM,15 August 2022 4:14 PM,1 hour 11 mins,87.77,1.25,1.25,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,Passed
782,VUCH Y,SEA,seavuchy@gmail.com,Finished,16 August 2022 4:24 PM,16 August 2022 4:57 PM,33 mins 6 secs,86.54,1.25,1.25,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,Passed
783,BOPHA,YUS,yusbopha@gmail.com,Finished,16 August 2022 5:06 PM,16 August 2022 5:49 PM,43 mins 10 secs,87.13,1.25,1.25,...,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,1.25,Passed


In [33]:
data.drop(data.loc[data['State'] != "Finished"].index)
data = data.reindex()

In [37]:
print(len(data))
data.dropna()
print(len(data))

3200
3200


In [39]:
data.to_excel('pre_test_output.xlsx')