In [1]:
import pandas as pd
import numpy as np
import sys, os

this_path = '/home/ibi/Documents/GitHub/diem-thpt-analysis'
sys.path.append(this_path)
os.chdir(this_path)

In [2]:
files = os.listdir('data/raw')
files

['2018.csv',
 '2021.csv',
 '2020.csv',
 '2019.csv',
 '2022.csv',
 '2023.csv',
 '2024.csv']

In [3]:
def read_and_add_year(file):
    year = file.split('_')[0][:-4]
    df = pd.read_csv(f'data/raw/{file}', engine='pyarrow')
    year_col =  pd.Series([year]*len(df))
    df.insert(0, 'Year', year_col)

    return df

In [4]:
def process_dtype(df):
    int_cols = ['Year', 'CityCode', 'StudentID']
    for col in int_cols:
        df[col] = df[col].astype('int32')
    for col in df.columns[3:]:
        df[col] = df[col].astype('float32')
        df[col] = df[col].replace(-1, np.nan)

    return df

In [5]:
dfs = [process_dtype(read_and_add_year(file)) for file in files]
df = pd.concat(dfs)
df

Unnamed: 0,Year,CityCode,StudentID,Toan,NguVan,NgoaiNgu,VatLi,HoaHoc,SinhHoc,KHTN,DiaLi,LichSu,GDCD,KHXH
0,2018,1,1000033,7.6,8.25,7.0,,,,,7.25,4.25,7.75,6.420000
1,2018,1,1000034,6.2,6.25,9.2,,,,,7.00,5.25,8.50,6.920000
2,2018,1,1000035,6.4,7.25,6.6,,,,,5.00,3.75,7.25,5.330000
3,2018,1,1000037,2.8,7.00,4.2,,,,,3.50,3.50,7.25,4.750000
4,2018,1,1000038,4.0,6.50,4.8,,,,,6.25,3.25,7.00,5.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1060995,2024,64,64006675,8.2,8.25,7.0,9.25,8.50,6.00,7.916667,,,,
1060996,2024,64,64006673,4.8,7.00,,,,,,7.25,6.50,,6.875000
1060997,2024,64,64006674,5.8,8.25,4.0,6.75,5.75,4.25,5.583333,,,,
1060998,2024,64,64006679,6.0,7.08,4.0,,,,,6.00,5.00,7.50,6.166667


In [6]:
subjects = df.columns[3:]
for subject in subjects:
    df = df[df[subject] != 0]

df

Unnamed: 0,Year,CityCode,StudentID,Toan,NguVan,NgoaiNgu,VatLi,HoaHoc,SinhHoc,KHTN,DiaLi,LichSu,GDCD,KHXH
0,2018,1,1000033,7.6,8.25,7.0,,,,,7.25,4.25,7.75,6.420000
1,2018,1,1000034,6.2,6.25,9.2,,,,,7.00,5.25,8.50,6.920000
2,2018,1,1000035,6.4,7.25,6.6,,,,,5.00,3.75,7.25,5.330000
3,2018,1,1000037,2.8,7.00,4.2,,,,,3.50,3.50,7.25,4.750000
4,2018,1,1000038,4.0,6.50,4.8,,,,,6.25,3.25,7.00,5.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1060995,2024,64,64006675,8.2,8.25,7.0,9.25,8.50,6.00,7.916667,,,,
1060996,2024,64,64006673,4.8,7.00,,,,,,7.25,6.50,,6.875000
1060997,2024,64,64006674,5.8,8.25,4.0,6.75,5.75,4.25,5.583333,,,,
1060998,2024,64,64006679,6.0,7.08,4.0,,,,,6.00,5.00,7.50,6.166667


In [7]:
subjects_to_check = ['VatLi', 'HoaHoc', 'SinhHoc', 'DiaLi', 'LichSu', 'GDCD']
df.dropna(axis=0, subset=subjects_to_check, how='all', inplace=True)

df

Unnamed: 0,Year,CityCode,StudentID,Toan,NguVan,NgoaiNgu,VatLi,HoaHoc,SinhHoc,KHTN,DiaLi,LichSu,GDCD,KHXH
0,2018,1,1000033,7.6,8.25,7.0,,,,,7.25,4.25,7.75,6.420000
1,2018,1,1000034,6.2,6.25,9.2,,,,,7.00,5.25,8.50,6.920000
2,2018,1,1000035,6.4,7.25,6.6,,,,,5.00,3.75,7.25,5.330000
3,2018,1,1000037,2.8,7.00,4.2,,,,,3.50,3.50,7.25,4.750000
4,2018,1,1000038,4.0,6.50,4.8,,,,,6.25,3.25,7.00,5.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1060995,2024,64,64006675,8.2,8.25,7.0,9.25,8.50,6.00,7.916667,,,,
1060996,2024,64,64006673,4.8,7.00,,,,,,7.25,6.50,,6.875000
1060997,2024,64,64006674,5.8,8.25,4.0,6.75,5.75,4.25,5.583333,,,,
1060998,2024,64,64006679,6.0,7.08,4.0,,,,,6.00,5.00,7.50,6.166667


In [8]:
df.dropna(axis=0, subset=['Toan', 'NguVan'], how='any', inplace=True)

df

Unnamed: 0,Year,CityCode,StudentID,Toan,NguVan,NgoaiNgu,VatLi,HoaHoc,SinhHoc,KHTN,DiaLi,LichSu,GDCD,KHXH
0,2018,1,1000033,7.6,8.25,7.0,,,,,7.25,4.25,7.75,6.420000
1,2018,1,1000034,6.2,6.25,9.2,,,,,7.00,5.25,8.50,6.920000
2,2018,1,1000035,6.4,7.25,6.6,,,,,5.00,3.75,7.25,5.330000
3,2018,1,1000037,2.8,7.00,4.2,,,,,3.50,3.50,7.25,4.750000
4,2018,1,1000038,4.0,6.50,4.8,,,,,6.25,3.25,7.00,5.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1060994,2024,64,64006669,3.2,5.00,,,,,,6.25,4.25,,5.250000
1060995,2024,64,64006675,8.2,8.25,7.0,9.25,8.50,6.00,7.916667,,,,
1060996,2024,64,64006673,4.8,7.00,,,,,,7.25,6.50,,6.875000
1060997,2024,64,64006674,5.8,8.25,4.0,6.75,5.75,4.25,5.583333,,,,


In [10]:
%%timeit -n 1 -r 5
df.to_csv('data/preprocessed/thpt_total.csv', index=False)

42.9 s ± 2.68 s per loop (mean ± std. dev. of 5 runs, 1 loop each)
