# Feature Engineering - Last Dance



In [34]:
import os
import random
import numpy as np
import pandas as pd

# Seed cố định để kết quả reproducible
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

DATA_DIR = 'data'
os.makedirs(DATA_DIR, exist_ok=True)



In [35]:
# === Định nghĩa hàm create_features cho Y1 (giữ nguyên như cũ) ===

def create_features_y1_sem1(df_in):
    """Hàm tạo features cho Y1 Sem1: Rank_In_Major, Gap_Score, TOHOP_GROUP, PTXT_GROUP"""
    df = df_in.copy()
    
    # Gap_Score
    df['Gap_Score'] = (df['DIEM_TRUNGTUYEN'].fillna(0) - df['DIEM_CHUAN'].fillna(0))
    
    # Rank_In_Major: Percentile rank trong cùng khóa + tổ hợp (chỉ Y1 Sem1)
    df['Rank_In_Major'] = 0.0
    if 'NAM_TUYENSINH' in df.columns and 'TOHOP_XT' in df.columns:
        fresh_df = df.copy()
        fresh_df['_n'] = fresh_df['NAM_TUYENSINH'].astype(str).str.strip()
        fresh_df['_t'] = fresh_df['TOHOP_XT'].astype(str).str.strip()
        r = fresh_df.groupby(['_n', '_t'])['DIEM_TRUNGTUYEN'].fillna(0).rank(pct=True)
        df['Rank_In_Major'] = r.fillna(0.5)
    
    # TOHOP_GROUP: Khoa học tự nhiên (A,B), Xã hội (C), Khác (D,...)
    if 'TOHOP_XT' in df.columns:
        _first = df['TOHOP_XT'].astype(str).str.strip().str.upper().str[0]
        df['TOHOP_GROUP'] = np.where(_first.isin(['A', 'B']), 'KHTN', 
                                     np.where(_first == 'C', 'XH', 'Khac'))
    else:
        df['TOHOP_GROUP'] = 'Khac'
    
    # PTXT_GROUP: Giữ nguyên PTXT nếu là giá trị phổ biến, còn lại gán "1"
    if 'PTXT' in df.columns:
        df['PTXT_GROUP'] = df['PTXT'].astype(str)
    else:
        df['PTXT_GROUP'] = '1'
    
    return df

# Feature cho Y1: dùng FEATS_Y1
FEATS_Y1 = ['Rank_In_Major', 'Gap_Score', 'TOHOP_GROUP', 'PTXT_GROUP']

# === Định nghĩa hàm create_features cho Y2+ (giống train_by_year_predict.ipynb) ===

def create_features_y2plus(df_in):
    """Hàm chung cho Train và Test. Xử lý Lag, History, Load_Ratio, is_Covid, Gap_Score, SV năm 1 kỳ 1."""
    df = df_in.copy()
    
    # Chuẩn hóa cột
    if 'Hoc_Ky' not in df.columns and 'HOC_KY' in df.columns:
        df['Hoc_Ky'] = df['HOC_KY']
    if 'Nam_Hoc' not in df.columns and 'NAM_HOC' in df.columns:
        df['Nam_Hoc'] = df['NAM_HOC']
    df['Nam_Hoc'] = df['Nam_Hoc'].astype(str)
    df['_year'] = df['Nam_Hoc'].str.split('-').str[0].astype(float)
    df['_hk'] = pd.to_numeric(df['Hoc_Ky'], errors='coerce').fillna(1).astype(int)
    df = df.sort_values(['MA_SO_SV', '_year', '_hk']).reset_index(drop=True)

    # Gap_Score, is_Covid
    df['Gap_Score'] = (df['DIEM_TRUNGTUYEN'].fillna(0) - df['DIEM_CHUAN'].fillna(0))
    df['is_Covid'] = df['Nam_Hoc'].isin(['2020-2021', '2021-2022']).astype(int)

    grp = df.groupby('MA_SO_SV')
    
    # Lag: Prev_GPA, Prev_CPA (shift 1 kỳ)
    df['Prev_CPA'] = grp['CPA'].shift(1)
    df['Prev_GPA'] = grp['GPA'].shift(1)
    
    # History: History_Completion_Rate, History_Fail_Credits, Avg_History_Load
    cum_reg = grp['TC_DANGKY'].cumsum().shift(1)
    cum_comp = grp['TC_HOANTHANH'].fillna(0).cumsum().shift(1)
    df['History_Completion_Rate'] = np.where(cum_reg > 0, cum_comp / cum_reg, 0)
    df['_fail'] = (df['TC_DANGKY'].fillna(0) - df['TC_HOANTHANH'].fillna(0)).clip(lower=0)
    df['History_Fail_Credits'] = np.maximum(0, grp['_fail'].cumsum().shift(1))
    df['Semester_No'] = grp.cumcount() + 1
    avg_load = grp['TC_HOANTHANH'].fillna(0).cumsum().shift(1) / np.maximum(df['Semester_No'] - 1, 1)
    avg_load = np.where(avg_load <= 0, df['TC_DANGKY'].fillna(0), avg_load)
    df['Avg_History_Load'] = avg_load
    df['Load_Ratio'] = np.where(avg_load > 0, df['TC_DANGKY'] / avg_load, 1.0)

    # Xử lý SV năm 1 kỳ 1: Prev_CPA/Prev_GPA = (DIEM_TRUNGTUYEN/30)*4, History=1, Load_Ratio=1
    is_fresh = (df['Semester_No'] == 1)
    diem_proxy = (df['DIEM_TRUNGTUYEN'].fillna(0) / 30) * 4
    df.loc[is_fresh, 'Prev_CPA'] = diem_proxy[is_fresh]
    df.loc[is_fresh, 'Prev_GPA'] = diem_proxy[is_fresh]
    df.loc[is_fresh, 'History_Completion_Rate'] = 1.0
    df.loc[is_fresh, 'History_Fail_Credits'] = 0
    df.loc[is_fresh, 'Load_Ratio'] = 1.0

    # Y2+: TOHOP_GROUP (KHTN / XH / Khác) và PTXT_GROUP — thay thế TOHOP_XT, PTXT
    if 'TOHOP_XT' in df.columns:
        _first = df['TOHOP_XT'].astype(str).str.strip().str.upper().str[0]
        df['TOHOP_GROUP'] = np.where(_first.isin(['A', 'B']), 'KHTN', np.where(_first == 'C', 'XH', 'Khac'))
        df = df.drop(columns=['TOHOP_XT'], errors='ignore')
    else:
        df['TOHOP_GROUP'] = 'Khac'
    if 'PTXT' in df.columns:
        df['PTXT_GROUP'] = df['PTXT'].astype(str)
        df = df.drop(columns=['PTXT'], errors='ignore')
    else:
        df['PTXT_GROUP'] = '1'

    return df

# Định nghĩa feature cho Y2+ (dùng TOHOP_GROUP, PTXT_GROUP như Y1)
NUMERIC_FEATURES_Y2PLUS = [
    'Load_Ratio', 'Prev_CPA', 'Prev_GPA',
    'History_Completion_Rate', 'History_Fail_Credits',
    'Semester_No', 'Gap_Score', 'is_Covid'
]
CATEGORICAL_FEATURES_Y2PLUS = ['TOHOP_GROUP', 'PTXT_GROUP']
FEAT_COLS_Y2PLUS = NUMERIC_FEATURES_Y2PLUS + CATEGORICAL_FEATURES_Y2PLUS

print("Đã định nghĩa create_features_y2plus và FEAT_COLS_Y2PLUS (cho Y2-Y6)")
print(f"NUMERIC_FEATURES: {NUMERIC_FEATURES_Y2PLUS}")
print(f"CATEGORICAL_FEATURES (Y2+): {CATEGORICAL_FEATURES_Y2PLUS}")

# Load dữ liệu đã được xử lý từ data_cleaning và data_preprocessing

# Load final_combined_cleaned (train + valid)
path_final = os.path.join(DATA_DIR, 'final_combined_cleaned.csv')
if os.path.exists(path_final):
    combined_cleaned = pd.read_csv(path_final)
    print(f'\n✓ Loaded final_combined_cleaned: {combined_cleaned.shape}')
else:
    combined_cleaned = None
    print(f'\n✗ Chưa tìm thấy {path_final} – hãy chạy data_cleaning và data_preprocessing trước.')
test_files = {}
# Y1: chỉ cần test_Y1_Sem1.csv
path_y1 = os.path.join(DATA_DIR, 'test_Y1_Sem1.csv')
if os.path.exists(path_y1):
    test_files['Y1_Sem1'] = pd.read_csv(path_y1)
    print(f'✓ Loaded test_Y1_Sem1: {test_files["Y1_Sem1"].shape}')
else:
    print('✗ Chưa tìm thấy test_Y1_Sem1.csv')

# Test Y2+: load 1 file test_con_lai.csv (data_preprocessing không lưu test_Y2..Y7 riêng nữa)
path_con_lai = os.path.join(DATA_DIR, 'test_con_lai.csv')
test_con_lai_df = pd.read_csv(path_con_lai) if os.path.exists(path_con_lai) else None
if test_con_lai_df is not None:
    print(f'✓ Loaded test_con_lai: {test_con_lai_df.shape}')

# Load train files
train_files = {}
for year in range(1, 7):
    path_train = os.path.join(DATA_DIR, f'train_year_{year}.csv')
    if os.path.exists(path_train):
        train_files[f'Y{year}'] = pd.read_csv(path_train)
        print(f'✓ Loaded train_year_{year}: {train_files[f"Y{year}"].shape}')

# Load valid: 1 file valid.csv (data_preprocessing không chia theo năm), tách theo Year_of_Study thành valid_files
valid_files = {}
path_valid = os.path.join(DATA_DIR, 'valid.csv')
if os.path.exists(path_valid):
    valid_df = pd.read_csv(path_valid)
    if 'Year_of_Study' not in valid_df.columns and '_year' in valid_df.columns:
        valid_df['Year_of_Study'] = (valid_df['_year'] - valid_df['NAM_TUYENSINH'].fillna(valid_df['_year']) + 1).clip(lower=1).fillna(1).astype(int)
    if 'Year_of_Study' in valid_df.columns:
        for year in range(1, 7):
            sub = valid_df[valid_df['Year_of_Study'] == year].copy()
            if len(sub) > 0:
                valid_files[f'Y{year}'] = sub
        print(f'✓ Loaded valid.csv: {len(valid_df)} rows → tách thành {len(valid_files)} nhóm năm')
    else:
        valid_files['Y1'] = valid_df
        print(f'✓ Loaded valid.csv: {len(valid_df)} rows (không có Year_of_Study, coi toàn bộ là Y1)')
else:
    print('✗ Chưa tìm thấy valid.csv – hãy chạy data_preprocessing trước.')

print(f'\nTổng kết:')
print(f' - Combined cleaned: {"✓" if combined_cleaned is not None else "✗"}')
print(f' - Test files: {len(test_files)} files (có thể bao gồm Y7)')
print(f' - Train files: {len(train_files)}/{6} files')
print(f' - Valid: 1 file → {len(valid_files)} nhóm năm' if valid_files else " - Valid: chưa load")

Đã định nghĩa create_features_y2plus và FEAT_COLS_Y2PLUS (cho Y2-Y6)
NUMERIC_FEATURES: ['Load_Ratio', 'Prev_CPA', 'Prev_GPA', 'History_Completion_Rate', 'History_Fail_Credits', 'Semester_No', 'Gap_Score', 'is_Covid']
CATEGORICAL_FEATURES (Y2+): ['TOHOP_GROUP', 'PTXT_GROUP']

✓ Loaded final_combined_cleaned: (105726, 15)
✓ Loaded test_Y1_Sem1: (4326, 11)
✓ Loaded test_con_lai: (12176, 11)
✓ Loaded train_year_1: (25027, 19)
✓ Loaded train_year_2: (23154, 19)
✓ Loaded train_year_3: (20940, 19)
✓ Loaded train_year_4: (13848, 19)
✓ Loaded train_year_5: (6822, 19)
✓ Loaded train_year_6: (791, 19)
✓ Loaded valid.csv: 15144 rows → tách thành 6 nhóm năm

Tổng kết:
 - Combined cleaned: ✓
 - Test files: 1 files (có thể bao gồm Y7)
 - Train files: 6/6 files
 - Valid: 1 file → 6 nhóm năm


## I. Tạo features cho Y1


In [36]:
# I.1 Y1 dùng create_features_y1_sem1 và FEATS_Y1 
# Hàm create_features_y1_sem1 đã được định nghĩa ở cell trên

In [37]:
# I.2 Tạo features cho test Y1 (dùng create_features_y1_sem1)

test_y1_feats = None
if 'Y1_Sem1' in test_files:
    test_y1_raw = test_files['Y1_Sem1'].copy()
    test_y1_feats = create_features_y1_sem1(test_y1_raw)
    print(f'Đã tạo features cho test Y1: {test_y1_feats.shape}')
    new_cols = [c for c in test_y1_feats.columns if c not in test_y1_raw.columns]
    print(f'Các features mới: {new_cols[:10]}' if len(new_cols) > 10 else f'Các features mới: {new_cols}')
else:
    print('Chưa có test_Y1_Sem1, hãy chạy data_preprocessing trước.')

Đã tạo features cho test Y1: (4326, 15)
Các features mới: ['Gap_Score', 'Rank_In_Major', 'TOHOP_GROUP', 'PTXT_GROUP']


  r = fresh_df.groupby(['_n', '_t'])['DIEM_TRUNGTUYEN'].fillna(0).rank(pct=True)


In [38]:
# I.2b Tạo features cho train Y1 và valid Y1 (dùng create_features_y1_sem1)

train_y1_feats = None
valid_y1_feats = None

if 'Y1' in train_files:
    train_y1_raw = train_files['Y1'].copy()
    train_y1_feats = create_features_y1_sem1(train_y1_raw)
    print(f'Đã tạo features cho train Y1: {train_y1_feats.shape}')
if 'Y1' in valid_files:
    valid_y1_raw = valid_files['Y1'].copy()
    valid_y1_feats = create_features_y1_sem1(valid_y1_raw)
    print(f'Đã tạo features cho valid Y1: {valid_y1_feats.shape}')

  r = fresh_df.groupby(['_n', '_t'])['DIEM_TRUNGTUYEN'].fillna(0).rank(pct=True)
  r = fresh_df.groupby(['_n', '_t'])['DIEM_TRUNGTUYEN'].fillna(0).rank(pct=True)


Đã tạo features cho train Y1: (25027, 23)
Đã tạo features cho valid Y1: (3504, 23)


In [39]:
# I.3 Xử lý PTXT_GROUP cho Y1: Tạo PTXT_TOP từ train data và gán cho train/valid/test

# Tính PTXT_TOP từ train Y1 (các giá trị phổ biến nhất)
if train_y1_feats is not None and 'PTXT' in train_y1_feats.columns:
    ptxt_counts = train_y1_feats['PTXT'].astype(str).value_counts()
    PTXT_TOP = ptxt_counts.head(10).index.tolist()
    print(f'PTXT_TOP (10 giá trị phổ biến nhất): {PTXT_TOP}')
    
    # Gán PTXT_GROUP: giữ nguyên nếu trong TOP, còn lại gán "1"
    for df_name, df_feats in [('train', train_y1_feats), ('valid', valid_y1_feats), ('test', test_y1_feats if 'test_y1_feats' in locals() else None)]:
        if df_feats is not None and 'PTXT' in df_feats.columns:
            df_feats['PTXT_GROUP'] = df_feats['PTXT'].astype(str)
            df_feats.loc[~df_feats['PTXT_GROUP'].isin(PTXT_TOP), 'PTXT_GROUP'] = '1'
    print('Đã tạo PTXT_GROUP cho train, valid, test Y1')
else:
    PTXT_TOP = []
    print('⚠ Chưa có train_y1_feats hoặc cột PTXT để tính PTXT_TOP')

PTXT_TOP (10 giá trị phổ biến nhất): ['1', '100', '409', '402', '500', '200']
Đã tạo PTXT_GROUP cho train, valid, test Y1


In [40]:
# I.4 Gán features cho Y1 (dùng FEATS_Y1)

train_y1_final = None
valid_y1_final = None
test_y1_final = None

if train_y1_feats is not None:
    cols = [c for c in FEATS_Y1 if c in train_y1_feats.columns]
    train_y1_final = train_y1_feats[['MA_SO_SV'] + cols].copy()
    print(f'Train Y1 features: {train_y1_final.shape}')
if valid_y1_feats is not None:
    cols = [c for c in FEATS_Y1 if c in valid_y1_feats.columns]
    valid_y1_final = valid_y1_feats[['MA_SO_SV'] + cols].copy()
    print(f'Valid Y1 features: {valid_y1_final.shape}')
if test_y1_feats is not None:
    cols = [c for c in FEATS_Y1 if c in test_y1_feats.columns]
    test_y1_final = test_y1_feats[['MA_SO_SV'] + cols].copy()
    print(f'Test Y1 features: {test_y1_final.shape}')
    print(f'Features: {cols}')
    display(test_y1_final.head())

Train Y1 features: (25027, 5)
Valid Y1 features: (3504, 5)
Test Y1 features: (4326, 5)
Features: ['Rank_In_Major', 'Gap_Score', 'TOHOP_GROUP', 'PTXT_GROUP']


Unnamed: 0,MA_SO_SV,Rank_In_Major,Gap_Score,TOHOP_GROUP,PTXT_GROUP
0,7f9d7b4e7e62,0.955155,5.31,KHTN,100
1,8c9226f36525,0.256241,5.68,Khac,100
2,0b58da4d395d,0.348706,1.24,KHTN,100
3,0e73af55556e,0.953421,4.85,Khac,100
4,c15afd37f75b,0.51387,2.37,KHTN,100


In [41]:
# I.5 Lưu features cho Y1 (chỉ train và test; valid gộp vào valid_features.csv ở II.2)

saved = []
if train_y1_final is not None:
    p = os.path.join(DATA_DIR, 'train_Y1_features.csv')
    train_y1_final.to_csv(p, index=False)
    saved.append(f'train: {p} ({len(train_y1_final)} rows)')
if test_y1_final is not None:
    p = os.path.join(DATA_DIR, 'test_Y1_features.csv')
    test_y1_final.to_csv(p, index=False)
    saved.append(f'test: {p} ({len(test_y1_final)} rows)')

for s in saved:
    print(f'Đã lưu {s}')
if not saved:
    print('Chưa có dữ liệu features để lưu (cần chạy data_preprocessing và lưu valid trước).')

Đã lưu train: data/train_Y1_features.csv (25027 rows)
Đã lưu test: data/test_Y1_features.csv (4326 rows)


In [42]:
# === II.0 Hàm create_features cho Y2+ ===

def create_features_y2plus(df_in):
    df = df_in.copy()
    
    # Chuẩn hóa cột
    if 'Hoc_Ky' not in df.columns and 'HOC_KY' in df.columns:
        df['Hoc_Ky'] = df['HOC_KY']
    if 'Nam_Hoc' not in df.columns and 'NAM_HOC' in df.columns:
        df['Nam_Hoc'] = df['NAM_HOC']
    df['Nam_Hoc'] = df['Nam_Hoc'].astype(str)
    df['_year'] = df['Nam_Hoc'].str.split('-').str[0].astype(float)
    df['_hk'] = pd.to_numeric(df['Hoc_Ky'], errors='coerce').fillna(1).astype(int)
    df = df.sort_values(['MA_SO_SV', '_year', '_hk']).reset_index(drop=True)

    # Gap_Score, is_Covid
    df['Gap_Score'] = (df['DIEM_TRUNGTUYEN'].fillna(0) - df['DIEM_CHUAN'].fillna(0))
    df['is_Covid'] = df['Nam_Hoc'].isin(['2020-2021', '2021-2022']).astype(int)

    grp = df.groupby('MA_SO_SV')
    
    # Lag: Prev_GPA, Prev_CPA (shift 1 kỳ)
    df['Prev_CPA'] = grp['CPA'].shift(1)
    df['Prev_GPA'] = grp['GPA'].shift(1)
    
    # History: History_Completion_Rate, History_Fail_Credits, Avg_History_Load
    cum_reg = grp['TC_DANGKY'].cumsum().shift(1)
    cum_comp = grp['TC_HOANTHANH'].fillna(0).cumsum().shift(1)
    df['History_Completion_Rate'] = np.where(cum_reg > 0, cum_comp / cum_reg, 0)
    df['_fail'] = (df['TC_DANGKY'].fillna(0) - df['TC_HOANTHANH'].fillna(0)).clip(lower=0)
    df['History_Fail_Credits'] = np.maximum(0, grp['_fail'].cumsum().shift(1))
    df['Semester_No'] = grp.cumcount() + 1
    avg_load = grp['TC_HOANTHANH'].fillna(0).cumsum().shift(1) / np.maximum(df['Semester_No'] - 1, 1)
    avg_load = np.where(avg_load <= 0, df['TC_DANGKY'].fillna(0), avg_load)
    df['Avg_History_Load'] = avg_load
    df['Load_Ratio'] = np.where(avg_load > 0, df['TC_DANGKY'] / avg_load, 1.0)

    # Xử lý SV năm 1 kỳ 1: Prev_CPA/Prev_GPA = (DIEM_TRUNGTUYEN/30)*4, History=1, Load_Ratio=1
    is_fresh = (df['Semester_No'] == 1)
    diem_proxy = (df['DIEM_TRUNGTUYEN'].fillna(0) / 30) * 4
    df.loc[is_fresh, 'Prev_CPA'] = diem_proxy[is_fresh]
    df.loc[is_fresh, 'Prev_GPA'] = diem_proxy[is_fresh]
    df.loc[is_fresh, 'History_Completion_Rate'] = 1.0
    df.loc[is_fresh, 'History_Fail_Credits'] = 0
    df.loc[is_fresh, 'Load_Ratio'] = 1.0

    # Y2+: TOHOP_GROUP (KHTN/XH/Khác), PTXT_GROUP — thay thế TOHOP_XT, PTXT (trùng với cell 2)
    if 'TOHOP_XT' in df.columns:
        _first = df['TOHOP_XT'].astype(str).str.strip().str.upper().str[0]
        df['TOHOP_GROUP'] = np.where(_first.isin(['A', 'B']), 'KHTN', np.where(_first == 'C', 'XH', 'Khac'))
        df = df.drop(columns=['TOHOP_XT'], errors='ignore')
    else:
        df['TOHOP_GROUP'] = 'Khac'
    if 'PTXT' in df.columns:
        df['PTXT_GROUP'] = df['PTXT'].astype(str)
        df = df.drop(columns=['PTXT'], errors='ignore')
    else:
        df['PTXT_GROUP'] = '1'

    return df

# Định nghĩa feature cho Y2+ (dùng TOHOP_GROUP, PTXT_GROUP — giống cell 2)
NUMERIC_FEATURES_Y2PLUS = [
    'Load_Ratio', 'Prev_CPA', 'Prev_GPA',
    'History_Completion_Rate', 'History_Fail_Credits',
    'Semester_No', 'Gap_Score', 'is_Covid'
]
CATEGORICAL_FEATURES_Y2PLUS = ['TOHOP_GROUP', 'PTXT_GROUP']
FEAT_COLS_Y2PLUS = NUMERIC_FEATURES_Y2PLUS + CATEGORICAL_FEATURES_Y2PLUS

print("Đã định nghĩa create_features_y2plus và FEAT_COLS_Y2PLUS")
print(f"NUMERIC_FEATURES: {NUMERIC_FEATURES_Y2PLUS}")
print(f"CATEGORICAL_FEATURES (Y2+): {CATEGORICAL_FEATURES_Y2PLUS}")

Đã định nghĩa create_features_y2plus và FEAT_COLS_Y2PLUS
NUMERIC_FEATURES: ['Load_Ratio', 'Prev_CPA', 'Prev_GPA', 'History_Completion_Rate', 'History_Fail_Credits', 'Semester_No', 'Gap_Score', 'is_Covid']
CATEGORICAL_FEATURES (Y2+): ['TOHOP_GROUP', 'PTXT_GROUP']


## II. Tạo features cho Y2, Y3, Y4, Y5, Y6, Y7

In [43]:
# II.1 (Legacy) FEATS_Y2_Y7 - không dùng; Y2-Y6 dùng FEAT_COLS_Y2PLUS (giống train_by_year)

FEATS_Y2_Y7 = [
    'Prev_GPA', 'Prev_CPA',
    'Recent_Completion_Rate', 'LongTerm_Completion_Rate',
    'is_Covid', 'Semester_No', 'Gap_Semester_Count',
    'GPA_Momentum', 'Workload_Risk',
    'Record_Gap_Flag', 'Recovery_Potential', 'Heavy_Load_Flag',
]
print('FEATS_Y2_Y7:', FEATS_Y2_Y7)

FEATS_Y2_Y7: ['Prev_GPA', 'Prev_CPA', 'Recent_Completion_Rate', 'LongTerm_Completion_Rate', 'is_Covid', 'Semester_No', 'Gap_Semester_Count', 'GPA_Momentum', 'Workload_Risk', 'Record_Gap_Flag', 'Recovery_Potential', 'Heavy_Load_Flag']


In [44]:
# II.2 Tạo features: train/valid theo từng năm; test Y2+ một file gộp (có Year_of_Study) giống train_by_year

try:
    _ = PTXT_TOP
except NameError:
    PTXT_TOP = []
PTXT_TOP_Y2PLUS = []

# Tính features đầy đủ từ combined_cleaned
if combined_cleaned is not None:
    print('Tính features đầy đủ từ final_combined_cleaned (Y2+ dùng create_features_y2plus)...')
    combined_feats = create_features_y2plus(combined_cleaned.copy())
    if 'Year_of_Study' not in combined_feats.columns:
        combined_feats['_year_start'] = combined_feats['Nam_Hoc'].str.split('-').str[0].astype(float)
        combined_feats['Year_of_Study'] = (combined_feats['_year_start'] - combined_feats['NAM_TUYENSINH'] + 1).clip(lower=1).fillna(1).astype(int)
        combined_feats = combined_feats.drop(columns=['_year_start'], errors='ignore')
    train_mask = (
        ((combined_feats['Nam_Hoc'] >= '2020-2021') & (combined_feats['Nam_Hoc'] < '2023-2024')) |
        ((combined_feats['Nam_Hoc'] == '2023-2024') & (combined_feats['Hoc_Ky'] == 1))
    )
    valid_mask = (combined_feats['Nam_Hoc'] == '2023-2024') & (combined_feats['Hoc_Ky'] == 2)
    train_feats_all = combined_feats[train_mask].copy()
    valid_feats_all = combined_feats[valid_mask].copy()
    # PTXT_GROUP Y2+: giữ TOP từ train Y2+, còn lại gán '1' (đồng dạng Y1)
    if 'PTXT_GROUP' in train_feats_all.columns:
        ptxt_counts_y2p = train_feats_all['PTXT_GROUP'].astype(str).value_counts()
        PTXT_TOP_Y2PLUS = ptxt_counts_y2p.head(10).index.tolist()
        for _df in [train_feats_all, valid_feats_all]:
            _df.loc[~_df['PTXT_GROUP'].isin(PTXT_TOP_Y2PLUS), 'PTXT_GROUP'] = '1'
        print(f'  PTXT_TOP Y2+ (10 giá trị phổ biến): {PTXT_TOP_Y2PLUS}')
    print(f'  Train features: {len(train_feats_all)} rows | Valid features: {len(valid_feats_all)} rows')
else:
    train_feats_all = None
    valid_feats_all = None
    PTXT_TOP_Y2PLUS = []

# Test Y2+: một file gộp (giống train_by_year) — dùng test_con_lai.csv, create_features một lần, lưu 1 file có Year_of_Study
test_y2plus_feats_path = os.path.join(DATA_DIR, 'test_Y2plus_features.csv')
if test_con_lai_df is not None and combined_cleaned is not None:
    test_y2plus_raw = test_con_lai_df.copy()
    for col in ['CPA', 'GPA', 'TC_HOANTHANH']:
        if col not in test_y2plus_raw.columns:
            test_y2plus_raw[col] = np.nan
    if 'Nam_Hoc' not in test_y2plus_raw.columns and 'HOC_KY' in test_y2plus_raw.columns:
        test_y2plus_raw['Nam_Hoc'] = test_y2plus_raw['HOC_KY'].astype(str).str.split().str[-1]
    if 'Hoc_Ky' not in test_y2plus_raw.columns and 'HOC_KY' in test_y2plus_raw.columns:
        test_y2plus_raw['Hoc_Ky'] = np.where(test_y2plus_raw['HOC_KY'].astype(str).str.contains('HK1'), 1, 2)
    if 'Year_of_Study' not in test_y2plus_raw.columns:
        _ys = test_y2plus_raw['Nam_Hoc'].astype(str).str.split('-').str[0].astype(float)
        test_y2plus_raw['Year_of_Study'] = (_ys - pd.to_numeric(test_y2plus_raw['NAM_TUYENSINH'], errors='coerce') + 1).clip(lower=1).fillna(1).astype(int)
    test_y2plus_raw['__is_test__'] = 1
    base = combined_cleaned.copy()
    base['__is_test__'] = 0
    combined_plus = pd.concat([base, test_y2plus_raw], ignore_index=True, sort=False)
    combined_plus_feats = create_features_y2plus(combined_plus)
    if 'Year_of_Study' not in combined_plus_feats.columns:
        _ys = combined_plus_feats['Nam_Hoc'].astype(str).str.split('-').str[0].astype(float)
        combined_plus_feats['Year_of_Study'] = (_ys - combined_plus_feats['NAM_TUYENSINH'] + 1).clip(lower=1).fillna(1).astype(int)
    tt = combined_plus_feats[combined_plus_feats['__is_test__'] == 1].copy()
    tt = tt.drop(columns=['__is_test__'], errors='ignore')
    if PTXT_TOP_Y2PLUS and 'PTXT_GROUP' in tt.columns:
        tt.loc[~tt['PTXT_GROUP'].isin(PTXT_TOP_Y2PLUS), 'PTXT_GROUP'] = '1'
    cols = [c for c in FEAT_COLS_Y2PLUS if c in tt.columns]
    out_cols = ['MA_SO_SV', 'Year_of_Study'] + cols
    tt[out_cols].to_csv(test_y2plus_feats_path, index=False)
    print(f'  Test Y2+ (1 file gộp): {test_y2plus_feats_path} ({len(tt)} rows)')

for year in range(2, 8):
    key_train = f'Y{year}'
    key_test = f'Y{year}' if year <= 7 else None
    
    train_final = None
    
    # Train: filter từ train_feats_all theo Year_of_Study
    if train_feats_all is not None and 'Year_of_Study' in train_feats_all.columns:
        tr = train_feats_all[train_feats_all['Year_of_Study'] == year].copy()
        if len(tr) > 0:
            cols = [c for c in FEAT_COLS_Y2PLUS if c in tr.columns]
            train_final = tr[['MA_SO_SV'] + cols].copy()
    elif key_train in train_files:
        # Fallback: tính từ train_files riêng lẻ (có thể thiếu Prev_GPA/Prev_CPA)
        tr = create_features_y2plus(train_files[key_train].copy())
        cols = [c for c in FEAT_COLS_Y2PLUS if c in tr.columns]
        train_final = tr[['MA_SO_SV'] + cols].copy()
    
    if train_final is not None:
        p = os.path.join(DATA_DIR, f'train_Y{year}_features.csv')
        train_final.to_csv(p, index=False)
        print(f'Y{year} train: {p} ({len(train_final)} rows)')
    if train_final is None:
        print(f'Y{year}: không có dữ liệu')

# Valid: tách riêng Y1 (feature khác Y2+) và Y2+ gộp một file
# Valid Y1: valid_Y1_features.csv (MA_SO_SV + FEATS_Y1 + TC_DANGKY + TC_HOANTHANH) — Y1 không có cột Ratio, training sẽ tính từ TC_*
if valid_y1_final is not None and 'Y1' in valid_files:
    y1_df = valid_files['Y1'].set_index('MA_SO_SV')
    y1_rows = y1_df.loc[valid_y1_final['MA_SO_SV']]
    valid_y1_out = valid_y1_final.copy()
    valid_y1_out['TC_DANGKY'] = y1_rows['TC_DANGKY'].values
    valid_y1_out['TC_HOANTHANH'] = y1_rows['TC_HOANTHANH'].values
    p_y1 = os.path.join(DATA_DIR, 'valid_Y1_features.csv')
    valid_y1_out.to_csv(p_y1, index=False)
    print(f'Valid Y1: {p_y1} ({len(valid_y1_out)} rows)')
# Valid Y2+: valid_Y2plus_features.csv (MA_SO_SV + Year_of_Study + FEAT_COLS_Y2PLUS + Ratio + TC_HOANTHANH)
if valid_feats_all is not None and 'Year_of_Study' in valid_feats_all.columns:
    vd = valid_feats_all[valid_feats_all['Year_of_Study'] >= 2].copy()
    if len(vd) > 0:
        tc_dk = vd['TC_DANGKY'].replace(0, np.nan)
        vd['Ratio'] = np.clip((vd['TC_HOANTHANH'] / tc_dk).fillna(0), 0, 1)
        cols = [c for c in FEAT_COLS_Y2PLUS if c in vd.columns]
        valid_y2p = vd[['MA_SO_SV', 'Year_of_Study'] + cols + ['Ratio', 'TC_DANGKY', 'TC_HOANTHANH']].copy()
        p_y2p = os.path.join(DATA_DIR, 'valid_Y2plus_features.csv')
        valid_y2p.to_csv(p_y2p, index=False)
        print(f'Valid Y2+ (1 file): {p_y2p} ({len(valid_y2p)} rows)')

Tính features đầy đủ từ final_combined_cleaned (Y2+ dùng create_features_y2plus)...


  cum_comp = grp['TC_HOANTHANH'].fillna(0).cumsum().shift(1)
  avg_load = grp['TC_HOANTHANH'].fillna(0).cumsum().shift(1) / np.maximum(df['Semester_No'] - 1, 1)


  PTXT_TOP Y2+ (10 giá trị phổ biến): ['1', '100', '409', '500', '3', '402', '5', '200']
  Train features: 90582 rows | Valid features: 15144 rows


  cum_comp = grp['TC_HOANTHANH'].fillna(0).cumsum().shift(1)
  avg_load = grp['TC_HOANTHANH'].fillna(0).cumsum().shift(1) / np.maximum(df['Semester_No'] - 1, 1)


  Test Y2+ (1 file gộp): data/test_Y2plus_features.csv (12176 rows)
Y2 train: data/train_Y2_features.csv (23154 rows)
Y3 train: data/train_Y3_features.csv (20940 rows)
Y4 train: data/train_Y4_features.csv (13848 rows)
Y5 train: data/train_Y5_features.csv (6822 rows)
Y6 train: data/train_Y6_features.csv (791 rows)
Y7: không có dữ liệu
Valid Y1: data/valid_Y1_features.csv (3504 rows)
Valid Y2+ (1 file): data/valid_Y2plus_features.csv (11640 rows)
