In [None]:
import pandas as pd
import glob
import os
from collections import defaultdict

## 1. Get columns

In [None]:
folder_path = "../data/raw/"
# glob.glob returns matches in arbitrary, filesystem-dependent order. Sort them
# so that the order in which column layouts are first seen -- and therefore the
# group numbering, which is later used to pick the "Group{i}" entry of
# `mappings` -- is the same on every machine and on every run.
csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
column_groups = defaultdict(list)

# Bucket the files by their exact header (ordered tuple of column names).
for file_path in csv_files:
    try:
        df = pd.read_csv(file_path, nrows=0)  # nrows=0: parse the header only
        columns = tuple(df.columns.tolist())
        column_groups[columns].append(os.path.basename(file_path))
    except Exception as e:
        # Best effort: report the unreadable file and keep grouping the rest.
        print(f"Lỗi đọc {file_path}: {e}")

# One list of full paths per distinct header, in first-seen order
# (dicts preserve insertion order, so group 1 is the layout of the first file).
groups_files = []

for i, files in enumerate(column_groups.values(), start=1):
    full_paths = [os.path.join(folder_path, f) for f in files]
    groups_files.append(full_paths)
    print(f"group{i}_files = {full_paths}  # {len(files)} files")

## 2. Mapping

In [None]:
# The 25 target columns of the combined dataset, in output order.
target_columns = [
    "tr_status", "gh_project_name", "gh_is_pr", "gh_team_size",
    "git_num_commits",
    "gh_num_issue_comments", "gh_num_commit_comments", "gh_num_pr_comments",
    "git_diff_src_churn", "git_diff_test_churn",
    "gh_diff_files_added", "gh_diff_files_deleted", "gh_diff_files_modified",
    "gh_diff_tests_added", "gh_diff_tests_deleted",
    "gh_diff_src_files", "gh_diff_doc_files", "gh_diff_other_files",
    "gh_num_commits_on_files_touched",
    "gh_sloc",
    "gh_test_lines_per_kloc", "gh_test_cases_per_kloc", "gh_asserts_cases_per_kloc",
    "gh_by_core_team_member",
    "gh_build_started_at",
]


def _with_overrides(**renamed):
    """Return a {target: source} mapping that is the identity on every target
    column except the ones given as keyword arguments (target=source)."""
    column_map = {name: name for name in target_columns}
    column_map.update(renamed)
    return column_map


# Per-group column mapping: {group name: {target column: column name in that group's files}}.
mappings = {
    # Group 1 (1274 files): already uses the target names, except that the
    # number of commits in the push stands in for git_num_commits.
    "Group1": _with_overrides(git_num_commits="gh_num_commits_in_push"),
    # Group 2 (travistorrent-2015.csv): every column has a different name.
    "Group2": {
        "tr_status": "status",
        "gh_project_name": "project_name",
        "gh_is_pr": "is_pr",
        "gh_team_size": "team_size",
        "git_num_commits": "num_commits",
        "gh_num_issue_comments": "num_issue_comments",
        "gh_num_commit_comments": "num_commit_comments",
        "gh_num_pr_comments": "num_pr_comments",
        "git_diff_src_churn": "src_churn",
        "git_diff_test_churn": "test_churn",
        "gh_diff_files_added": "files_added",
        "gh_diff_files_deleted": "files_deleted",
        "gh_diff_files_modified": "files_modified",
        "gh_diff_tests_added": "tests_added",
        "gh_diff_tests_deleted": "tests_deleted",
        "gh_diff_src_files": "src_files",
        "gh_diff_doc_files": "doc_files",
        "gh_diff_other_files": "other_files",
        "gh_num_commits_on_files_touched": "commits_on_files_touched",
        "gh_sloc": "sloc",
        "gh_test_lines_per_kloc": "test_lines_per_kloc",
        "gh_test_cases_per_kloc": "test_cases_per_kloc",
        "gh_asserts_cases_per_kloc": "asserts_per_kloc",
        "gh_by_core_team_member": "main_team_member",
        "gh_build_started_at": "started_at",
    },
    # Group 3 (travistorrent-2016.csv): the diff/churn columns are named
    # differently, and tr_started_at is used instead of gh_build_started_at.
    "Group3": _with_overrides(
        git_diff_src_churn="gh_src_churn",
        git_diff_test_churn="gh_test_churn",
        gh_diff_files_added="gh_files_added",
        gh_diff_files_deleted="gh_files_deleted",
        gh_diff_files_modified="gh_files_modified",
        gh_diff_tests_added="gh_tests_added",
        gh_diff_tests_deleted="gh_tests_deleted",
        gh_diff_src_files="gh_src_files",
        gh_diff_doc_files="gh_doc_files",
        gh_diff_other_files="gh_other_files",
        gh_num_commits_on_files_touched="gh_commits_on_files_touched",
        gh_build_started_at="tr_started_at",
    ),
    # Group 4 (travistorrent-2017.csv): same mapping as Group 1.
    "Group4": _with_overrides(git_num_commits="gh_num_commits_in_push"),
}

# Dtypes passed to pd.read_csv for these columns.
dtype_spec = dict.fromkeys(
    ("git_diff_src_churn", "gh_diff_files_modified", "gh_test_lines_per_kloc"),
    "float32",
)

## 3. Combine

In [None]:
def process_files(file_list, group_name, mapping, columns=None, dtypes=None):
    """
    Read the CSV files of one group and normalise them to the target columns.

    Only the columns that are actually needed are parsed from each file. They
    are renamed to their target names and returned in target order.

    Args:
        file_list (list): Paths of the CSV files to read.
        group_name (str): Key of this group's entry in ``mapping``
            (Group1, Group2, ...).
        mapping (dict): Per-group column mappings, shaped as
            ``{group name: {target column: source column in the files}}``.
        columns (list, optional): Target columns to keep, in output order.
            Defaults to the module-level ``target_columns``.
        dtypes (dict, optional): ``{target column: dtype}`` to enforce while
            reading. Defaults to the module-level ``dtype_spec``.

    Returns:
        list: One DataFrame per file that was read successfully and contains
        at least one target column. Files that fail to load are reported with
        ``print`` and skipped; an unknown ``group_name`` yields an empty list.
    """
    if columns is None:
        columns = target_columns
    if dtypes is None:
        dtypes = dtype_spec

    # Resolve the group once, before touching any file, so an unknown group is
    # reported a single time instead of after fully reading every CSV.
    group_mapping = mapping.get(group_name)
    if group_mapping is None:
        print(f"Error processing {group_name}: no column mapping defined for this group")
        return []

    # mapping is target -> source; DataFrame.rename needs source -> target.
    rename_map = {source: target for target, source in group_mapping.items()}

    # dtypes is keyed by *target* names, but read_csv sees the *source* names.
    # Add the source-name aliases so the dtype is applied in every group, not
    # only in the groups whose files already use the target names.
    read_dtypes = dict(dtypes)
    read_dtypes.update(
        {source: dtypes[target] for target, source in group_mapping.items() if target in dtypes}
    )

    # Raw column names worth parsing: mapped sources, plus columns that already
    # carry a target name.
    wanted = set(rename_map) | set(columns)

    dfs = []
    for file in file_list:
        try:
            # Cheap header-only read to find out which wanted columns exist,
            # so the full read can skip everything else.
            header = pd.read_csv(file, nrows=0).columns
            keep = [col for col in header if col in wanted]
            if not keep:
                print(f"Warning: No target columns found in {file}")
                continue

            df = pd.read_csv(
                file,
                usecols=keep,
                dtype={col: read_dtypes[col] for col in keep if col in read_dtypes},
                low_memory=False,
            )
            df = df.rename(columns=rename_map)
            available_columns = [col for col in columns if col in df.columns]
            if available_columns:
                df = df[available_columns]
                dfs.append(df)
            else:
                print(f"Warning: No target columns found in {file}")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole group.
            print(f"Error processing {file}: {e}")
    return dfs

# Run every group of files through process_files. The 1-based position of a
# group in groups_files selects its "Group{i}" entry in `mappings`.
group_dfs = []
for group_number, paths in enumerate(groups_files, start=1):
    label = f"Group{group_number}"
    processed = process_files(paths, label, mappings)
    print(f"Processed {len(processed)} DataFrames for {label}")
    group_dfs.append(processed)

# Flatten the per-group lists into one list, skipping frames that are empty or
# contain only missing values, and dropping columns that are entirely NaN
# within a frame.
all_dfs = [
    frame.dropna(axis="columns", how="all")
    for processed in group_dfs
    for frame in processed
    if not (frame.empty or frame.isna().all().all())
]

# combined_df remains an empty list when no frame survived the filter.
combined_df = []
if not all_dfs:
    print("Không có DataFrame nào được xử lý thành công.")
else:
    stacked = pd.concat(all_dfs, ignore_index=True)
    # Restore the target column order; targets absent from every frame are omitted.
    ordered_columns = [col for col in target_columns if col in stacked.columns]
    combined_df = stacked[ordered_columns]

    print("Kích thước DataFrame cuối cùng:", combined_df.shape)
    print("Các cột trong DataFrame:", combined_df.columns.tolist())

In [None]:
    # Write the combined dataset to disk.
    output_path = "../data/combined/combined_travistorrent.csv"
    # combined_df is the empty-list placeholder when no file could be combined;
    # only a real DataFrame can be written, so report that case instead of
    # failing with an AttributeError.
    if isinstance(combined_df, pd.DataFrame):
        # to_csv does not create missing parent directories.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        combined_df.to_csv(output_path, index=False)
    else:
        print(f"Nothing to save: no combined DataFrame was produced, {output_path} was not written.")