# 1. Import and configure

In [None]:
import os
import pandas as pd
import numpy as np
import logging
from src.data.processing import load_data, process_status, summarize_projects, fill_nan_values, encode_categorical_columns, normalize_numerical_columns, encode_cyclical_time_features, save_projects_to_files, drop_low_importance_features, add_build_features
from src.data.visualization import plot_line, plot_pie, plot_multi_project, plot_feature_importance
from src.data.feature_analysis import prepare_features, print_nan_columns, aggregate_feature_importance
# New imports

# Notebook-wide pandas display settings: do not wrap wide frames across
# several blocks, and lift the cap on the number of columns shown.
display_options = {
    'display.expand_frame_repr': False,
    'display.max_columns': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# INFO-level logging plus a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# 2. Load and summarize data

In [None]:
# Load the combined CSV and pass it through process_status on the
# 'tr_status' column, keeping an independent copy of the result.
combined_df = process_status(
    load_data("../data/combined/combined_travistorrent.csv"),
    'tr_status',
).copy()

# Per-project summary computed with min_rows=50000 and balance_threshold=0.7.
summary_df = summarize_projects(
    combined_df,
    min_rows=50000,
    balance_threshold=0.7,
)
display(summary_df)

# Work with the first 10 projects listed in the summary.
selected_projects = summary_df['project'].iloc[:10].tolist()

# One DataFrame per project, keyed by its 'gh_project_name' value.
dfs = dict(tuple(combined_df.groupby('gh_project_name')))

# 3. Analyze NaN values

In [None]:
# Stack the frames of the selected projects (skipping names missing from
# `dfs`) into a single frame with a fresh index.
selected_dfs = [dfs[name] for name in selected_projects if name in dfs]
selected_projects_df = pd.concat(selected_dfs, ignore_index=True)

# Fraction of missing values per column, highest first; only columns that
# actually contain NaN are displayed.
nan_ratios = (
    selected_projects_df
    .isna()
    .mean()
    .sort_values(ascending=False)
)
print("Tỷ lệ NaN trên toàn bộ project:")
display(nan_ratios.loc[nan_ratios > 0])

# Distribution of rows by how many of their columns are NaN.
nan_per_row = selected_projects_df.isna().sum(axis=1)
nan_counts = nan_per_row.value_counts().sort_index()
total_rows = len(selected_projects_df)
ratio_percent = (nan_counts / total_rows * 100).round(2)
table_df = pd.DataFrame({
    'NaN columns number': nan_counts.index,
    'Row number': nan_counts.values,
    'Ratio (%)': ratio_percent,
})
print("\nTable of ratio of rows by number of NaN columns:")
display(table_df)
plot_pie(table_df, 'NaN columns number', 'Ratio of rows to number of columns NaN')

# 4. Visualize features

## 4.1. For 1 project

In [None]:
# Line plots of a few selected columns for one project, drawn from a
# private copy of that project's frame.
project = 'DataDog/dd-agent'
df = dfs[project].copy()
single_project_columns = (
    'tr_duration',
    'gh_num_issue_comments',
    'gh_num_pr_comments',
    'tr_log_num_tests_failed',
)
for col in single_project_columns:
    plot_line(df, col, project)

## 4.2. For all projects

In [None]:
# Map the string spellings of the core-team flag to floats, then cast the
# whole column to float.
core_member_map = {
    'True': 1.0, '1.0': 1.0,
    'False': 0.0, '0.0': 0.0,
}
core_member_flag = selected_projects_df['gh_by_core_team_member'].replace(core_member_map)
selected_projects_df['gh_by_core_team_member'] = core_member_flag.astype(float).copy()
print("Phân tích feature trên tất cả dự án:")

# Plot every column that contains at least one NaN.
has_nan = selected_projects_df.isna().any()
columns_to_plot = selected_projects_df.columns[has_nan].tolist()
for col in columns_to_plot:
    plot_multi_project(selected_projects_df, col)

# 5. Processing

In [None]:
# Order builds by project and start time, keep only the three listed
# statuses, and derive the binary target: 'passed' -> 0,
# 'failed' / 'errored' -> 1.
build_order = ['gh_project_name', 'gh_build_started_at']
final_statuses = ['failed', 'errored', 'passed']
failure_label = {'passed': 0, 'failed': 1, 'errored': 1}

sorted_df = selected_projects_df.sort_values(by=build_order).copy()
has_final_status = sorted_df['tr_status'].isin(final_statuses)
sorted_df = sorted_df[has_final_status]
sorted_df['build_failed'] = sorted_df['tr_status'].map(failure_label)

# Hand the labelled frame to fill_nan_values and inspect the result.
df_notnan = fill_nan_values(sorted_df)
print_nan_columns(df=df_notnan)

# 6. Add new features and merge job values

## 6.1. Add new features

In [None]:
# Run add_build_features on the NaN-filled frame, then call
# print_nan_columns on the result (presumably lists columns that still
# contain NaN — confirm against src.data.feature_analysis).
new_feature_df = add_build_features(df_notnan)
print_nan_columns(df=new_feature_df)

In [None]:
# Group the rows by tr_build_id so builds that span several rows can be inspected.
grouped = new_feature_df.groupby('tr_build_id')

# Collect the groups that have more than one row AND whose rows are not all
# identical once tr_build_id itself is ignored.
non_identical_duplicates = []

for build_id, group in grouped:
    if len(group) < 2:
        continue
    # duplicated() never flags the first occurrence, so testing
    # `.duplicated().all()` is False for every group and would report fully
    # identical groups too. Count the distinct rows instead.
    distinct_rows = group.drop(columns='tr_build_id').drop_duplicates()
    if len(distinct_rows) > 1:
        non_identical_duplicates.append(group)

# Concatenate the matching groups; fall back to an empty frame with the same
# columns so that result_df is always defined for later cells.
if non_identical_duplicates:
    result_df = pd.concat(non_identical_duplicates)
else:
    result_df = new_feature_df.iloc[0:0]
    print("Không có dòng nào có cùng tr_build_id mà khác nội dung.")

In [None]:
# Preview the first 10 rows of result_df.
result_df.head(10)

In [None]:
# Rows sharing the same project, start time, outcome and build id are
# collapsed into a single row.
group_keys = ['gh_project_name', 'gh_build_started_at', 'build_failed', 'tr_build_id']

# Column -> aggregation applied within each group. The insertion order is
# kept exactly as in the original mapping.
aggregations = {
    'gh_num_issue_comments': 'sum',
    'gh_num_pr_comments': 'sum',
    'gh_team_size': 'mean',
    'gh_sloc': 'mean',
    'git_diff_src_churn': 'sum',
    'git_diff_test_churn': 'sum',
    'gh_diff_files_added': 'sum',
    'gh_diff_files_deleted': 'sum',
    'gh_diff_files_modified': 'sum',
    'gh_diff_tests_added': 'sum',
    'gh_diff_tests_deleted': 'sum',
    'gh_diff_src_files': 'sum',
    'gh_diff_doc_files': 'sum',
    'gh_diff_other_files': 'sum',
    'gh_num_commits_on_files_touched': 'sum',
    'gh_test_lines_per_kloc': 'mean',
    'gh_test_cases_per_kloc': 'mean',
    'gh_asserts_cases_per_kloc': 'mean',
    'gh_is_pr': 'max',
    'gh_by_core_team_member': 'max',
    'gh_num_commit_comments': 'sum',
    'tr_log_num_tests_failed': 'sum',
    'tr_duration': 'max',
    # Aggregations for the newly added features
    'year_of_start': 'first',
    'month_of_start': 'first',
    'day_of_start': 'first',
    'hour_of_start': 'first',
    'elapsed_days_last_build': 'first',
    'same_committer': 'max',
    'proj_fail_rate_history': 'mean',
    'proj_fail_rate_recent': 'mean',
    'comm_fail_rate_history': 'mean',
    'comm_fail_rate_recent': 'mean',
    'comm_avg_experience': 'mean',
    'no_config_edited': 'max',
    'num_files_edited': 'sum',
    'num_distinct_authors': 'max',
    'prev_build_result': 'first',
    'day_week': 'first',
}

merged_df = (
    new_feature_df
    .groupby(group_keys, as_index=False)
    .agg(aggregations)
)
merged_df.drop_duplicates(inplace=True)
print_nan_columns(df=merged_df)

# 8. Encoding

In [None]:
# Both the categorical and the cyclical column lists are empty here, so the
# calendar fields named in `periods` are passed to normalisation together
# with the other numeric columns.
categorical_columns = []
cyclical_time_columns = []
periods = {
    "month_of_start": 12,
    "day_of_start": 31,
    "hour_of_start": 24,
    "day_week": 7,
}

# The numeric column list is assembled from three ordered sub-lists; the
# resulting order matches the original flat list.
raw_metric_columns = [
    "gh_num_issue_comments",
    "gh_num_pr_comments",
    "gh_team_size",
    "gh_sloc",
    "git_diff_src_churn",
    "git_diff_test_churn",
    "gh_diff_files_added",
    "gh_diff_files_deleted",
    "gh_diff_files_modified",
    "gh_diff_tests_added",
    "gh_diff_tests_deleted",
    "gh_diff_src_files",
    "gh_diff_doc_files",
    "gh_diff_other_files",
    "gh_num_commits_on_files_touched",
    "gh_test_lines_per_kloc",
    "gh_test_cases_per_kloc",
    "gh_asserts_cases_per_kloc",
    "gh_num_commit_comments",
    "tr_log_num_tests_failed",
    "tr_duration",
]
engineered_columns = [
    "year_of_start",
    "elapsed_days_last_build",
    "proj_fail_rate_history",
    "proj_fail_rate_recent",
    "comm_fail_rate_history",
    "comm_fail_rate_recent",
    "comm_avg_experience",
    "num_files_edited",
    "num_distinct_authors",
]
calendar_columns = [
    "month_of_start",
    "day_of_start",
    "hour_of_start",
    "day_week",
]
numerical_columns = [*raw_metric_columns, *engineered_columns, *calendar_columns]

# Pipeline on a copy of the merged frame: categorical encoding, cyclical
# time encoding, then numeric normalisation.
trans_df = merged_df.copy()
trans_df_encoded, _ = encode_categorical_columns(trans_df, categorical_columns)
trans_df_cyclical = encode_cyclical_time_features(
    trans_df_encoded,
    cyclical_time_columns,
    periods,
)
trans_df_processed, _ = normalize_numerical_columns(trans_df_cyclical, numerical_columns)

# 9. Analyze feature importance

In [None]:
# Show the column index of the processed frame.
trans_df_processed.columns

In [None]:
# Remove the failed-test count before the importance analysis
# (NOTE(review): presumably because it leaks the build outcome — confirm).
# errors='ignore' keeps this cell re-runnable: the drop is in-place, so a
# second execution would otherwise raise KeyError on the missing column.
trans_df_processed.drop(columns=['tr_log_num_tests_failed'], inplace=True, errors='ignore')

In [None]:
# Split the processed frame into features X and target y ('build_failed'),
# compute an importance table with aggregate_feature_importance, and plot it.
print("Analysis of the importance of features for 'build_failed':")
X, y = prepare_features(trans_df_processed, target_column='build_failed')
importance_df = aggregate_feature_importance(X, y)
plot_feature_importance(importance_df)

# !Remove low importance features (optional)

In [None]:
# Apply drop_low_importance_features to the processed frame using
# importance_df and threshold=0.005.
# The second return value is bound to `feature_list` instead of `list`, so the
# builtin `list` is no longer shadowed for the rest of the session
# (TODO confirm whether it holds the kept or the dropped feature names).
final_df, feature_list = drop_low_importance_features(
    X=trans_df_processed,
    importance_df=importance_df,
    threshold=0.005,
)
print_nan_columns(df=final_df)

# 10. Save processed data

In [None]:
# Hand the final frame to save_projects_to_files with the output directory
# '../data/processed-local' and the column name 'gh_project_name'
# (presumably one file per project — confirm in src.data.processing).
saved_files = save_projects_to_files(final_df, '../data/processed-local', 'gh_project_name')

In [None]:
# Show the column index of the final frame.
final_df.columns

In [None]:
# Show which projects remain in the final dataset. Only the last expression
# of a cell is rendered automatically, so this intermediate result needs an
# explicit display() call to be visible.
display(final_df['gh_project_name'].unique())
# Summarize the final data with min_rows=0 and balance_threshold=1.
summarize_projects(final_df, min_rows=0, balance_threshold=1)