In [1]:
import numpy as np
import pandas as pd

In [51]:
df_log=pd.read_csv('/content/logs_preprocessed.csv')
print(df_log.columns)
print(df_log.shape)

Index(['filename', 'dut', 'dut_version', 'os_version', 'config',
       'test_case_id', 'line_number', 'timestamp', 'run_date', 'status',
       'error_msg', 'suite', 'raw_line'],
      dtype='object')
(8605, 13)


In [14]:
df_cluster=pd.read_csv('/content/failure_patterns_labeled_human.csv')
print(df_cluster.columns)
print(df_cluster.shape)

Index(['filename', 'dut', 'dut_version', 'os_version', 'config',
       'test_case_id', 'line_number', 'timestamp', 'run_date', 'status',
       'error_msg', 'suite', 'raw_line', 'failure_freq_suite',
       'failure_freq_dut', 'execution_duration', 'time_since_last_failure',
       'recent_failure_flag', 'config_hash', 'cluster', 'root_cause_label',
       'keywords'],
      dtype='object')
(8605, 22)


In [15]:
df_classified=pd.read_csv('/content/classified_logs.csv')
print(df_classified.columns)
print(df_classified.shape)

Index(['TestCase', 'DUT', 'Suite', 'PredictedLabel', 'ActualStatus',
       'Confidence', 'LowConfidenceFlag', 'ErrorMsg'],
      dtype='object')
(1721, 8)


In [18]:
df_anomaly = pd.read_csv('/content/anomaly_full_scores.csv')
print(df_anomaly.columns)
print(df_anomaly.shape)

Index(['filename', 'dut', 'dut_version', 'os_version', 'config',
       'test_case_id', 'line_number', 'timestamp', 'run_date', 'status',
       'error_msg', 'suite', 'raw_line', 'failure_freq_suite',
       'failure_freq_dut', 'execution_duration', 'time_since_last_failure',
       'recent_failure_flag', 'config_hash', 'cluster', 'root_cause_label',
       'keywords', 'anomaly_score', 'is_anomaly', 'LogID', 'MessageSnippet'],
      dtype='object')
(8605, 26)


In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# ---------------------------------------------------------
# Load all CSVs
# ---------------------------------------------------------
df_logs       = pd.read_csv('/content/logs_preprocessed.csv')
df_cluster    = pd.read_csv('/content/failure_patterns_labeled_human.csv')
df_classified = pd.read_csv('/content/classified_logs.csv')
df_anomaly    = pd.read_csv('/content/anomaly_full_scores.csv')

# Standardize column names: lowercase and strip
# ---------------------------------------------------------
for df_tmp in [df_logs, df_cluster, df_classified, df_anomaly]:
    df_tmp.columns = df_tmp.columns.str.strip().str.lower()




In [40]:
# Rename df_classified columns to match df_logs
# ---------------------------------------------------------
df_classified = df_classified.rename(columns={
    'testcase': 'test_case_id',
    'dut': 'dut',
    'suite': 'suite'
})

In [50]:
# Merge logs + classified
# Use unique identifier columns to avoid Cartesian product
# ---------------------------------------------------------
# Check for uniqueness
print("Rows before merge:", len(df_logs))
merge_keys_clf = ['test_case_id','suite']  # adjust if more unique identifiers exist

df_classified_unique = df_classified.drop_duplicates(subset=merge_keys_clf)

df = df_logs.merge(df_classified_unique,
                   on=merge_keys_clf,
                   how='left',
                   suffixes=('', '_clf'),
                   validate='many_to_one')  # many logs -> 1 classified row

Rows before merge: 8605


In [42]:
# Merge with cluster
# ---------------------------------------------------------
merge_keys_clu = ['test_case_id','suite']
df_cluster_unique = df_cluster.drop_duplicates(subset=merge_keys_clu)

df = df.merge(df_cluster_unique,
              on=merge_keys_clu,
              how='left',
              suffixes=('', '_clu'),
              validate='many_to_one')

In [44]:
# Check columns in anomaly
print(df_anomaly.columns)

# Rename to match df_logs
df_anomaly = df_anomaly.rename(columns={'testcase': 'test_case_id', 'suite': 'suite', 'filename': 'filename'})

merge_keys_anom = [c for c in ['filename','test_case_id','suite'] if c in df.columns and c in df_anomaly.columns]

if merge_keys_anom:
    df = df.merge(df_anomaly[merge_keys_anom + ['anomaly_score','is_anomaly','messagesnippet']],
                  on=merge_keys_anom,
                  how='left')
else:
    print("Skipping anomaly merge: no common columns found")


Index(['filename', 'dut', 'dut_version', 'os_version', 'config',
       'test_case_id', 'line_number', 'timestamp', 'run_date', 'status',
       'error_msg', 'suite', 'raw_line', 'failure_freq_suite',
       'failure_freq_dut', 'execution_duration', 'time_since_last_failure',
       'recent_failure_flag', 'config_hash', 'cluster', 'root_cause_label',
       'keywords', 'anomaly_score', 'is_anomaly', 'logid', 'messagesnippet'],
      dtype='object')


In [45]:
# Convert timestamp to datetime
# ---------------------------------------------------------
df['executiontime'] = pd.to_datetime(df['timestamp'], errors='coerce')

# ---------------------------------------------------------
# Fill missing anomaly values
# ---------------------------------------------------------
df['anomaly_score'] = df.get('anomaly_score', 0).fillna(0)
df['is_anomaly'] = df.get('is_anomaly', 0).fillna(0).astype(int)

# ---------------------------------------------------------
# 1. Test case features
# ---------------------------------------------------------
df['is_fail'] = df['status'].apply(lambda x: 1 if str(x).lower() == "fail" else 0)
df['past_failure_rate'] = df.groupby('test_case_id')['is_fail']\
                            .transform(lambda x: x.shift().expanding().mean()).fillna(0)

df['last_fail_time'] = df[df['is_fail']==1].groupby('test_case_id')['executiontime'].transform('last')
df['time_since_last_failure'] = (df['executiontime'] - df['last_fail_time']).dt.total_seconds().fillna(0)
df['execution_frequency'] = df.groupby('test_case_id').cumcount()

if 'execution_duration' in df.columns:
    df['avg_duration'] = df.groupby('test_case_id')['execution_duration'].transform(lambda x: x.expanding().mean())
else:
    df['avg_duration'] = 0


In [47]:
# 2. Keyword features
# ---------------------------------------------------------
df['messagesnippet'] = df.get('messagesnippet', "").fillna("")
df['keyword_fail'] = df['messagesnippet'].str.contains(r"fail|error|exception|abort", case=False, regex=True).astype(int)

# ---------------------------------------------------------
# 3. Failure cluster
# ---------------------------------------------------------
df['clusterlabel'] = df['cluster'].fillna(-1) if 'cluster' in df.columns else -1

# ---------------------------------------------------------
# 4. Anomaly features
# ---------------------------------------------------------
df['rolling_anomaly_rate'] = df.groupby('test_case_id')['is_anomaly']\
                               .transform(lambda x: x.shift().expanding().mean()).fillna(0)
df['mean_anomaly_score'] = df.groupby('test_case_id')['anomaly_score'].transform(lambda x: x.expanding().mean())

# ---------------------------------------------------------
# 5. Environment / DUT features
# ---------------------------------------------------------
env_cols = ['dut_version','dut','config','regression_group','schedule_cycle']
encoder = LabelEncoder()
for col in env_cols:
    if col in df.columns:
        df[col] = df[col].fillna('Unknown')
        df[col] = encoder.fit_transform(df[col].astype(str))



In [48]:
# 6. Metadata features
# ---------------------------------------------------------
df['exec_day'] = df['executiontime'].dt.dayofweek
df['exec_hour'] = df['executiontime'].dt.hour

for col in ['buildversion','suite']:
    if col in df.columns:
        df[col] = df[col].fillna('Unknown')
        df[col] = encoder.fit_transform(df[col].astype(str))

# ---------------------------------------------------------
# 7. Target encoding
# ---------------------------------------------------------
df['target'] = df['status'].apply(lambda x: 1 if str(x).lower() == "fail" else 0)

# ---------------------------------------------------------
# Select final ML feature columns
# ---------------------------------------------------------
feature_cols = [
    'test_case_id','executiontime',
    'past_failure_rate','time_since_last_failure','execution_frequency','avg_duration',
    'keyword_fail','clusterlabel','rolling_anomaly_rate','mean_anomaly_score',
    'exec_day','exec_hour','dut_version','dut','config',
    'regression_group','schedule_cycle','buildversion','suite',
    'target'
]
feature_cols = [c for c in feature_cols if c in df.columns]
df_features = df[feature_cols]
# 1. Test case features
# ---------------------------------------------------------
df['is_fail'] = df['status'].apply(lambda x: 1 if str(x).lower() == "fail" else 0)
df['past_failure_rate'] = df.groupby('test_case_id')['is_fail']\
                            .transform(lambda x: x.shift().expanding().mean()).fillna(0)

df['last_fail_time'] = df[df['is_fail']==1].groupby('test_case_id')['executiontime'].transform('last')
df['time_since_last_failure'] = (df['executiontime'] - df['last_fail_time']).dt.total_seconds().fillna(0)
df['execution_frequency'] = df.groupby('test_case_id').cumcount()

if 'execution_duration' in df.columns:
    df['avg_duration'] = df.groupby('test_case_id')['execution_duration'].transform(lambda x: x.expanding().mean())
else:
    df['avg_duration'] = 0

In [49]:
# Save CSV
# ---------------------------------------------------------
df_features.to_csv('/content/feature_engineered_testcases.csv', index=False)
print("Feature Engineering Completed ✔")
print("Saved: feature_engineered_testcases.csv")
print("Final row count:", len(df_features))

Feature Engineering Completed ✔
Saved: feature_engineered_testcases.csv
Final row count: 22137
