# Training notebook: an end-to-end walkthrough of the training process

### Step 0: install packages

In [4]:
# python3.11 -m venv .venv
!pip install scikit-learn pandas



Collecting pandas
  Using cached pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl (11.3 MB)
Using cached pytz-2024.1-py2.py3-none-any.whl (505 kB)
Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.2 pytz-2024.1 tzdata-2024.1


### Step 1: Read the `features_seed.csv` file to drive data preprocessing dynamically, so the pipeline can adapt to the business's need for fast and frequent model feature updates

In [4]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import warnings
# Ignore pandas PerformanceWarning so it does not clutter the cell output.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

def load_feature_seeds(file_path):
    """Load the feature seed table and lower-case its column-name fields.

    Parameters
    ----------
    file_path : str or path-like
        Location of the feature seed CSV. It must contain the columns
        ``col_name`` and ``normalization_date_column``.

    Returns
    -------
    pandas.DataFrame
        The seed table with every string entry of ``col_name`` and
        ``normalization_date_column`` lower-cased. Missing entries stay
        missing.

    Raises
    ------
    KeyError
        If either of the two columns is absent from the CSV.
    """
    def _lower_if_str(value):
        # Leave NaN (and any other non-string value) untouched.
        return value.lower() if isinstance(value, str) else value

    feature_seeds = pd.read_csv(file_path, index_col=None)
    # A column with no values at all is parsed as float64, on which the
    # ``.str`` accessor raises AttributeError. Mapping element-wise works for
    # any dtype, so a seed file without normalization columns still loads.
    for column in ('col_name', 'normalization_date_column'):
        feature_seeds[column] = feature_seeds[column].map(_lower_if_str)
    return feature_seeds

def clean_numeric_columns(df, feature_seeds):
    """Convert text-typed numeric feature columns of ``df`` to float.

    For every seed row whose ``data_type`` is ``'float'``, the matching column
    of ``df`` is cleaned if (and only if) it has ``object`` dtype: the
    characters ``,``, ``%`` and ``$`` are removed from string entries and the
    column is cast to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; modified in place.
    feature_seeds : pandas.DataFrame
        Seed table with at least the columns ``col_name`` and ``data_type``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.

    Notes
    -----
    * Non-string entries inside an object column (for example floats that
      pandas left next to strings) are kept as they are. Running them through
      the ``.str`` accessor would silently turn them into NaN.
    * If the cast to float fails, the column keeps its cleaned string values
      and a message is printed, so the failure is visible instead of silent.
    * A seed column that is missing from ``df`` is reported and skipped.
    """
    def _strip_symbols(value):
        # Only strings carry formatting characters; pass everything else through.
        if isinstance(value, str):
            return value.replace(',', '').replace('%', '').replace('$', '')
        return value

    is_float_feature = feature_seeds['data_type'] == 'float'
    numeric_columns = feature_seeds.loc[is_float_feature, 'col_name'].tolist()
    for col in numeric_columns:
        if col not in df.columns:
            print(f'Column {col} not found in the training data')
            continue
        if df[col].dtype != 'object':
            # Already numeric (or another non-text dtype): nothing to clean.
            continue
        cleaned = df[col].map(_strip_symbols)
        try:
            df[col] = cleaned.astype(float)
        except (ValueError, TypeError) as exc:
            print(f'Column {col} could not be converted to float: {exc}')
            df[col] = cleaned
    return df

def apply_fill_method(df, feature_seeds):
    """Fill missing values column by column as instructed by the seed table.

    Supported values of the seed column ``fill_method``:

    * ``'constant'`` - missing values in ``col_name`` become 0.
    * ``'previous'`` - missing values in ``col_name`` are taken from the
      column whose name is ``col_name`` with ``'yr1'`` replaced by ``'yr0'``.

    Rows with any other fill method are skipped. Columns that cannot be found
    in ``df`` are reported with a printed warning.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; modified in place.
    feature_seeds : pandas.DataFrame
        Seed table with the columns ``col_name`` and ``fill_method``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    print("Applying fill methods...")
    for _, seed in feature_seeds.iterrows():
        target = seed['col_name']
        method = seed['fill_method']

        if method == 'constant':
            if target not in df.columns:
                print(f"Warning: Column {target} not found for fill method 'constant'")
                continue
            print(f"Filling missing values in {target} with 0")
            df[target] = df[target].fillna(0)
            continue

        if method == 'previous':
            source = target.replace('yr1', 'yr0')
            if target not in df.columns or source not in df.columns:
                print(f"Warning: Column {target} or {source} not found for fill method 'previous'")
                continue
            print(f"Filling missing values in {target} with values from {source}")
            df[target] = df[target].fillna(df[source])
        # Add more fill methods as needed
    return df

def date_string_to_day(x, level):
    """Turn a date-like value into a day count for the given level.

    Parameters
    ----------
    x : date-like
        Anything ``pd.to_datetime`` accepts.
    level : str
        ``'month'`` gives the day of the month, ``'year'`` gives the day of
        the year.

    Returns
    -------
    The requested day number, or the parsed datetime itself when ``level`` is
    neither ``'month'`` nor ``'year'``.
    """
    parsed = pd.to_datetime(x)
    if level == 'month':
        return parsed.day
    if level == 'year':
        return parsed.dayofyear
    # Unrecognised level: hand back the parsed value unchanged.
    return parsed

def apply_normalization(df, feature_seeds):
    """Divide feature columns by a day count derived from a date column.

    For every seed row that names a ``normalization_date_column``, the feature
    ``col_name`` is divided by the value ``date_string_to_day`` returns for
    each entry of that date column at the row's ``normalization_level``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; the normalised feature columns are overwritten in place.
    feature_seeds : pandas.DataFrame
        Seed table with the columns ``col_name``,
        ``normalization_date_column`` and ``normalization_level``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    * The day counts are held in a local Series rather than in a scratch
      ``df['day']`` column, so a genuine ``day`` column in the input is
      neither overwritten nor dropped.
    * Any error while normalising one feature is printed and that feature is
      left unchanged; the remaining features are still processed.
    * Features or date columns missing from ``df`` are reported and skipped.
    """
    print("Applying normalization...")
    for _, row in feature_seeds.iterrows():
        col_name = row['col_name']
        date_col = row['normalization_date_column']
        level = row['normalization_level']
        if pd.isna(date_col):
            # No normalisation requested for this feature.
            continue
        if col_name not in df.columns or date_col not in df.columns:
            print(f"Warning: Column {col_name} or {date_col} not found for normalization")
            continue
        print(f"Normalizing {col_name} based on {date_col} at {level} level")
        try:
            # Shares df's index, so the division below is row-aligned.
            day_count = df[date_col].apply(lambda value: date_string_to_day(value, level))
            df[col_name] = df[col_name] / day_count
        except Exception as e:
            print(f"Error normalizing {col_name} based on {date_col}: {e}")
    return df

def apply_descriptive_features(df, feature_seeds):
    """Create derived feature columns from expressions stored in the seed table.

    For every seed row with a non-null ``descriptive_feature_syntax``, the
    expression is evaluated with ``eval`` and the result is stored in
    ``df[col_name]``. A failure in one expression is printed and does not stop
    the remaining rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; new or overwritten columns are written in place.
    feature_seeds : pandas.DataFrame
        Seed table with the columns ``col_name`` and
        ``descriptive_feature_syntax``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    NOTE(review): ``eval`` executes whatever Python expression the seed file
    contains, with access to this function's local names (``df``, ``row``,
    ...) and the module globals. Only use this with seed files from a trusted
    source.
    """
    print("Applying descriptive features...")
    for _, row in feature_seeds.iterrows():
        descriptive_syntax = row['descriptive_feature_syntax']
        if pd.notna(descriptive_syntax):
            col_name = row['col_name']
            # Plain substring replacement of every df column name by its
            # lower-case form. This is a no-op for names that are already
            # lower-case, and it does not respect identifier boundaries.
            for col in df.columns:
                descriptive_syntax = descriptive_syntax.replace(col, col.lower())
            try:
                print(f"Generating descriptive feature {col_name} using syntax: {descriptive_syntax}")
                # The expression is resolved against the local scope, so the
                # local variable names here are part of the contract with the
                # seed file - presumably expressions refer to `df`; TODO confirm.
                df[col_name] = eval(descriptive_syntax)
            except Exception as e:
                print(f"Error generating descriptive feature {col_name}: {e}")
    return df

# How it should be used

def preprocessing(df, feature_seeds_path):
    """Run the seed-driven preprocessing pipeline on ``df``.

    Steps, in order: load the feature seed table, lower-case the column labels
    of ``df`` (in place), clean numeric columns, apply fill methods, apply
    normalization.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training data; its column labels are modified in place.
    feature_seeds_path : str or path-like
        Location of the feature seed CSV.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame.
    """
    seeds = load_feature_seeds(feature_seeds_path)
    df.columns = df.columns.str.lower()
    # apply_descriptive_features(df, feature_seeds) is currently disabled.
    pipeline = (clean_numeric_columns, apply_fill_method, apply_normalization)
    for step in pipeline:
        df = step(df, seeds)
    return df




# Inputs: the feature seed table that drives preprocessing, and the raw training data.
feature_seeds_path = 'features_seed.csv'
input_file_path = 'train.csv'


# The first CSV column is used as the DataFrame index.
# NOTE(review): the recorded cell output echoes this read_csv line, which
# suggests pandas emitted a warning here (possibly a mixed-type DtypeWarning)
# - TODO confirm and, if so, pass explicit dtypes.
data = pd.read_csv(input_file_path, index_col= 0)
# preprocessing() lower-cases the column labels of `data` in place, so `data`
# itself is modified by this call.
processed_data = preprocessing(data, feature_seeds_path)



  data = pd.read_csv(input_file_path, index_col= 0)


Applying fill methods...
Filling missing values in quote with 0
Filling missing values in consumer_credit_judgements_guar1 with 0
Filling missing values in consumer_credit_insolvency_notices_guar1 with 0
Filling missing values in consumer_credit_credit_defaults_guar1 with 0
Filling missing values in consumer_credit_company_affiliations_guar1 with 0
Filling missing values in consumer_credit_file_activity_guar1 with 0
Filling missing values in consumer_credit_score_guar1 with 0
Filling missing values in consumer_credit_risk_odds_guar1 with 0
Filling missing values in consumer_credit_judgements_guar2 with 0
Filling missing values in consumer_credit_insolvency_notices_guar2 with 0
Filling missing values in consumer_credit_credit_defaults_guar2 with 0
Filling missing values in consumer_credit_company_affiliations_guar2 with 0
Filling missing values in consumer_credit_file_activity_guar2 with 0
Filling missing values in consumer_credit_score_guar2 with 0
Filling missing values in consumer_cr

## Training steps

In [5]:
# Preview the first rows of the preprocessed training data.
processed_data.head()

Unnamed: 0,quote,consumer_credit_judgements_guar1,consumer_credit_insolvency_notices_guar1,consumer_credit_credit_defaults_guar1,consumer_credit_company_affiliations_guar1,consumer_credit_file_activity_guar1,consumer_credit_score_guar1,consumer_credit_risk_odds_guar1,consumer_credit_judgements_guar2,consumer_credit_insolvency_notices_guar2,...,total_non_current_assets,total_assets,accounts_payable,total_current_liabilities,total_non_current_liabilities,total_liabilities,net_assets,current_year_earnings,retained_earnings,total_equity
0,FT13365,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,617160.18,77774.38,215729.26,170786.39,386515.65,230644.53,230644.53,0.0,230644.53
1,FT13364,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,617160.18,77774.38,215729.26,170786.39,386515.65,230644.53,230644.53,0.0,230644.53
2,FT13363,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,617160.18,77774.38,215729.26,170786.39,386515.65,230644.53,230644.53,0.0,230644.53
3,FT13362,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,617160.18,77774.38,215729.26,170786.39,386515.65,230644.53,230644.53,0.0,230644.53
4,FT13361,0.0,0.0,0.0,2.0,8.0,621,0.13,0.0,0.0,...,0.0,2295.84,278.71,37348.93,-21257.08,16091.85,-13796.01,6210.66,-20006.67,-13796.01


1. Edit the feature seed CSV file to indicate which columns should be used for training.
2. Select 10 features for now, for testing.
3. Run all training steps on those 10 features.
4. Increase the number of features to 11 and check that everything still runs.