# LB 0.6570781

In [1]:
!pip install -r requirements.txt
import pandas as pd
import numpy as np
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook as tqdm

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

Collecting polars (from -r requirements.txt (line 2))
  Obtaining dependency information for polars from https://files.pythonhosted.org/packages/f6/c7/412912cc735bec03de751e506c3380ae393032f2e786e2a93d160acbf1dd/polars-0.20.6-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached polars-0.20.6-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting mlflow (from -r requirements.txt (line 4))
  Obtaining dependency information for mlflow from https://files.pythonhosted.org/packages/b6/ae/06a299c661262aad52997e07d0e5fae5b5682401afc061fdb7c8f2103780/mlflow-2.10.0-py3-none-any.whl.metadata
  Using cached mlflow-2.10.0-py3-none-any.whl.metadata (13 kB)
Collecting kaleido>=0.2.1 (from pycaret->-r requirements.txt (line 3))
  Using cached kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
Collecting xxhash (from pycaret->-r requirements.txt (line 3))
  Obtaining dependency information for xxhash from https://files.pythonhosted.org/

In [2]:
df = pl.read_csv('train.csv')
test = pl.read_csv('test.csv')
new_index = range(42307, 42307 + len(test))
test.index = new_index

In [3]:
df = df.with_columns(pl.lit(0).alias('Adv'))
test = test.with_columns(pl.lit(1).alias('Adv'),
                         pl.lit(1).alias('MIS_Status'))

test = test.select(df.columns)
test = test.with_columns(test['MIS_Status'].cast(pl.Int64))
df_test = pl.concat([df, test])

In [4]:
def Feature_Engineering(df):
    code_dict = {32: 31, 33: 31, 45: 44, 49: 48}
    df = df.with_columns(pl.col("Sector").replace(code_dict))

    revline_dict = {'0': None, 'T': None}
    lowdoc_dict = {'C': None, '0': None, 'S': None, 'A': None}
    df = df.with_columns([
        pl.col("RevLineCr").replace(revline_dict),
        pl.col("LowDoc").replace(lowdoc_dict)
    ])

    df = df.with_columns([
        pl.col('DisbursementDate').str.strptime(pl.Date, '%d-%b-%y').alias('DisbursementDate'),
        pl.col('DisbursementDate').str.strptime(pl.Date, '%d-%b-%y').dt.year().alias('DisbursementYear')
    ])

    cols = ["DisbursementGross", "GrAppv", "SBA_Appv"]
    for col in cols:
        df = df.with_columns(
            pl.col(col).str.replace_all('[$,]', '').str.strip().cast(pl.Float64).alias(col)
        )

    df = df.with_columns([
        (pl.col("ApprovalFY") - pl.col("DisbursementYear")).alias("FY_Diff"),
        (pl.col("State") == pl.col("BankState")).cast(pl.UInt8).alias("State_is_BankState"),
        (pl.col('SBA_Appv') / pl.col('GrAppv')).alias('SBA_Portion'),
        (pl.col("DisbursementGross") / pl.col("GrAppv")).alias("DisbursementGrossRatio"),
        (pl.col("GrAppv") / pl.when(pl.col("Term") == 0).then(1).otherwise(pl.col("Term"))).alias("MonthlyRepayment"),pl.col("Term")
    ])
    
    null_count = sum([pl.col(col).is_null().cast(pl.Int32) for col in df.columns])
    df = df.with_columns(null_count.alias("NullCount"))
    
    return df

def fit_transform(df, object_columns):
    mappings = {}
    counts = {}
    for col in object_columns:
        encoder = LabelEncoder()

        new_col_name_label = f'{col}_LabelEncoded'
        df[new_col_name_label] = encoder.fit_transform(df[col])
        mappings[col] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))


        new_col_name_count = f'{col}_CountEncoded'
        df[new_col_name_count] = df[col].map(df[col].value_counts())
        counts[col] = df[col].value_counts().to_dict()

        df.drop(col, axis=1, inplace=True)

    return df, mappings, counts

def transform(df, mappings, counts):
    for col, mapping in mappings.items():
        new_col_name_label = f'{col}_LabelEncoded'
        new_col_name_count = f'{col}_CountEncoded'

        df[new_col_name_label] = df[col].map(mapping).fillna(-1)

        df[new_col_name_count] = df[col].map(counts[col]).fillna(0)
        df.drop(col, axis=1, inplace=True)
    return df

In [5]:
def Preprocessing(df):
    df = Feature_Engineering(df)
    df = df.to_pandas()
    
    object_columns = df.select_dtypes(include=['object']).columns
    df_encoded, mappings,counts = fit_transform(df, object_columns)
    
    df_encoded.drop('DisbursementDate',axis=1,inplace=True)
    return df_encoded

In [6]:
df_test = Preprocessing(df_test)
df = df_test[df_test['Adv']==0]
test = df_test[df_test['Adv']==1]
X = df.drop(['MIS_Status','Adv'],axis=1)
y = df['MIS_Status']
test.drop(['MIS_Status','Adv'],axis=1,inplace=True)

In [7]:
from sklearn.utils import class_weight
weights = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights = dict(zip(np.unique(y), weights))

In [8]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import statistics
from scipy.stats import mode

average_f1 = []
average_roc_auc_score = []
predictions = []
skf = StratifiedKFold(n_splits=5)
for train_idx, val_idx in skf.split(X,y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    params = {
        'objective': 'binary',
        'metric': 'f1',
        'class_weight': class_weights,
        "n_estimators": 3000,
        "learning_rate": 0.01,
        "colsample_bytree": 0.8,
        "subsample_freq": 1,
        "subsample": 0.8,
        "random_seed": 0,
        'verbose': -1,
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    y_pred = model.predict(X_val)
    mean_f1 = f1_score(y_val, y_pred, average='macro')
    roc_auc = roc_auc_score(y_val, y_pred)
    average_f1.append(mean_f1)
    average_roc_auc_score.append(roc_auc)
    feature_gains = model.booster_.feature_importance(importance_type='gain')
    feature = pd.DataFrame({'feature': X.columns, 'importance': feature_gains})
    feature.sort_values(by='importance',ascending=False)
    print(feature)
    print(f"Mean F1 Score (Macro F1 Score) : {mean_f1}")
    print(f"ROC_AUC_Score : {roc_auc}")
    
    
average_f1 = statistics.mean(average_f1)
average_roc_auc_score = statistics.mean(average_roc_auc_score)
print(' ')
print(f'Average_f1marco : {average_f1}')
print(f'Average_roc_auc : {average_roc_auc_score}')

                      feature     importance
0                        Term   77026.438816
1                       NoEmp   93465.986744
2                    NewExist    7401.951601
3                   CreateJob   34271.321465
4                 RetainedJob   37634.623759
5               FranchiseCode   21985.213455
6                      Sector   35252.883104
7                  ApprovalFY   38392.463160
8           DisbursementGross   47000.293903
9                      GrAppv   31176.493330
10                   SBA_Appv   47178.552603
11                 UrbanRural  128765.680707
12           DisbursementYear   50156.283157
13                    FY_Diff   52151.820262
14         State_is_BankState    5819.080092
15                SBA_Portion   19951.473368
16     DisbursementGrossRatio   23900.880330
17           MonthlyRepayment   63228.507216
18                  NullCount   15924.134423
19     RevLineCr_LabelEncoded   12949.658800
20     RevLineCr_CountEncoded    8973.571033
21        

In [9]:
params = {
        'objective': 'binary',
        'metric': 'f1',
        'class_weight': class_weights,
        "n_estimators": 3000,
        "learning_rate": 0.01,
        "colsample_bytree": 0.8,
        "subsample_freq": 1,
        "subsample": 0.8,
        "random_seed": 0,
        'verbose': -1,
}

final_model = lgb.LGBMClassifier(**params)
final_model.fit(X_train, y_train)
pred = final_model.predict(test)

In [10]:
new_index = range(42307, 42307 + len(test))
test.index = new_index
predictions = pd.DataFrame({'Id': test.index, 'predict': pred})
predictions.to_csv('predictions_file.csv',index=False,header=False)