In [None]:
import joblib
import pandas as pd
import numpy as np
import os

In [None]:
# Load the batch of synthetic records that will be scored in this notebook.
input_csv = '../Data/datos_sinteticos_inferencia2.csv'
data = pd.read_csv(input_csv)

In [None]:
# Columns in which the value -1 is a sentinel for "missing".
var_nan = [
    'prev_address_months_count',
    'current_address_months_count',
    'intended_balcon_amount',
    'bank_months_count',
    'session_length_in_minutes',
    'device_distinct_emails_8w',
]

# Feature engineering, expressed as a single chain. `assign` evaluates its
# keyword arguments in order, so later features can read columns produced by
# earlier ones (e.g. the velocity ratio uses the already-patched velocity_6h),
# and new columns are appended in the order listed here.
data = (
    data
    # Turn the -1 sentinels into NaN and make the columns float.
    .assign(**{col: data[col].replace(-1, np.nan).astype('float') for col in var_nan})
    .assign(
        # 1 when a positive previous-address duration is present; NaN yields 0.
        prev_address_valid=lambda df: (df['prev_address_months_count'] > 0).astype(int),
        # Non-positive velocities are replaced by the column's 25th percentile,
        # computed on this batch before the replacement.
        velocity_6h=lambda df: np.where(
            df['velocity_6h'] <= 0,
            df['velocity_6h'].quantile(0.25),
            df['velocity_6h'],
        ),
        ratio_velocity_6h_24h=lambda df: df['velocity_6h'] / df['velocity_24h'],
        ratio_velocity_24h_4w=lambda df: df['velocity_24h'] / df['velocity_4w'],
        log_bank_branch_count_8w=lambda df: np.log1p(df['bank_branch_count_8w']),
        log_days_since_request=lambda df: np.log1p(df['days_since_request']),
        # 0 only when bank_months_count <= 0. Because the comparison is False
        # for NaN, rows whose value was the -1 sentinel end up as 1.
        # NOTE(review): confirm this matches the feature used at training time.
        prev_bank_months_count=lambda df: (~(df['bank_months_count'] <= 0)).astype(int),
        income_risk_score=lambda df: df['income'] * df['credit_risk_score'],
        rel_income_credit=lambda df: df['income'] / df['proposed_credit_limit'],
        age_at_account_opening=lambda df: df['customer_age'] - (df['bank_months_count'] / 12),
        credit_per_income=lambda df: df['proposed_credit_limit'] / df['income'],
        # The +1 keeps the denominator non-zero when the branch count is 0.
        zip_branch_ratio=lambda df: df['zip_count_4w'] / (df['bank_branch_count_8w'] + 1),
        is_young_high_credit=lambda df: (
            (df['customer_age'] < 30) & (df['proposed_credit_limit'] > 1700)
        ).astype(int),
        is_high_risk_low_income=lambda df: (
            (df['credit_risk_score'] > 200) & (df['income'] < 0.3)
        ).astype(int),
    )
    # Remove columns that are not passed on to the encoder/model.
    .drop(columns=['device_fraud_count', 'month', 'prev_address_months_count'])
)

In [None]:
# Load the persisted categorical encoder and the tuned classifier (in that order).
# NOTE: joblib artifacts are pickle-based; only load files from a trusted source.
encoder, tuned_model = (
    joblib.load(artifact_path)
    for artifact_path in ('../Data/encoder.joblib', '../Data/tuned_model.joblib')
)

In [None]:
# Encode the object-dtype (categorical) columns with the pre-fitted encoder and
# rebuild the model input: remaining columns first, encoded columns after.
cat_features = data.select_dtypes(include=['object']).columns.tolist()

encoder_features_test = encoder.transform(data[cat_features])
# Reuse data's index for the encoded frame. pd.concat(axis=1) aligns on index
# labels, so a fresh RangeIndex here would misalign rows (and introduce NaN
# rows) whenever `data` does not carry a default 0..n-1 index.
# NOTE(review): assumes encoder.transform returns a dense 2-D array — confirm.
data_encode = pd.DataFrame(
    encoder_features_test,
    columns=encoder.get_feature_names_out(cat_features),
    index=data.index,
)
data_f = pd.concat([data.drop(columns=cat_features), data_encode], axis=1)

In [None]:
# Run inference and bucket every record by its predicted probability
# (column 1 of predict_proba).
#
# This cell appends 'proba', 'category' and 'date' to data_f. Those output
# columns are stripped before predicting so the cell can be re-run without
# feeding its own previous outputs to the model as extra features.
# NOTE(review): assumes none of these three names is a genuine model feature.
output_cols = ['proba', 'category', 'date']
model_input = data_f.drop(columns=output_cols, errors='ignore')
y_pred_proba = tuned_model.predict_proba(model_input)[:, 1]

# Decision thresholds: below t_low_opt -> "NO_FRAUDE",
# [t_low_opt, t_high_opt) -> "REVISIÓN", at or above t_high_opt -> "FRAUDE".
t_low_opt = 0.039230769230769236
t_high_opt = 0.0576271186440678

data_f['proba'] = y_pred_proba
# np.select picks the first matching condition, so the high threshold is
# tested before the low one; anything matching neither gets the default.
data_f['category'] = np.select(
    [y_pred_proba >= t_high_opt, y_pred_proba >= t_low_opt],
    ["FRAUDE", "REVISIÓN"],
    default="NO_FRAUDE",
)

# Timestamp for this batch, truncated to whole seconds.
# NOTE(review): whether 'now' is parsed as UTC or local time has differed
# between pandas versions — confirm which one is intended.
current_time = pd.to_datetime('now').floor('s')

# One hour is added only to simulate that this data does not come from the
# same instant; every row in this batch still shares the same shifted value.
# The value is stored as a formatted string.
data_f['date'] = (current_time + pd.to_timedelta(1, unit='h')).strftime('%Y-%m-%d %H:%M:%S')

# Compact result table: per-row probability, category and date, plus the
# thresholds used (broadcast to every row).
results_df = pd.DataFrame({
    'proba': data_f['proba'],
    'category': data_f['category'],
    't_high_opt': t_high_opt,
    't_low_opt': t_low_opt,
    'date': data_f['date']
})

In [None]:
# Persist this batch's predictions; the row index is not written to the file.
results_csv = '../Data/inference_results2.csv'
results_df.to_csv(results_csv, index=False)

In [None]:
# Read the earlier results file and the one written by this notebook, stack
# them row-wise, and save the combined table without the row index.
data_r, data_r2 = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/inference_results.csv', '../Data/inference_results2.csv')
)

data_rf = pd.concat([data_r, data_r2])  # axis=0 (row-wise) is the default
data_rf.to_csv('../Data/inference_results_full.csv', index=False)