# Analísis de los datos

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb
import joblib

In [2]:
df = pd.read_csv('../data/transactions.csv')
df.head()

Unnamed: 0,id,trans_date_trans_time,cc_num,merchant,category,trans_amount,first_name,last_name,gender,street,...,latitude,longitude,city_population,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,733500,2019-11-09 21:14:17,30029052116970,"fraud_Little, Gutmann and Lynch",shopping_net,2.03,Curtis,Young,M,4319 Watson Shoals Suite 658,...,42.1239,-79.1895,3833,Metallurgist,1970-10-09,d5ef70cb0da402c0d32f61f0a8f5b45b,1352495657,42.962553,-78.427227,False
1,733501,2019-11-09 21:14:49,3596217206093829,fraud_Rippin-VonRueden,health_fitness,7.06,Sara,Ramirez,F,23843 Scott Island,...,40.8626,-91.9534,888,Camera operator,1988-03-25,c5056783046b08a075de3a0493c9e162,1352495689,41.78898,-91.348646,False
2,733502,2019-11-09 21:15:01,3526826139003047,fraud_Kuhn Group,food_dining,5.07,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,2dd84fd4dd5aab4cf61e5c644b8e50dc,1352495701,43.679707,-84.268857,False
3,733503,2019-11-09 21:15:30,38052002992326,fraud_Roberts-Beahan,misc_pos,9.96,Anna,Logan,F,3522 Park Wells Suite 528,...,30.33,-95.0202,34153,Building surveyor,1995-09-11,8530be4f5b1a75780c481e57553d5a69,1352495730,31.026641,-95.316548,False
4,733504,2019-11-09 21:15:54,374930071163758,fraud_Abbott-Rogahn,entertainment,14.66,Daniel,Escobar,M,61390 Hayes Port,...,42.2203,-83.3583,31515,Police officer,1971-11-05,301f23b6ee743ebaabe57446cc5da2eb,1352495754,41.981726,-83.939642,False


## EDA

In [3]:
sel_cols = ['trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'trans_amount', 'first_name', 'last_name', 'gender', 'street', 'city',
       'state_code', 'zip', 'latitude', 'longitude', 'city_population', 'job',
       'dob', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud']
df_f = df[sel_cols]
df_f.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,trans_amount,first_name,last_name,gender,street,city,...,zip,latitude,longitude,city_population,job,dob,unix_time,merch_lat,merch_long,is_fraud
0,2019-11-09 21:14:17,30029052116970,"fraud_Little, Gutmann and Lynch",shopping_net,2.03,Curtis,Young,M,4319 Watson Shoals Suite 658,Falconer,...,14733,42.1239,-79.1895,3833,Metallurgist,1970-10-09,1352495657,42.962553,-78.427227,False
1,2019-11-09 21:14:49,3596217206093829,fraud_Rippin-VonRueden,health_fitness,7.06,Sara,Ramirez,F,23843 Scott Island,Birmingham,...,52535,40.8626,-91.9534,888,Camera operator,1988-03-25,1352495689,41.78898,-91.348646,False
2,2019-11-09 21:15:01,3526826139003047,fraud_Kuhn Group,food_dining,5.07,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,...,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,1352495701,43.679707,-84.268857,False
3,2019-11-09 21:15:30,38052002992326,fraud_Roberts-Beahan,misc_pos,9.96,Anna,Logan,F,3522 Park Wells Suite 528,Cleveland,...,77327,30.33,-95.0202,34153,Building surveyor,1995-09-11,1352495730,31.026641,-95.316548,False
4,2019-11-09 21:15:54,374930071163758,fraud_Abbott-Rogahn,entertainment,14.66,Daniel,Escobar,M,61390 Hayes Port,Romulus,...,48174,42.2203,-83.3583,31515,Police officer,1971-11-05,1352495754,41.981726,-83.939642,False


In [4]:
df_f.nunique()

trans_date_trans_time    908363
cc_num                      960
merchant                    693
category                     14
trans_amount              46254
first_name                  346
last_name                   476
gender                        2
street                      960
city                        876
state_code                   51
zip                         947
latitude                    945
longitude                   946
city_population             862
job                         492
dob                         945
unix_time                908395
merch_lat                899809
merch_long               914228
is_fraud                      2
dtype: int64

In [5]:
df_f.describe(percentiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,0.95])

Unnamed: 0,cc_num,trans_amount,zip,latitude,longitude,city_population,unix_time,merch_lat,merch_long
count,924850.0,924850.0,924850.0,924850.0,924850.0,924850.0,924850.0,924850.0,924850.0
mean,4.168053e+17,70.265398,48799.210043,38.53352,-90.227999,89144.8,1342872000.0,38.533309,-90.228075
std,1.308428e+18,161.713382,26906.331474,5.076327,13.768328,302552.7,9077796.0,5.110696,13.780371
min,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.027785,-166.671242
10%,4464457000000.0,4.11,13304.0,31.6591,-111.1439,260.0,1330093000.0,31.627251,-111.265446
20%,30427040000000.0,7.73,21102.0,33.9056,-98.7858,568.0,1333729000.0,33.816143,-98.856109
30%,213175500000000.0,15.66,29939.0,35.9335,-95.2739,964.0,1337267000.0,35.814856,-95.180454
40%,377654600000000.0,32.0,38761.0,38.2352,-91.0286,1631.0,1340211000.0,38.055495,-91.093436
50%,3520550000000000.0,47.4,48174.0,39.3543,-87.4769,2456.0,1342994000.0,39.363742,-87.435281
60%,3575789000000000.0,60.84,57374.0,40.4931,-84.1481,4726.0,1345875000.0,40.402812,-84.099471


In [6]:
df_f.describe(include = ['O'])

Unnamed: 0,trans_date_trans_time,merchant,category,first_name,last_name,gender,street,city,state_code,job,dob
count,924850,924850,924850,924850,924850,924850,924850,924850,924850,924850,924850
unique,908363,693,14,346,476,2,960,876,51,492,945
top,2019-04-22 16:02:01,fraud_Kilback LLC,gas_transport,Christopher,Smith,F,854 Walker Dale Suite 488,Birmingham,TX,Film/video editor,1977-03-23
freq,4,3121,93859,19052,20589,506117,2260,3993,67676,7021,4018


In [7]:
df_f[((df_f['merch_lat'] > 71) | (df_f['merch_lat'] < 19)) | 
     ((df_f['merch_long'] > -66) | (df_f['merch_long'] < -168))]

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,trans_amount,first_name,last_name,gender,street,city,...,zip,latitude,longitude,city_population,job,dob,unix_time,merch_lat,merch_long,is_fraud


In [8]:
df_f[((df_f['latitude'] > 71) | (df_f['latitude'] < 20)) & 
     ((df_f['longitude'] > -66) | (df_f['longitude'] < -168))]

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,trans_amount,first_name,last_name,gender,street,city,...,zip,latitude,longitude,city_population,job,dob,unix_time,merch_lat,merch_long,is_fraud


In [9]:
df_f[(df_f['latitude'].sub(df_f['merch_lat']).abs() > 2) | (df_f['longitude'].sub(df_f['merch_long']).abs() > 2)]

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,trans_amount,first_name,last_name,gender,street,city,...,zip,latitude,longitude,city_population,job,dob,unix_time,merch_lat,merch_long,is_fraud


In [10]:
df_f.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'trans_amount', 'first_name', 'last_name', 'gender', 'street', 'city',
       'state_code', 'zip', 'latitude', 'longitude', 'city_population', 'job',
       'dob', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

## Limpieza

### Datetime data
- Se procede a convertir tipos de datos en fechas
- Se crean columnas númericas a partir de datos tipo fecha

In [11]:
df_f["dob"] = pd.to_datetime(df_f["dob"], errors="coerce")
print(df_f.dob.dtype)

datetime64[ns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_f["dob"] = pd.to_datetime(df_f["dob"], errors="coerce")


In [12]:
df_f["birth_year"] = df_f.loc[:,"dob"].dt.year
df_f["birth_month"] = df_f.loc[:,"dob"].dt.month

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_f["birth_year"] = df_f.loc[:,"dob"].dt.year


In [13]:
df_f.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,trans_amount,first_name,last_name,gender,street,city,...,longitude,city_population,job,dob,unix_time,merch_lat,merch_long,is_fraud,birth_year,birth_month
0,2019-11-09 21:14:17,30029052116970,"fraud_Little, Gutmann and Lynch",shopping_net,2.03,Curtis,Young,M,4319 Watson Shoals Suite 658,Falconer,...,-79.1895,3833,Metallurgist,1970-10-09,1352495657,42.962553,-78.427227,False,1970,10
1,2019-11-09 21:14:49,3596217206093829,fraud_Rippin-VonRueden,health_fitness,7.06,Sara,Ramirez,F,23843 Scott Island,Birmingham,...,-91.9534,888,Camera operator,1988-03-25,1352495689,41.78898,-91.348646,False,1988,3
2,2019-11-09 21:15:01,3526826139003047,fraud_Kuhn Group,food_dining,5.07,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,...,-85.017,1126,Furniture designer,1955-07-06,1352495701,43.679707,-84.268857,False,1955,7
3,2019-11-09 21:15:30,38052002992326,fraud_Roberts-Beahan,misc_pos,9.96,Anna,Logan,F,3522 Park Wells Suite 528,Cleveland,...,-95.0202,34153,Building surveyor,1995-09-11,1352495730,31.026641,-95.316548,False,1995,9
4,2019-11-09 21:15:54,374930071163758,fraud_Abbott-Rogahn,entertainment,14.66,Daniel,Escobar,M,61390 Hayes Port,Romulus,...,-83.3583,31515,Police officer,1971-11-05,1352495754,41.981726,-83.939642,False,1971,11


### Credit cards
Se asocian los números de tarjetas de crédito con algunos tipos de tarjetas de crédito

*reducir la granularidad de los datos*

In [14]:
def get_card_type(card_number):
    card_number = str(card_number)  # Convertir a cadena
    if card_number.startswith('4'):
        return 'Visa'
    elif card_number[:2] in ('51', '52', '53', '54', '55') or 2221 <= int(card_number[:4]) <= 2720:
        return 'Mastercard'
    elif card_number[:2] in ('34', '37'):
        return 'American Express'
    elif card_number.startswith('6011') or card_number[:3] in [str(x) for x in range(644, 650)] or card_number.startswith('65') or 622126 <= int(card_number[:6]) <= 622925:
        return 'Discover'
    elif 3528 <= int(card_number[:4]) <= 3589:
        return 'JCB'
    elif card_number[:2] in ('36', '38') or card_number[:3] in ('300', '301', '302', '303', '304', '305'):
        return 'Diners Club'
    else:
        return 'Unknown'

# Aplicar la función a la columna
df_f['cc_num'] = df_f['cc_num'].apply(get_card_type)

In [15]:
df_f.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,trans_amount,first_name,last_name,gender,street,city,...,longitude,city_population,job,dob,unix_time,merch_lat,merch_long,is_fraud,birth_year,birth_month
0,2019-11-09 21:14:17,Diners Club,"fraud_Little, Gutmann and Lynch",shopping_net,2.03,Curtis,Young,M,4319 Watson Shoals Suite 658,Falconer,...,-79.1895,3833,Metallurgist,1970-10-09,1352495657,42.962553,-78.427227,False,1970,10
1,2019-11-09 21:14:49,Unknown,fraud_Rippin-VonRueden,health_fitness,7.06,Sara,Ramirez,F,23843 Scott Island,Birmingham,...,-91.9534,888,Camera operator,1988-03-25,1352495689,41.78898,-91.348646,False,1988,3
2,2019-11-09 21:15:01,Unknown,fraud_Kuhn Group,food_dining,5.07,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,...,-85.017,1126,Furniture designer,1955-07-06,1352495701,43.679707,-84.268857,False,1955,7
3,2019-11-09 21:15:30,Diners Club,fraud_Roberts-Beahan,misc_pos,9.96,Anna,Logan,F,3522 Park Wells Suite 528,Cleveland,...,-95.0202,34153,Building surveyor,1995-09-11,1352495730,31.026641,-95.316548,False,1995,9
4,2019-11-09 21:15:54,American Express,fraud_Abbott-Rogahn,entertainment,14.66,Daniel,Escobar,M,61390 Hayes Port,Romulus,...,-83.3583,31515,Police officer,1971-11-05,1352495754,41.981726,-83.939642,False,1971,11


### Datetime data

In [16]:
df_f["trans_date_trans_time"] = pd.to_datetime(df_f["trans_date_trans_time"], errors="coerce")
print(df_f["trans_date_trans_time"].dtype)  # Debería ser datetime64[ns]

datetime64[ns]


In [17]:
df_f["trans_year"] = df_f.loc[:,"trans_date_trans_time"].dt.year
df_f["trans_month"] = df_f.loc[:,"trans_date_trans_time"].dt.month
df_f["trans_day"] = df_f.loc[:,"trans_date_trans_time"].dt.day
df_f["trans_dayofweek"] = df_f.loc[:,"trans_date_trans_time"].dt.dayofweek
df_f["trans_hour"] = df_f.loc[:,"trans_date_trans_time"].dt.hour

In [18]:
df_f.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,trans_amount,first_name,last_name,gender,street,city,...,merch_lat,merch_long,is_fraud,birth_year,birth_month,trans_year,trans_month,trans_day,trans_dayofweek,trans_hour
0,2019-11-09 21:14:17,Diners Club,"fraud_Little, Gutmann and Lynch",shopping_net,2.03,Curtis,Young,M,4319 Watson Shoals Suite 658,Falconer,...,42.962553,-78.427227,False,1970,10,2019,11,9,5,21
1,2019-11-09 21:14:49,Unknown,fraud_Rippin-VonRueden,health_fitness,7.06,Sara,Ramirez,F,23843 Scott Island,Birmingham,...,41.78898,-91.348646,False,1988,3,2019,11,9,5,21
2,2019-11-09 21:15:01,Unknown,fraud_Kuhn Group,food_dining,5.07,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,...,43.679707,-84.268857,False,1955,7,2019,11,9,5,21
3,2019-11-09 21:15:30,Diners Club,fraud_Roberts-Beahan,misc_pos,9.96,Anna,Logan,F,3522 Park Wells Suite 528,Cleveland,...,31.026641,-95.316548,False,1995,9,2019,11,9,5,21
4,2019-11-09 21:15:54,American Express,fraud_Abbott-Rogahn,entertainment,14.66,Daniel,Escobar,M,61390 Hayes Port,Romulus,...,41.981726,-83.939642,False,1971,11,2019,11,9,5,21


### Filtering

In [19]:
df_f.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'trans_amount', 'first_name', 'last_name', 'gender', 'street', 'city',
       'state_code', 'zip', 'latitude', 'longitude', 'city_population', 'job',
       'dob', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud', 'birth_year',
       'birth_month', 'trans_year', 'trans_month', 'trans_day',
       'trans_dayofweek', 'trans_hour'],
      dtype='object')

In [20]:
selected_cols = ['cc_num', 'category', 'trans_amount', 'gender','city',
                 'state_code', 'city_population','job','is_fraud','birth_year',
                 'birth_month', 'trans_year', 'trans_month', 'trans_day','trans_dayofweek',
                 'trans_hour']
df_filtered = df_f[selected_cols]
df_filtered.head()

Unnamed: 0,cc_num,category,trans_amount,gender,city,state_code,city_population,job,is_fraud,birth_year,birth_month,trans_year,trans_month,trans_day,trans_dayofweek,trans_hour
0,Diners Club,shopping_net,2.03,M,Falconer,NY,3833,Metallurgist,False,1970,10,2019,11,9,5,21
1,Unknown,health_fitness,7.06,F,Birmingham,IA,888,Camera operator,False,1988,3,2019,11,9,5,21
2,Unknown,food_dining,5.07,M,Falmouth,MI,1126,Furniture designer,False,1955,7,2019,11,9,5,21
3,Diners Club,misc_pos,9.96,F,Cleveland,TX,34153,Building surveyor,False,1995,9,2019,11,9,5,21
4,American Express,entertainment,14.66,M,Romulus,MI,31515,Police officer,False,1971,11,2019,11,9,5,21


### Reduce granularity

Columnas númericas con una gran variedad de datos númericos se reducen, en este caso dependiendo de los percentiles para disminuir la granularidad de los datos, disminuir la complejidad del modelo y aumentar la eficiencia del modelo

In [21]:
percentiles_trans = df_filtered["trans_amount"].quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,0.95,0.99,0.9999,0.99999])
bins = [-float('inf')] + list(percentiles_trans.values) + [float('inf')]
labels = [f"{round(p, 4)}" for p in percentiles_trans.values] + ["1.0"]
df_filtered['trans_amount_cat'] = pd.cut(df_filtered['trans_amount'], bins=bins, labels=labels, ordered = False,include_lowest=True)
df_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['trans_amount_cat'] = pd.cut(df_filtered['trans_amount'], bins=bins, labels=labels, ordered = False,include_lowest=True)


Unnamed: 0,cc_num,category,trans_amount,gender,city,state_code,city_population,job,is_fraud,birth_year,birth_month,trans_year,trans_month,trans_day,trans_dayofweek,trans_hour,trans_amount_cat
0,Diners Club,shopping_net,2.03,M,Falconer,NY,3833,Metallurgist,False,1970,10,2019,11,9,5,21,4.11
1,Unknown,health_fitness,7.06,F,Birmingham,IA,888,Camera operator,False,1988,3,2019,11,9,5,21,7.73
2,Unknown,food_dining,5.07,M,Falmouth,MI,1126,Furniture designer,False,1955,7,2019,11,9,5,21,7.73
3,Diners Club,misc_pos,9.96,F,Cleveland,TX,34153,Building surveyor,False,1995,9,2019,11,9,5,21,15.66
4,American Express,entertainment,14.66,M,Romulus,MI,31515,Police officer,False,1971,11,2019,11,9,5,21,15.66


In [22]:
percentiles_pop = df_filtered["city_population"].quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,0.95])
bins = [-float('inf')] + list(percentiles_pop.values) + [float('inf')]
labels = [f"{round(p, 4)}" for p in percentiles_pop.values] + ["1.0"]
df_filtered['city_pop_cat'] = pd.cut(df_filtered['city_population'], bins=bins, labels=labels, ordered = False,include_lowest=True)
df_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['city_pop_cat'] = pd.cut(df_filtered['city_population'], bins=bins, labels=labels, ordered = False,include_lowest=True)


Unnamed: 0,cc_num,category,trans_amount,gender,city,state_code,city_population,job,is_fraud,birth_year,birth_month,trans_year,trans_month,trans_day,trans_dayofweek,trans_hour,trans_amount_cat,city_pop_cat
0,Diners Club,shopping_net,2.03,M,Falconer,NY,3833,Metallurgist,False,1970,10,2019,11,9,5,21,4.11,4726.0
1,Unknown,health_fitness,7.06,F,Birmingham,IA,888,Camera operator,False,1988,3,2019,11,9,5,21,7.73,964.0
2,Unknown,food_dining,5.07,M,Falmouth,MI,1126,Furniture designer,False,1955,7,2019,11,9,5,21,7.73,1631.0
3,Diners Club,misc_pos,9.96,F,Cleveland,TX,34153,Building surveyor,False,1995,9,2019,11,9,5,21,15.66,42619.0
4,American Express,entertainment,14.66,M,Romulus,MI,31515,Police officer,False,1971,11,2019,11,9,5,21,15.66,42619.0


### Filtering

In [23]:
df_filtered.columns

Index(['cc_num', 'category', 'trans_amount', 'gender', 'city', 'state_code',
       'city_population', 'job', 'is_fraud', 'birth_year', 'birth_month',
       'trans_year', 'trans_month', 'trans_day', 'trans_dayofweek',
       'trans_hour', 'trans_amount_cat', 'city_pop_cat'],
      dtype='object')

In [24]:
df_fi = df_filtered[['cc_num', 'category','gender', 'city', 'state_code',
                    'job', 'is_fraud', 'birth_year', 'birth_month','trans_year',
                    'trans_month', 'trans_day', 'trans_dayofweek', 'trans_hour','trans_amount_cat',
                    'city_pop_cat']]
df_fi.head()

Unnamed: 0,cc_num,category,gender,city,state_code,job,is_fraud,birth_year,birth_month,trans_year,trans_month,trans_day,trans_dayofweek,trans_hour,trans_amount_cat,city_pop_cat
0,Diners Club,shopping_net,M,Falconer,NY,Metallurgist,False,1970,10,2019,11,9,5,21,4.11,4726.0
1,Unknown,health_fitness,F,Birmingham,IA,Camera operator,False,1988,3,2019,11,9,5,21,7.73,964.0
2,Unknown,food_dining,M,Falmouth,MI,Furniture designer,False,1955,7,2019,11,9,5,21,7.73,1631.0
3,Diners Club,misc_pos,F,Cleveland,TX,Building surveyor,False,1995,9,2019,11,9,5,21,15.66,42619.0
4,American Express,entertainment,M,Romulus,MI,Police officer,False,1971,11,2019,11,9,5,21,15.66,42619.0


### Numerical data

Se transforman las columnas con texto en números para proceder con el modelo

In [25]:
categorical_cols = df_fi.select_dtypes(include=['object']).columns
# Usar OrdinalEncoder para convertirlas
encoder = OrdinalEncoder()
df_fi[categorical_cols] = encoder.fit_transform(df_fi[categorical_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fi[categorical_cols] = encoder.fit_transform(df_fi[categorical_cols])


In [26]:
df_fi.head()

Unnamed: 0,cc_num,category,gender,city,state_code,job,is_fraud,birth_year,birth_month,trans_year,trans_month,trans_day,trans_dayofweek,trans_hour,trans_amount_cat,city_pop_cat
0,1.0,11.0,1.0,253.0,34.0,296.0,False,1970,10,2019,11,9,5,21,4.11,4726.0
1,5.0,5.0,0.0,73.0,12.0,60.0,False,1988,3,2019,11,9,5,21,7.73,964.0
2,5.0,1.0,1.0,256.0,22.0,204.0,False,1955,7,2019,11,9,5,21,7.73,1631.0
3,1.0,9.0,0.0,150.0,43.0,55.0,False,1995,9,2019,11,9,5,21,15.66,42619.0
4,0.0,0.0,1.0,689.0,22.0,345.0,False,1971,11,2019,11,9,5,21,15.66,42619.0


In [27]:
df_fi["is_fraud"] = df_fi["is_fraud"].astype(int)
df_fi.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fi["is_fraud"] = df_fi["is_fraud"].astype(int)


Unnamed: 0,cc_num,category,gender,city,state_code,job,is_fraud,birth_year,birth_month,trans_year,trans_month,trans_day,trans_dayofweek,trans_hour,trans_amount_cat,city_pop_cat
0,1.0,11.0,1.0,253.0,34.0,296.0,0,1970,10,2019,11,9,5,21,4.11,4726.0
1,5.0,5.0,0.0,73.0,12.0,60.0,0,1988,3,2019,11,9,5,21,7.73,964.0
2,5.0,1.0,1.0,256.0,22.0,204.0,0,1955,7,2019,11,9,5,21,7.73,1631.0
3,1.0,9.0,0.0,150.0,43.0,55.0,0,1995,9,2019,11,9,5,21,15.66,42619.0
4,0.0,0.0,1.0,689.0,22.0,345.0,0,1971,11,2019,11,9,5,21,15.66,42619.0


In [28]:
df_fi.nunique()

cc_num                7
category             14
gender                2
city                876
state_code           51
job                 492
is_fraud              2
birth_year           81
birth_month          12
trans_year            1
trans_month          12
trans_day            31
trans_dayofweek       7
trans_hour           24
trans_amount_cat     14
city_pop_cat         11
dtype: int64

## Modelo

In [42]:
# Separar el target y las características
X = df_fi.drop(columns=['is_fraud'])
y = df_fi['is_fraud']

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Escalar las columnas numéricas
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Definir y entrenar modelos
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
}

# Evaluar cada modelo
for name, model in models.items():
    # Entrenar el modelo
    model.fit(X_train, y_train)
    
    # Predicciones
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    
    # Métricas de evaluación
    print(f"--- {name} ---")
    print("ROC AUC:", roc_auc_score(y_test, y_proba))
    print("Classification Report:\n", classification_report(y_test, y_pred))

--- XGBoost ---
ROC AUC: 0.9981821566336253
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    183926
           1       0.97      0.83      0.89      1044

    accuracy                           1.00    184970
   macro avg       0.98      0.92      0.95    184970
weighted avg       1.00      1.00      1.00    184970

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    183926
           1       0.97      0.83      0.89      1044

    accuracy                           1.00    184970
   macro avg       0.98      0.92      0.95    184970
weighted avg       1.00      1.00      1.00    184970



### Modelo Seleccionado

In [43]:
# Definir el modelo XGBoost
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')

# Entrenar el modelo
xgb_model.fit(X_train, y_train)

# Predicciones
y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Imprimir las métricas de evaluación
print("--- XGBoost ---")
print("ROC AUC:", roc_auc_score(y_test, y_proba_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

--- XGBoost ---
ROC AUC: 0.9981821566336253
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    183926
           1       0.97      0.83      0.89      1044

    accuracy                           1.00    184970
   macro avg       0.98      0.92      0.95    184970
weighted avg       1.00      1.00      1.00    184970



## Guardado

In [44]:
# Guardar el encoder (OrdinalEncoder)
joblib.dump(encoder, '../models/ordinal_encoder.pkl')

# Guardar el scaler (StandardScaler)
joblib.dump(scaler, '../models/scaler.pkl')

# Guardar el modelo
joblib.dump(xgb_model, '../models/xgboost_model.pkl')

['xgboost_model.pkl']