<a href="https://colab.research.google.com/github/MaxiPerrone/machine-learning/blob/main/3_Preparacion_conjunto_datos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [64]:
import pandas as pd
import kagglehub
import os

dataset_path = kagglehub.dataset_download("dhanushnarayananr/credit-card-fraud")
csv_file = os.path.join(dataset_path, "card_transdata.csv")

In [65]:
df_orig = pd.read_csv(csv_file);
df = df_orig.copy()

In [66]:
import numpy as np

df['card_type'] = np.random.choice(['debit', 'credit', 'prepaid'], size=len(df), p=[0.7, 0.2, 0.1])

In [67]:
df['amount'] = np.round(np.random.uniform(1, 5000, size=len(df)), 2)
df.loc[(df['amount'] > 1000) | (df['amount'] < 10), 'amount'] = np.nan

In [68]:
from sklearn.model_selection import train_test_split

def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
  strat = df[stratify] if stratify else None
  train_set, test_set = train_test_split(
    df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
  strat = test_set[stratify] if stratify else None
  val_set, test_set = train_test_split(
    test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
  return (train_set, val_set, test_set)

In [69]:
train_set, val_set, test_set = train_val_test_split(df, stratify='card_type')

In [70]:
print("Training set length: ", len(train_set))
print("Testing set length: ", len(test_set))
print("Validation set length: ", len(val_set))

Training set length:  600000
Testing set length:  200000
Validation set length:  200000


In [71]:
X_train = train_set.drop("fraud", axis=1)
y_train = train_set["fraud"].copy()

In [72]:
X_train.isna().any()

Unnamed: 0,0
distance_from_home,False
distance_from_last_transaction,False
ratio_to_median_purchase_price,False
repeat_retailer,False
used_chip,False
used_pin_number,False
online_order,False
card_type,False
amount,True


In [73]:
X_train.head(10)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,card_type,amount
630755,69.145587,85.843107,0.16811,1.0,0.0,0.0,1.0,debit,
252857,37.183254,0.077083,1.173941,1.0,0.0,0.0,1.0,prepaid,
811964,68.235343,2.870761,4.137703,1.0,1.0,0.0,0.0,debit,113.23
351307,13.981492,2.42873,0.320065,1.0,0.0,0.0,0.0,credit,
54915,0.613866,1.062319,0.996665,0.0,0.0,0.0,0.0,debit,220.93
701794,12.104779,0.03505,0.458497,1.0,0.0,0.0,0.0,debit,
507205,4.036153,5.63831,0.720979,1.0,1.0,0.0,1.0,debit,
428083,4.783994,1.587495,0.169164,1.0,0.0,0.0,1.0,prepaid,
453868,17.137055,0.208326,9.227288,1.0,0.0,0.0,1.0,debit,57.92
191027,3.830826,14.62202,1.395116,1.0,0.0,0.0,0.0,debit,


In [74]:
print("Null values in amount: ", X_train["amount"].isna().sum())

Null values in amount:  481273


In [75]:
rows_null_values = X_train[X_train.isnull().any(axis=1)]
rows_null_values

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,card_type,amount
630755,69.145587,85.843107,0.168110,1.0,0.0,0.0,1.0,debit,
252857,37.183254,0.077083,1.173941,1.0,0.0,0.0,1.0,prepaid,
351307,13.981492,2.428730,0.320065,1.0,0.0,0.0,0.0,credit,
701794,12.104779,0.035050,0.458497,1.0,0.0,0.0,0.0,debit,
507205,4.036153,5.638310,0.720979,1.0,1.0,0.0,1.0,debit,
...,...,...,...,...,...,...,...,...,...
14539,6.508091,0.161681,0.669165,1.0,0.0,0.0,1.0,debit,
283010,22.071720,0.126742,1.321107,1.0,0.0,0.0,1.0,debit,
685698,3.940439,0.565741,1.377652,1.0,0.0,1.0,1.0,credit,
652578,5.883745,0.782946,1.292113,1.0,0.0,0.0,0.0,debit,


In [76]:
X_train_copy = X_train.copy()
X_train_copy.dropna(subset=["amount"], inplace=True)

print(X_train.shape)
print(X_train_copy.shape)

(600000, 9)
(118727, 9)


In [77]:
X_train_copy = X_train.copy()
X_train_copy.drop("amount", axis=1, inplace=True)

X_train_copy

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,card_type
630755,69.145587,85.843107,0.168110,1.0,0.0,0.0,1.0,debit
252857,37.183254,0.077083,1.173941,1.0,0.0,0.0,1.0,prepaid
811964,68.235343,2.870761,4.137703,1.0,1.0,0.0,0.0,debit
351307,13.981492,2.428730,0.320065,1.0,0.0,0.0,0.0,credit
54915,0.613866,1.062319,0.996665,0.0,0.0,0.0,0.0,debit
...,...,...,...,...,...,...,...,...
14539,6.508091,0.161681,0.669165,1.0,0.0,0.0,1.0,debit
283010,22.071720,0.126742,1.321107,1.0,0.0,0.0,1.0,debit
685698,3.940439,0.565741,1.377652,1.0,0.0,1.0,1.0,credit
652578,5.883745,0.782946,1.292113,1.0,0.0,0.0,0.0,debit


In [78]:
print("Null values in amount: ", X_train["amount"].isna().sum())

amount_mean = X_train["amount"].mean()
X_train["amount"] = X_train["amount"].fillna(amount_mean)

print("Null values in amount: ", X_train["amount"].isna().sum())

Null values in amount:  481273
Null values in amount:  0


In [82]:
card_type = X_train["card_type"]
converted, categories = card_type.factorize()

In [83]:
for i in range(10):
  print(card_type.iloc[i], converted[i])

debit 0
prepaid 1
debit 0
credit 2
debit 0
debit 0
debit 0
prepaid 1
debit 0
debit 0


In [89]:
from sklearn.preprocessing import OrdinalEncoder

card_type_df = X_train[["card_type"]]
encoder = OrdinalEncoder()
card_type_encoded = encoder.fit_transform(card_type_df)

In [88]:
card_type_encoded

array([[1.],
       [2.],
       [1.],
       ...,
       [0.],
       [1.],
       [0.]])

In [91]:
from sklearn.preprocessing import OneHotEncoder

card_type_df = X_train[["card_type"]]
encoder = OneHotEncoder()
card_type_encoded = encoder.fit_transform(card_type_df)

card_type_encoded.toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [97]:
from sklearn.preprocessing import RobustScaler

to_scale = X_train[["amount"]]
scaler = RobustScaler()
scaled = scaler.fit_transform(to_scale)

scaled = pd.DataFrame(scaled, columns=["amount"])
scaled.head(10)

Unnamed: 0,amount
0,0.0
1,0.0
2,-390.871652
3,0.0
4,-283.171652
5,0.0
6,0.0
7,0.0
8,-446.181652
9,0.0
