# Data Cleaning

In [5]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("sba_national.csv", low_memory=False)
df.shape

(899164, 27)

## Data Validation

In [None]:
df.dtypes

LoanNr_ChkDgt          int64
Name                  object
City                  object
State                 object
Zip                    int64
Bank                  object
BankState             object
NAICS                  int64
ApprovalDate          object
ApprovalFY            object
Term                   int64
NoEmp                  int64
NewExist             float64
CreateJob              int64
RetainedJob            int64
FranchiseCode          int64
UrbanRural             int64
RevLineCr             object
LowDoc                object
ChgOffDate            object
DisbursementDate      object
DisbursementGross     object
BalanceGross          object
MIS_Status            object
ChgOffPrinGr          object
GrAppv                object
SBA_Appv              object
dtype: object

### LoanNr_ChkDgt, Name, City, Zip, Bank, FranchiseCode, BalanceGross, ChgOffDate y ChgOffPrinGr

In [None]:
drop_cols = ["LoanNr_ChkDgt",   # tiene 100% de valores distintos
             "Name",            # tiene 86.7% de valores distintos
             "City",            # cuenta con 32581 ciudades diferentes
             "Zip",             # cuenta con 33611 codigos postales diferentes
             "Bank",            # cuenta con 5802 bancos diferentes
             "FranchiseCode",   # tiene 94.2% de codigos sin franquicia
             "BalanceGross",    # tiene practicamente 100% de valores en cero
             "ChgOffDate",      # fuga de informacion de la variable objetivo
             "ChgOffPrinGr"]    # fuga de informacion de la variable objetivo

In [None]:
df.drop(drop_cols, axis=1, inplace=True)
df.shape

(899164, 18)

### ApprovalDate, ApprovalFY y DisbursementDate

In [None]:
drop_cols = ["ApprovalDate",    # comportamiento similar a DisbursementDate
             "ApprovalFY"]      # comportamiento similar a DisbursementDate

In [None]:
# remplazamos las fechas de desembolsos nulas por las fechas de aprobacion
df["DisbursementDate"] = np.where(df["DisbursementDate"].isnull(), df["ApprovalDate"], df["DisbursementDate"])

In [None]:
# formateamos la fecha debido a que el año tiene solamente dos digitos
def format_date(app_date):
    date = app_date.split("-")
    year = date[2]
    if int(year) > 14: year = "19" + year
    else: year = "20" + year
    return f"{date[0]}-{date[1]}-{year}"

df["DisbursementDate"] = pd.to_datetime(df["DisbursementDate"].apply(format_date))

In [None]:
df.drop(drop_cols, axis=1, inplace=True)
df.shape

(899164, 16)

### State, BankState, Term, NoEmp, CreateJob, RetainedJob y UrbanRural

In [None]:
df["State"].unique()

array(['IN', 'OK', 'FL', 'CT', 'NJ', 'NC', 'IL', 'RI', 'TX', 'VA', 'TN',
       'AR', 'MN', 'MO', 'MA', 'CA', 'SC', 'LA', 'IA', 'OH', 'KY', 'MS',
       'NY', 'MD', 'PA', 'OR', 'ME', 'KS', 'MI', 'AK', 'WA', 'CO', 'MT',
       'WY', 'UT', 'NH', 'WV', 'ID', 'AZ', 'NV', 'WI', 'NM', 'GA', 'ND',
       'VT', 'AL', 'NE', 'SD', 'HI', 'DE', 'DC', nan], dtype=object)

In [None]:
df["BankState"].unique()

array(['OH', 'IN', 'OK', 'FL', 'DE', 'SD', 'AL', 'CT', 'GA', 'OR', 'MN',
       'RI', 'NC', 'TX', 'MD', 'NY', 'TN', 'SC', 'MS', 'MA', 'LA', 'IA',
       'VA', 'CA', 'IL', 'KY', 'PA', 'MO', 'WA', 'MI', 'UT', 'KS', 'WV',
       'WI', 'AZ', 'NJ', 'CO', 'ME', 'NH', 'AR', 'ND', 'MT', 'ID', nan,
       'WY', 'NM', 'DC', 'NV', 'NE', 'PR', 'HI', 'VT', 'AK', 'GU', 'AN',
       'EN', 'VI'], dtype=object)

In [None]:
df["Term"].value_counts()

84     230162
60      89945
240     85982
120     77654
300     44727
        ...  
438         1
382         1
367         1
374         1
429         1
Name: Term, Length: 412, dtype: int64

In [None]:
df["NoEmp"].value_counts()

1       154254
2       138297
3        90674
4        73644
5        60319
         ...  
414          1
604          1
1280         1
3089         1
3713         1
Name: NoEmp, Length: 599, dtype: int64

In [None]:
df["CreateJob"].value_counts()

0      629248
1       63174
2       57831
3       28806
4       20511
        ...  
171         1
146         1
214         1
179         1
860         1
Name: CreateJob, Length: 246, dtype: int64

In [None]:
df["RetainedJob"].value_counts()

0      440403
1       88790
2       76851
3       49963
4       39666
        ...  
191         1
247         1
226         1
675         1
198         1
Name: RetainedJob, Length: 358, dtype: int64

In [None]:
df["UrbanRural"].value_counts()

1    470654
0    323167
2    105343
Name: UrbanRural, dtype: int64

### DisbursementGross, GrAppv, SBA_Appv

In [None]:
# DisbursementGross: formateamos la variable de currency a entero
df["DisbursementGross"] = df["DisbursementGross"].str.replace(r"[\$,]", "", regex=True).astype(float)
df["DisbursementGross"] = df["DisbursementGross"].round().astype(int)
df["DisbursementGross"].value_counts()

50000      43787
100000     36714
25000      27387
150000     23373
10000      21328
           ...  
143118         1
23827          1
157338         1
60698          1
1086300        1
Name: DisbursementGross, Length: 118859, dtype: int64

In [None]:
# GrAppv: formateamos la variable de currency a entero
df["GrAppv"] = df["GrAppv"].str.replace(r"[\$,]", "", regex=True).astype(float)
df["GrAppv"] = df["GrAppv"].round().astype(int)
df["GrAppv"].value_counts()

50000      69394
25000      51258
100000     50977
10000      38366
150000     27624
           ...  
407542         1
220900         1
57587          1
531577         1
1086300        1
Name: GrAppv, Length: 22128, dtype: int64

In [None]:
# SBA_Appv: formateamos la variable de currency a entero
df["SBA_Appv"] = df["SBA_Appv"].str.replace(r"[\$,]", "", regex=True).astype(float)
df["SBA_Appv"] = df["SBA_Appv"].round().astype(int)
df["SBA_Appv"].value_counts()

25000     49579
12500     40147
5000      31135
50000     25047
10000     17009
          ...  
192949        1
212352        1
57072         1
32938         1
715674        1
Name: SBA_Appv, Length: 38326, dtype: int64

### NAICS

In [None]:
sector = {"11":"Agriculture, forestry, fishing and hunting",
          "21":"Mining, quarrying, and oil and gas extraction",
          "22":"Utilities",
          "23":"Construction",
          "31":"Manufacturing",
          "32":"Manufacturing",
          "33":"Manufacturing",
          "42":"Wholesale trade",
          "44":"Retail trade",
          "45":"Retail trade",
          "48":"Transportation and warehousing",
          "49":"Transportation and warehousing",
          "51":"Information",
          "52":"Finance and insurance",
          "53":"Real estate and rental and leasing",
          "54":"Professional, scientific, and technical services",
          "55":"Management of companies and enterprises",
          "56":"Administrative and support and waste management and remediation services",
          "61":"Educational services",
          "62":"Health care and social assistance",
          "71":"Arts, entertainment, and recreation",
          "72":"Accommodation and food services",
          "81":"Other services (except public administration)",
          "92":"Public administration",
          "0":"[Unallocated sector]"} # creamos la categoria "sector no asignado"}

In [None]:
# asignamos a NAICS el nombre del sector correspondiente
df["NAICS"] = df["NAICS"].astype(str).str[0:2]
df["NAICS"] = df["NAICS"].apply(lambda x: sector[x])
df["NAICS"]

0                              Retail trade
1           Accommodation and food services
2         Health care and social assistance
3                      [Unallocated sector]
4                      [Unallocated sector]
                        ...                
899159                         Retail trade
899160                         Retail trade
899161                        Manufacturing
899162                 [Unallocated sector]
899163                 [Unallocated sector]
Name: NAICS, Length: 899164, dtype: object

### NewExist, RevLineCr y LowDoc

In [None]:
# NewExist: convertimos a NaN los valores no definidos
df["NewExist"] = np.where(df["NewExist"]==0.0, np.nan, df["NewExist"])
df["NewExist"] = df["NewExist"].astype("Int64")
df["NewExist"].value_counts()

1    644869
2    253125
Name: NewExist, dtype: Int64

In [None]:
# RevLineCr: convertimos a enteros los valores definidos y a NaN los demas
# asumiremos que T (true) y 1 podrian ser Y (yes), y 0 podria ser N (no)
df["RevLineCr"] = np.where(df["RevLineCr"].isin(["Y","T"]), "1", df["RevLineCr"])
df["RevLineCr"] = np.where(df["RevLineCr"].isin(["N"]), "0", df["RevLineCr"])
df["RevLineCr"] = np.where(~df["RevLineCr"].isin(["1","0"]), np.nan, df["RevLineCr"])
df["RevLineCr"] = df["RevLineCr"].astype("Int64")
df["RevLineCr"].value_counts()

0    677890
1    216704
Name: RevLineCr, dtype: Int64

In [None]:
# LowDoc: convertimos a enteros los valores definidos y a NaN los demas
# asumiremos que 1 podria ser Y (yes), y que 0 podria ser N (no)
df["LowDoc"] = np.where(df["LowDoc"]=="Y", "1", df["LowDoc"])
df["LowDoc"] = np.where(df["LowDoc"]=="N", "0", df["LowDoc"])
df["LowDoc"] = np.where(~df["LowDoc"].isin(["1","0"]), np.nan, df["LowDoc"])
df["LowDoc"] = df["LowDoc"].astype("Int64")
df["LowDoc"].value_counts()

0    784313
1    110336
Name: LowDoc, dtype: Int64

### MIS_Status

In [None]:
# MIS_Status: convertimos los valores a enteros (1 = Default y 0 = Paid)
df["MIS_Status"] = np.where(df["MIS_Status"]=="CHGOFF", 1, df["MIS_Status"])
df["MIS_Status"] = np.where(df["MIS_Status"]=="P I F", 0, df["MIS_Status"])
df["MIS_Status"] = df["MIS_Status"].astype("Int64")
df["MIS_Status"].value_counts()

0    739609
1    157558
Name: MIS_Status, dtype: Int64

### **Summary**

In [None]:
# adicionalmente se visualizan y eliminan las instancias duplicadas
print(df[df.duplicated].shape)
display(df[df.duplicated].sample(3))

df.drop_duplicates(inplace=True)

(1829, 16)


Unnamed: 0,State,BankState,NAICS,Term,NoEmp,NewExist,CreateJob,RetainedJob,UrbanRural,RevLineCr,LowDoc,DisbursementDate,DisbursementGross,MIS_Status,GrAppv,SBA_Appv
722253,IN,VA,Retail trade,84,4,1,0,4,1,0,0,2004-10-31,50000,0,50000,25000
777808,WY,WY,[Unallocated sector],60,1,1,0,0,0,0,1,1995-04-30,25000,0,25000,21500
18179,CA,NC,"Professional, scientific, and technical services",84,1,1,0,0,1,1,0,2005-01-31,50000,0,50000,25000


In [None]:
print(df.shape)
df.dtypes

(897335, 16)


State                        object
BankState                    object
NAICS                        object
Term                          int64
NoEmp                         int64
NewExist                      Int64
CreateJob                     int64
RetainedJob                   int64
UrbanRural                    int64
RevLineCr                     Int64
LowDoc                        Int64
DisbursementDate     datetime64[ns]
DisbursementGross             int64
MIS_Status                    Int64
GrAppv                        int64
SBA_Appv                      int64
dtype: object

## Missing Values

In [21]:
# df.to_csv("sba_clean_v1.csv", index=False)
df = pd.read_csv("sba_clean_v1.csv", low_memory=False)
df.shape

(897335, 16)

In [5]:
df.isnull().sum()

State                  14
BankState            1562
NAICS                   0
Term                    0
NoEmp                   0
NewExist             1169
CreateJob               0
RetainedJob             0
UrbanRural              0
RevLineCr            4566
LowDoc               4512
DisbursementDate        0
DisbursementGross       0
MIS_Status           1951
GrAppv                  0
SBA_Appv                0
dtype: int64

### State

In [8]:
# imputamos los valores faltantes con la moda de la variable
mode = df["State"].mode()[0]
df["State"] = df["State"].fillna(mode)

### BankState

In [12]:
# imputamos los valores faltantes con la moda de la variable
col_values = df["BankState"]
mode_imputer = SimpleImputer(strategy="most_frequent")
mode_imputer_fit = mode_imputer.fit(pd.DataFrame(col_values))
df["BankState"] = mode_imputer_fit.transform(pd.DataFrame(col_values))

### NewExist

In [17]:
# aplicamos imputacion multivariante de faltantes con KNN Imputer
cols_fit = df[["NewExist", "Term", "NoEmp", "CreateJob", "RetainedJob",
               "UrbanRural", "DisbursementGross", "GrAppv", "SBA_Appv"]]
k = int(round(np.sqrt(df.shape[0]), 2))
knn_imputer = KNNImputer(n_neighbors=k)
knn_imputer_fit = knn_imputer.fit(cols_fit)

cols_imputed = knn_imputer_fit.transform(cols_fit)
df["NewExist"] = cols_imputed[:, 0]

# redondear y convertir a entero...
df["NewExist"].value_counts()

1.000000    642126
2.000000    252097
1.226216        13
1.232558        12
1.229387        12
             ...  
1.377378         1
1.091966         1
1.264271         1
1.440803         1
1.550740         1
Name: NewExist, Length: 362, dtype: int64

### RevLineCr

In [19]:
# imputamos valores categoricos faltantes utilizando modelos de clasificacion

# Dividir el DataFrame en conjunto de entrenamiento y conjunto de prueba
df_train = df.dropna(subset=['A'])  # Conjunto de entrenamiento sin valores faltantes en la columna "A"
df_test = df[df['A'].isnull()]     # Conjunto de prueba con valores faltantes en la columna "A"

# Dividir los conjuntos de entrenamiento y prueba en variables predictoras y variable objetivo
X_train = df_train.drop(columns=['A'])  # Variables predictoras del conjunto de entrenamiento
y_train = df_train['A']                 # Variable objetivo (columna "A") del conjunto de entrenamiento
X_test = df_test.drop(columns=['A'])    # Variables predictoras del conjunto de prueba

# Crear y entrenar un modelo RandomForestClassifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

# Predecir los valores faltantes en la columna "A" utilizando el modelo entrenado
columna_a_imputar_categorica_imputada = rf_classifier.predict(X_test)

# Asignar los valores imputados en la columna original del DataFrame
df.loc[df['A'].isnull(), 'A'] = columna_a_imputar_categorica_imputada

### LowDoc

In [None]:
# imputamos valores categoricos faltantes utilizando modelos de clasificacion


### MIS_Status

In [14]:
# eliminamos todas las instancias nulas de la variable objetivo
df.dropna(subset=["MIS_Status"], inplace=True)
df["MIS_Status"] = df["MIS_Status"].astype(int)
df.shape

(895384, 16)

### **Summary**

In [18]:
print(df.shape)
df.isnull().sum()

(895384, 16)


State                   0
BankState               0
NAICS                   0
Term                    0
NoEmp                   0
NewExist                0
CreateJob               0
RetainedJob             0
UrbanRural              0
RevLineCr            4558
LowDoc               4505
DisbursementDate        0
DisbursementGross       0
MIS_Status              0
GrAppv                  0
SBA_Appv                0
dtype: int64

## Outlier Detection

In [None]:
df.to_csv("sba_clean_v2.csv", index=False)
# df = pd.read_csv("sba_clean_v2.csv", low_memory=False)
# df.shape

(895371, 16)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 895384 entries, 0 to 897334
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   State              895384 non-null  object 
 1   BankState          895384 non-null  object 
 2   NAICS              895384 non-null  object 
 3   Term               895384 non-null  int64  
 4   NoEmp              895384 non-null  int64  
 5   NewExist           894223 non-null  float64
 6   CreateJob          895384 non-null  int64  
 7   RetainedJob        895384 non-null  int64  
 8   UrbanRural         895384 non-null  int64  
 9   RevLineCr          890826 non-null  float64
 10  LowDoc             890879 non-null  float64
 11  DisbursementDate   895384 non-null  object 
 12  DisbursementGross  895384 non-null  int64  
 13  MIS_Status         895384 non-null  int64  
 14  GrAppv             895384 non-null  int64  
 15  SBA_Appv           895384 non-null  int64  
dtypes:

### Term

### NoEmp

### CreateJob

### RetainedJob

### DisbursementGross

### GrAppv

### SBA_Appv

### **Summary**

In [None]:
print(df.shape)
df.sample(3)

(895371, 16)


Unnamed: 0,State,BankState,NAICS,Term,NoEmp,NewExist,CreateJob,RetainedJob,UrbanRural,RevLineCr,LowDoc,DisbursementDate,DisbursementGross,MIS_Status,GrAppv,SBA_Appv
80991,IL,SD,Health care and social assistance,120,2,2,0,0,0,0.0,0.0,1998-03-31,303000,0.0,365000,273750
439573,LA,LA,Accommodation and food services,120,30,1,0,0,1,0.0,1.0,2001-02-28,100000,0.0,100000,85000
879162,OK,OK,[Unallocated sector],120,8,1,0,0,0,0.0,0.0,1997-02-28,350000,0.0,350000,262500
