<a href="https://colab.research.google.com/github/LCaravaggio/AnalisisPredictivo/blob/master/06_SVM/Default_classification_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cargar datos

In [1]:
import json
from google.colab import drive

!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

drive.mount('/content/drive', force_remount=True)
with open("/content/drive/My Drive/kaggle.json", 'r') as f:
    api_token= json.load(f)

with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)

!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d uciml/default-of-credit-card-clients-dataset

Mounted at /content/drive
Downloading default-of-credit-card-clients-dataset.zip to /content
100% 0.98M/0.98M [00:00<00:00, 1.97MB/s]
100% 0.98M/0.98M [00:00<00:00, 1.97MB/s]


In [2]:
import zipfile
import os

os.listdir()

for file in os.listdir():
    if file.endswith('.zip'):
      zip_ref = zipfile.ZipFile(file, 'r')
      zip_ref.extractall()
      zip_ref.close()

In [3]:
import pandas as pd
# Cargamos la base
df = pd.read_csv('/content/UCI_Credit_Card.csv')

In [4]:
df.rename({'default.payment.next.month' : 'Default'} ,axis=1 , inplace=True)
df.drop('ID' , axis=1 , inplace=True)

In [5]:
# Eliminamos nan. Tal vez no sea la mejor manera, pero son pocos
df = df.loc[(df['MARRIAGE']!=0) & (df['EDUCATION']!=0)]

In [6]:
# Vamos a entrenar con menos observaciones
from sklearn.utils import resample

df_default = df[df['Default']==1]
df_no_default = df[df['Default']==0]

df_default_downsampled = resample(df_default , replace=False , n_samples=1000 , random_state=42)
df_no_default_downsampled = resample(df_no_default , replace=False , n_samples=1000 , random_state=42)

df_downsampled = pd.concat([df_default_downsampled,df_no_default_downsampled])

In [7]:
X = df_downsampled.drop('Default' ,axis=1)
Y = df_downsampled['Default']

In [8]:
X_encoded = pd.get_dummies(X, columns = ['SEX' ,
                                         'EDUCATION' ,
                                         'MARRIAGE' ,
                                         'PAY_0',
                                         'PAY_2',
                                         'PAY_3',
                                         'PAY_4',
                                         'PAY_5',
                                         'PAY_6'])

In [9]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train, y_test = train_test_split(X_encoded , Y , random_state=42)

In [10]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

# A entrenar modelos

In [11]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(X_train_scaled , y_train)

In [12]:
from sklearn.metrics import accuracy_score

y_pred_train = clf.predict(X_train_scaled)
y_pred_test = clf.predict(X_test_scaled)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 0.7433333333333333
Accuracy test: 0.726


In [13]:
clf=SVC(C=500 , gamma=0.001 ,kernel='rbf')
clf.fit(X_train_scaled , y_train)

In [14]:
y_pred_train = clf.predict(X_train_scaled)
y_pred_test = clf.predict(X_test_scaled)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 0.7773333333333333
Accuracy test: 0.688


# Probar Reducir la dimensionalidad

In [15]:
from sklearn.manifold import TSNE

tsne=TSNE(n_components=2, init="pca",learning_rate="auto", n_iter=500, n_iter_without_progress=150, n_jobs=2)
x_train_reduced=tsne.fit_transform(X_train_scaled)
x_test_reduced=tsne.fit_transform(X_test_scaled)

In [16]:
clf=SVC()
clf.fit(x_train_reduced , y_train)

In [17]:
y_pred_train = clf.predict(x_train_reduced)
y_pred_test = clf.predict(x_test_reduced)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 0.6693333333333333
Accuracy test: 0.672


# ¿Y un RF?

In [18]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 1.0
Accuracy test: 0.712


In [19]:
clf = RandomForestClassifier(n_estimators=80, max_depth=6)

clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 0.7766666666666666
Accuracy test: 0.712


In [20]:
clf = RandomForestClassifier(n_estimators=80,max_depth=11)

clf.fit(x_train_reduced, y_train)

y_pred_train = clf.predict(x_train_reduced)
y_pred_test = clf.predict(x_test_reduced)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 0.9086666666666666
Accuracy test: 0.618


# ¿Y un Catboost?

In [21]:
%%capture
!pip install catboost

In [22]:
from catboost import CatBoostClassifier
clf = CatBoostClassifier(silent=True)
clf = clf.fit(X_train, y_train)


y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 0.8993333333333333
Accuracy test: 0.736


# Gridsearch

In [23]:
from sklearn.model_selection import GridSearchCV
param_grid={
    'C':[0.5 , 1 ,10,100], 
    'gamma':['scale' ,0.1,0.001,0.0001,1],
    'kernel':['rbf']
}

optimal_params = GridSearchCV(SVC() ,param_grid ,cv=5 ,scoring='accuracy',verbose=False)

In [25]:
optimal_params.fit(X_train_scaled , y_train)
print(optimal_params.best_params_)

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}


In [26]:
clf=SVC(C=100 , gamma=0.001 ,kernel='rbf')
clf.fit(X_train_scaled , y_train)

In [27]:
y_pred_train = clf.predict(X_train_scaled)
y_pred_test = clf.predict(X_test_scaled)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 0.748
Accuracy test: 0.702


In [31]:
param_grid={
    'depth':[2 , 4, 8]
}

optimal_params = GridSearchCV(CatBoostClassifier(silent=True) ,param_grid ,cv=5 ,scoring='accuracy', verbose=False)

In [32]:
optimal_params.fit(X_train_scaled , y_train)
print(optimal_params.best_params_)

{'depth': 2}


In [34]:
clf=CatBoostClassifier(silent=True, depth=2)
clf.fit(X_train_scaled , y_train)

<catboost.core.CatBoostClassifier at 0x7f444e7bdb70>

In [35]:
y_pred_train = clf.predict(X_train_scaled)
y_pred_test = clf.predict(X_test_scaled)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 0.7486666666666667
Accuracy test: 0.712
