<a href="https://colab.research.google.com/github/LCaravaggio/AnalisisPredictivo/blob/master/06_SVM/Default_classification_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cargar datos

In [26]:
import json
from google.colab import drive

!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

drive.mount('/content/drive', force_remount=True)
with open("/content/drive/My Drive/kaggle.json", 'r') as f:
    api_token= json.load(f)

with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)

!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d uciml/default-of-credit-card-clients-dataset

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Mounted at /content/drive
Downloading default-of-credit-card-clients-dataset.zip to /content
  0% 0.00/0.98M [00:00<?, ?B/s]
100% 0.98M/0.98M [00:00<00:00, 122MB/s]


In [27]:
import zipfile
import os

os.listdir()

for file in os.listdir():
    if file.endswith('.zip'):
      zip_ref = zipfile.ZipFile(file, 'r')
      zip_ref.extractall()
      zip_ref.close()

In [28]:
import pandas as pd
# Cargamos la base
df = pd.read_csv('/content/UCI_Credit_Card.csv')

In [29]:
df.rename({'default.payment.next.month' : 'Default'} ,axis=1 , inplace=True)
df.drop('ID' , axis=1 , inplace=True)

In [31]:
# Eliminamos nan. Tal vez no sea la mejor manera, pero son pocos
df = df.loc[(df['MARRIAGE']!=0) & (df['EDUCATION']!=0)]

In [33]:
# Vamos a entrenar con menos observaciones
from sklearn.utils import resample

df_default = df[df['Default']==1]
df_no_default = df[df['Default']==0]

df_default_downsampled = resample(df_default , replace=False , n_samples=1000 , random_state=42)
df_no_default_downsampled = resample(df_no_default , replace=False , n_samples=1000 , random_state=42)

df_downsampled = pd.concat([df_default_downsampled,df_no_default_downsampled])

In [35]:
X = df_downsampled.drop('Default' ,axis=1)
Y = df_downsampled['Default']

In [36]:
X_encoded = pd.get_dummies(X, columns = ['SEX' ,
                                         'EDUCATION' ,
                                         'MARRIAGE' ,
                                         'PAY_0',
                                         'PAY_2',
                                         'PAY_3',
                                         'PAY_4',
                                         'PAY_5',
                                         'PAY_6'])

In [42]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train, y_test = train_test_split(X_encoded , Y , random_state=42)

In [43]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

# A entrenar modelos

In [45]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(X_train_scaled , y_train)

In [46]:
from sklearn.metrics import accuracy_score

y_pred_train = clf.predict(X_train_scaled)
y_pred_test = clf.predict(X_test_scaled)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 0.7433333333333333
Accuracy test: 0.726


In [53]:
clf=SVC(C=500 , gamma=0.001 ,kernel='rbf')
clf.fit(X_train_scaled , y_train)

In [54]:
y_pred_train = clf.predict(X_train_scaled)
y_pred_test = clf.predict(X_test_scaled)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 0.7773333333333333
Accuracy test: 0.688


# Probar Reducir la dimensionalidad

In [55]:
from sklearn.manifold import TSNE

tsne=TSNE(n_components=2, init="pca",learning_rate="auto", n_iter=500, n_iter_without_progress=150, n_jobs=2)
x_train_reduced=tsne.fit_transform(X_train_scaled)
x_test_reduced=tsne.fit_transform(X_test_scaled)

In [61]:
clf=SVC()
clf.fit(x_train_reduced , y_train)

In [62]:
y_pred_train = clf.predict(x_train_reduced)
y_pred_test = clf.predict(x_test_reduced)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 0.6833333333333333
Accuracy test: 0.672


# ¿Y un RF?

In [64]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 1.0
Accuracy test: 0.708


In [77]:
clf = RandomForestClassifier(n_estimators=80, max_depth=6)

clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 0.7753333333333333
Accuracy test: 0.704


In [85]:
clf = RandomForestClassifier(n_estimators=80,max_depth=11)

clf.fit(x_train_reduced, y_train)

y_pred_train = clf.predict(x_train_reduced)
y_pred_test = clf.predict(x_test_reduced)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 0.8973333333333333
Accuracy test: 0.632


# ¿Y un Catboost?

In [89]:
%%capture
!pip install catboost

In [90]:
from catboost import CatBoostClassifier
clf = CatBoostClassifier(silent=True)
clf = clf.fit(X_train, y_train)


y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

print('Accuracy train:', accuracy_score(y_train, y_pred_train))
print('Accuracy test:', accuracy_score(y_test, y_pred_test))

Accuracy train: 0.8993333333333333
Accuracy test: 0.736
