# ÖDEV 1: PCA yardımı ile Classification,

Bu ödevde "Credit Risk Prediction" veri setini kullanacağız. Amacımız, verinin boyut sayısını düşürerek olabildiğince yüksek accuracy değerini alabilmek. Aşağıda verinin okunma ve temizlenme kısmını hazırlayıp vereceğim. Devamında ise yapmanız gerekenler:

1. PCA kullanarak verinin boyutunu düşürmek
    * Önce explained varience ratio değerini inceleyerek veriyi kaç boyuta düşürebileceğini kontrol et.
    * Daha sonra farklı boyutlarda denemeler yaparak boyutu düşürülmüş verileri elde et.
2. Classification modellerini dene
    * Logistic Regression
    * Random Forest
    * ve eğer istersen herhangi bir modelle daha

İsteğe bağlı olarak, verinin boyutunu düşürmek için diğer yöntemleri de kullanıp en yüksek accuracy değerini almayı deneyebilirsin.

In [210]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [211]:
df: pd.DataFrame = pd.read_csv('./credit_risk_dataset.csv')

In [212]:
print(df.isnull().sum())

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64


In [None]:
# Null değerleri sütun ortalaması ile dolduruyoruz
df["person_emp_length"].fillna(df["person_emp_length"].median(), inplace=True)
df["loan_int_rate"].fillna(df["loan_int_rate"].median(), inplace=True)

In [None]:
df.duplicated().sum()

165

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
person_age,32416.0,27.747008,6.3541,20.0,23.0,26.0,30.0,144.0
person_income,32416.0,66091.640826,62015.580269,4000.0,38542.0,55000.0,79218.0,6000000.0
person_emp_length,32416.0,4.76888,4.090411,0.0,2.0,4.0,7.0,123.0
loan_amnt,32416.0,9593.845632,6322.730241,500.0,5000.0,8000.0,12250.0,35000.0
loan_int_rate,32416.0,11.014662,3.08305,5.42,8.49,10.99,13.11,23.22
loan_status,32416.0,0.218688,0.413363,0.0,0.0,0.0,0.0,1.0
loan_percent_income,32416.0,0.17025,0.106812,0.0,0.09,0.15,0.23,0.83
cb_person_cred_hist_length,32416.0,5.811297,4.05903,2.0,3.0,4.0,8.0,30.0


In [None]:
# Outlier temizliği
df = df[df['person_age']<=100]
df = df[df['person_emp_length'] <= 60]
df = df[df['person_income']<=4e6]

In [None]:
# Kategorik verileri alıyoruz ve one hot encoding haline getiriyoruz
cat_cols = pd.DataFrame(df[df.select_dtypes(include=['object']).columns])
cat_cols.columns

Index(['person_home_ownership', 'loan_intent', 'loan_grade',
       'cb_person_default_on_file'],
      dtype='object')

In [None]:
encoded_cat_cols = pd.get_dummies(cat_cols)
df.drop(df.select_dtypes(include=['object']).columns, axis=1,inplace=True)
df = pd.concat([df,encoded_cat_cols], axis=1)

In [None]:
X = df.drop('loan_status', axis=1).values
y = df['loan_status'].values

In [None]:
# Verileri train ve test olarak ikiye ayırıyoruz

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(1, test_size=0.1)
train_idx, test_idx = next(split.split(X, y))
train_x = X[train_idx]
test_x = X[test_idx]

train_y = y[train_idx]
test_y = y[test_idx]

## Kolay gelsin!

In [213]:
X.shape

(32409, 26)

In [256]:
from sklearn.decomposition import PCA
n_components = 26
pca = PCA(n_components=n_components)
components = pca.fit_transform(X)

In [257]:
pca.components_

array([[ 1.66046783e-05,  9.99254947e-01,  1.17780504e-05,
         3.85946944e-02, -1.81909023e-08, -5.92622383e-07,
         9.47124176e-06,  2.21338548e-06,  1.26873388e-08,
        -2.30515410e-07, -1.99555741e-06,  3.53149822e-08,
        -1.28190361e-07,  3.09339114e-07, -2.98771873e-07,
         5.20145246e-08,  3.02936123e-08,  7.87708060e-08,
         5.07886534e-08, -1.36190731e-07, -8.65060290e-08,
         5.47225291e-08,  3.04328342e-08,  7.98193767e-09,
         1.69441192e-08, -1.69441192e-08],
       [-7.17257027e-06,  3.85946949e-02, -4.35518368e-05,
        -9.99254943e-01, -7.58994580e-05, -1.25380757e-05,
        -1.98395072e-06, -4.79834427e-06, -9.32068099e-08,
         5.52711838e-07,  4.33883924e-06,  4.78966759e-08,
         2.45336416e-07, -1.46028334e-06,  7.80070198e-07,
         2.33073062e-07,  1.53906986e-07,  9.88202251e-06,
        -3.40607609e-06,  1.65482330e-06, -4.16960792e-06,
        -2.59411711e-06, -9.72127098e-07, -3.94917597e-07,
         2.68

In [258]:
components.shape

(32409, 26)

In [259]:
pca.explained_variance_ratio_ #Explanined varience ratio, her bir componentin bize ne kadar bilgi verdiğini gösterir..

array([9.87172378e-01, 1.28275928e-02, 1.84908111e-08, 5.38665689e-09,
       3.36822929e-09, 9.91999801e-10, 1.45298012e-10, 1.26578503e-10,
       6.87390238e-11, 6.80322915e-11, 6.49009692e-11, 6.20949120e-11,
       5.91916321e-11, 5.05450070e-11, 4.17747291e-11, 3.70992460e-11,
       2.14124883e-11, 7.92954529e-12, 2.74169803e-12, 1.63947111e-12,
       1.54524841e-12, 7.78274858e-13, 9.66102379e-33, 9.66102379e-33,
       9.66102379e-33, 9.66102379e-33])

In [260]:
1 - pca.explained_variance_ratio_.sum() #Aradaki veri kaybını hesaplamak için 1 den çıkarıp kaybettiğimiz veri oranını gördük..

-2.220446049250313e-16

In [None]:
# n_component değerini boş bırakırsak..
pca = PCA()
pca.fit(X)
pca.components_

In [244]:
pca.explained_variance_ratio_

array([9.87172378e-01, 1.28275928e-02, 1.84908111e-08, 5.38665689e-09,
       3.36822929e-09, 9.91999801e-10, 1.45298012e-10, 1.26578503e-10,
       6.87390238e-11, 6.80322915e-11, 6.49009692e-11, 6.20949120e-11,
       5.91916321e-11, 5.05450070e-11, 4.17747291e-11, 3.70992460e-11,
       2.14124883e-11, 7.92954529e-12, 2.74169803e-12, 1.63947111e-12,
       1.54524841e-12, 7.78274858e-13, 9.66102379e-33, 9.66102379e-33,
       9.66102379e-33, 9.66102379e-33])

In [262]:
np.cumsum(pca.explained_variance_ratio_)

array([0.98717238, 0.99999997, 0.99999999, 0.99999999, 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        ])

In [263]:
pca = PCA(n_components=0.99)
pca.fit(X)
pca.components_

array([[ 1.66046783e-05,  9.99254947e-01,  1.17780504e-05,
         3.85946944e-02, -1.81909023e-08, -5.92622383e-07,
         9.47124176e-06,  2.21338548e-06,  1.26873388e-08,
        -2.30515410e-07, -1.99555741e-06,  3.53149822e-08,
        -1.28190361e-07,  3.09339114e-07, -2.98771873e-07,
         5.20145246e-08,  3.02936123e-08,  7.87708060e-08,
         5.07886534e-08, -1.36190731e-07, -8.65060290e-08,
         5.47225291e-08,  3.04328342e-08,  7.98193767e-09,
         1.69441192e-08, -1.69441192e-08],
       [-7.17257027e-06,  3.85946949e-02, -4.35518368e-05,
        -9.99254943e-01, -7.58994580e-05, -1.25380757e-05,
        -1.98395072e-06, -4.79834427e-06, -9.32068099e-08,
         5.52711838e-07,  4.33883924e-06,  4.78966759e-08,
         2.45336416e-07, -1.46028334e-06,  7.80070198e-07,
         2.33073062e-07,  1.53906986e-07,  9.88202251e-06,
        -3.40607609e-06,  1.65482330e-06, -4.16960792e-06,
        -2.59411711e-06, -9.72127098e-07, -3.94917597e-07,
         2.68

In [264]:
pca.explained_variance_ratio_

array([0.98717238, 0.01282759])

In [265]:
1 - pca.explained_variance_ratio_.sum()

2.89979980028221e-08

In [266]:
pca = PCA(n_components=0.2)
pca.fit(X)
pca.components_

array([[ 1.66046783e-05,  9.99254947e-01,  1.17780504e-05,
         3.85946944e-02, -1.81909023e-08, -5.92622383e-07,
         9.47124176e-06,  2.21338548e-06,  1.26873388e-08,
        -2.30515410e-07, -1.99555741e-06,  3.53149822e-08,
        -1.28190361e-07,  3.09339114e-07, -2.98771873e-07,
         5.20145246e-08,  3.02936123e-08,  7.87708060e-08,
         5.07886534e-08, -1.36190731e-07, -8.65060290e-08,
         5.47225291e-08,  3.04328342e-08,  7.98193767e-09,
         1.69441192e-08, -1.69441192e-08]])

In [267]:
pca.explained_variance_ratio_

array([0.98717238])

In [268]:
1 - pca.explained_variance_ratio_.sum()

0.012827621799865185

In [287]:
train_x = pca.fit_transform(train_x)
test_x = pca.fit_transform(test_x)

In [288]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [289]:
logreg = LogisticRegression()
logreg.fit(train_x,train_y)

LogisticRegression()

In [290]:
pred_y = pd.Series(logreg.predict(test_x))


In [291]:
accuracy_score(test_y, pred_y)

0.41808083924714595

In [297]:
from sklearn.ensemble import RandomForestClassifier
rF_clf=RandomForestClassifier()
rF_clf.fit(train_x, train_y)

RandomForestClassifier()

In [298]:
pred_y = rF_clf.predict(test_x)

In [299]:
accuracy_score(test_y, pred_y)

0.7497685899413761