# PRiAD - Projekt zespołowy "Klasyfikacja wniosków kredytowych"

---

In [188]:
# Import potrzebnych bibliotek
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Zmiana sposobu wyświetlania danych zmiennoprzecinkowych
pd.options.display.float_format = "{:.2f}".format 

In [189]:
# Przydatne funkcje

def setProperColumnNames(data):
    data = data.rename(
        columns = {
            0:'A1', 1:'A2', 2:'A3', 3:'A4', 4:'A5', 5:'A6', 6:'A7', 7:'A8', 8:'A9',
            9:'A10', 10:'A11', 11:'A12', 12:'A13', 13:'A14', 14:'A15', 15:'A16'
        }
    )
    
    return data

def getErrorMatrix(data, testResult):
    return pd.crosstab(data['decisiveSetToTest'], testResult) 


def getTotalErrors(errorMatrix):
    totalErrors = 0

    # Errors above diagonal
    for row in range(0, errorMatrix.shape[0]):
        for column in range(row + 1, errorMatrix.shape[1]):
            totalErrors += errorMatrix.iloc[row, column]

    # Errors below diagonal
    for column in range(0, errorMatrix.shape[1]):
        for row in range(column + 1, errorMatrix.shape[0]):
            totalErrors += errorMatrix.iloc[row, column]

    return totalErrors


In [190]:
# Wczytanie i przygotowanie ramki danych do pracy

# Konwersja .data --> .csv do typu Data Frame
data = pd.read_csv('crx.data', header = None, sep = ',')
data = data.dropna()
data = setProperColumnNames(data)

# Porzucenie wierszy z podziurawionymi danymi
for column in data.columns:
    data = data[data[column] != '?']

data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.00,u,g,w,v,1.25,t,t,1,f,g,00202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,00043,560,+
2,a,24.50,0.50,u,g,q,h,1.50,t,f,0,f,g,00280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,00100,3,+
4,b,20.17,5.62,u,g,w,v,1.71,t,f,0,f,s,00120,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.09,y,p,e,h,1.25,f,f,0,f,g,00260,0,-
686,a,22.67,0.75,u,g,c,v,2.00,f,t,2,t,g,00200,394,-
687,a,25.25,13.50,y,p,ff,ff,2.00,f,t,1,t,g,00200,1,-
688,b,17.92,0.20,u,g,aa,v,0.04,f,f,0,f,g,00280,750,-


In [191]:
# Zakodowanie danych
columnsToEncodeIndexes = [1, 4, 5, 6, 7, 9, 10, 12, 13, 16]
for index in columnsToEncodeIndexes:
    data['A' + str(index)] = data['A' + str(index)].astype('category').cat.codes

data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,1,30.83,0.00,1,0,12,7,1.25,1,1,1,0,0,00202,0,0
1,0,58.67,4.46,1,0,10,3,3.04,1,1,6,0,0,00043,560,0
2,0,24.50,0.50,1,0,10,3,1.50,1,0,0,0,0,00280,824,0
3,1,27.83,1.54,1,0,12,7,3.75,1,1,5,1,0,00100,3,0
4,1,20.17,5.62,1,0,12,7,1.71,1,0,0,0,2,00120,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21.08,10.09,2,2,4,3,1.25,0,0,0,0,0,00260,0,1
686,0,22.67,0.75,1,0,1,7,2.00,0,1,2,1,0,00200,394,1
687,0,25.25,13.50,2,2,5,2,2.00,0,1,1,1,0,00200,1,1
688,1,17.92,0.20,1,0,0,7,0.04,0,0,0,0,0,00280,750,1


In [192]:
# Standaryzacja danych
sc = StandardScaler()

data = sc.fit_transform(data)
data = pd.DataFrame(data)  
data = setProperColumnNames(data)

data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,0.67,-0.06,-0.96,-0.54,-0.55,1.45,0.77,-0.30,0.93,1.13,-0.30,-0.93,-0.30,0.13,-0.19,-1.10
1,-1.49,2.30,-0.07,-0.54,-0.55,0.98,-0.84,0.24,0.93,1.13,0.70,-0.93,-0.30,-0.82,-0.09,-1.10
2,-1.49,-0.59,-0.86,-0.54,-0.55,0.98,-0.84,-0.22,0.93,-0.89,-0.50,-0.93,-0.30,0.59,-0.04,-1.10
3,0.67,-0.31,-0.65,-0.54,-0.55,1.45,0.77,0.45,0.93,1.13,0.50,1.08,-0.30,-0.48,-0.19,-1.10
4,0.67,-0.96,0.16,-0.54,-0.55,1.45,0.77,-0.16,0.93,-0.89,-0.50,-0.93,3.35,-0.36,-0.19,-1.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
648,0.67,-0.88,1.05,1.80,1.81,-0.41,-0.84,-0.30,-1.07,-0.89,-0.50,-0.93,-0.30,0.47,-0.19,0.91
649,-1.49,-0.75,-0.81,-0.54,-0.55,-1.11,0.77,-0.07,-1.07,1.13,-0.10,1.08,-0.30,0.12,-0.12,0.91
650,-1.49,-0.53,1.73,1.80,1.81,-0.18,-1.24,-0.07,-1.07,1.13,-0.30,1.08,-0.30,0.12,-0.19,0.91
651,0.67,-1.15,-0.92,-0.54,-0.55,-1.34,0.77,-0.65,-1.07,-0.89,-0.50,-0.93,-0.30,0.59,-0.05,0.91


In [193]:
# Podział na zbiór testowy i uczący
proportion = 0.15

descriptiveSetToTeach, descriptiveSetToTest, decisiveSetToTeach, decisiveSetToTest = train_test_split(
    data.iloc[:,0:-1], 
    data.iloc[:,-1].astype('category').cat.codes, 
    test_size = proportion
)

dividedData = {
    'descriptiveSetToTeach' : descriptiveSetToTeach, 
    'descriptiveSetToTest' : descriptiveSetToTest, 
    'decisiveSetToTeach' : decisiveSetToTeach, 
    'decisiveSetToTest' : decisiveSetToTest
}


In [194]:
# Zbudowanie klasyfikatora
descAttributesIndexesRange = range(0, 15)

classifier = KNeighborsClassifier(n_neighbors = 5)
classifier.fit(
    dividedData['descriptiveSetToTeach'].iloc[:, descAttributesIndexesRange], 
    dividedData['decisiveSetToTeach']
)
y_pred = classifier.predict(dividedData['descriptiveSetToTest'])


In [195]:
# Wykonanie klasyfikacji
testResult = classifier.predict(
    dividedData['descriptiveSetToTest'].iloc[:, descAttributesIndexesRange]
)


In [196]:
# Wyświetlenie wyniku pracy klasyfikatora
testErrorMatrix = getErrorMatrix(dividedData, testResult)

print('Mistakes matrix for test set (0 = \'+\', 1 = \'-\'):')
print(testErrorMatrix)

totalErrors = getTotalErrors(testErrorMatrix)
print('Number of mistakes: %d' % totalErrors)

accuracy = accuracy_score(dividedData['decisiveSetToTest'], y_pred)
print('Accuracy: ', accuracy)


Mistakes matrix for test set (0 = '+', 1 = '-'):
col_0   0   1
row_0        
0      42   7
1       4  45
Number of mistakes: 11
Accuracy:  0.8877551020408163


In [197]:
# Histogramy wszystkich kolumn
