In [70]:
#import all packages needed

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut,KFold
from sklearn.linear_model import LinearRegression
from tensorflow import keras

In [71]:
#import data
data = pd.read_csv('crx.data', sep=",")
print(data)

    A1     A2      A3 A4 A5  A6  A7    A8 A9 A10  A11 A12 A13    A14  A15 A16
0    b  30.83   0.000  u  g   w   v  1.25  t   t    1   f   g  00202    0   +
1    a  58.67   4.460  u  g   q   h  3.04  t   t    6   f   g  00043  560   +
2    a  24.50   0.500  u  g   q   h  1.50  t   f    0   f   g  00280  824   +
3    b  27.83   1.540  u  g   w   v  3.75  t   t    5   t   g  00100    3   +
4    b  20.17   5.625  u  g   w   v  1.71  t   f    0   f   s  00120    0   +
..  ..    ...     ... .. ..  ..  ..   ... ..  ..  ...  ..  ..    ...  ...  ..
685  b  21.08  10.085  y  p   e   h  1.25  f   f    0   f   g  00260    0   -
686  a  22.67   0.750  u  g   c   v  2.00  f   t    2   t   g  00200  394   -
687  a  25.25  13.500  y  p  ff  ff  2.00  f   t    1   t   g  00200    1   -
688  b  17.92   0.205  u  g  aa   v  0.04  f   f    0   f   g  00280  750   -
689  b  35.00   3.375  u  g   c   h  8.29  f   f    0   t   g  00000    0   -

[690 rows x 16 columns]


In [72]:
#Remove incomplete data
for series_name, series in data.items():
    #print(series_name)
    data.drop(data.loc[data[series_name] == "?"].index, inplace=True)
print(data)

    A1     A2      A3 A4 A5  A6  A7    A8 A9 A10  A11 A12 A13    A14  A15 A16
0    b  30.83   0.000  u  g   w   v  1.25  t   t    1   f   g  00202    0   +
1    a  58.67   4.460  u  g   q   h  3.04  t   t    6   f   g  00043  560   +
2    a  24.50   0.500  u  g   q   h  1.50  t   f    0   f   g  00280  824   +
3    b  27.83   1.540  u  g   w   v  3.75  t   t    5   t   g  00100    3   +
4    b  20.17   5.625  u  g   w   v  1.71  t   f    0   f   s  00120    0   +
..  ..    ...     ... .. ..  ..  ..   ... ..  ..  ...  ..  ..    ...  ...  ..
685  b  21.08  10.085  y  p   e   h  1.25  f   f    0   f   g  00260    0   -
686  a  22.67   0.750  u  g   c   v  2.00  f   t    2   t   g  00200  394   -
687  a  25.25  13.500  y  p  ff  ff  2.00  f   t    1   t   g  00200    1   -
688  b  17.92   0.205  u  g  aa   v  0.04  f   f    0   f   g  00280  750   -
689  b  35.00   3.375  u  g   c   h  8.29  f   f    0   t   g  00000    0   -

[653 rows x 16 columns]


In [73]:
#Preprocess the data
for index, row in data.iterrows():
    match data.at[index, "A1"]:
        case 'a':
            data.loc[index, "A1"] = 0
        case 'b':
            data.loc[index, "A1"] = 1
        case _:
            raise ValueError("Error")
        
    match data.at[index, "A4"]:
        case 'u':
            data.loc[index, "A4"] = 0
        case 'y':
            data.loc[index, "A4"] = 1
        case 'l':
            data.loc[index, "A4"] = 2
        case 't':
            data.loc[index, "A4"] = 3    
        case _:
            raise ValueError("Error")
        
    match data.at[index, "A5"]:
        case 'gg':
            data.loc[index, "A5"] = 2
        case 'p':
            data.loc[index, "A5"] = 1
        case 'g':
            data.loc[index, "A5"] = 0
        case _:
            raise ValueError("Error")
        
    match data.at[index, "A6"]:
        case 'c':
            data.loc[index, "A6"] = 0
        case 'd':
            data.loc[index, "A6"] = 1
        case 'cc':
            data.loc[index, "A6"] = 2
        case 'i':
            data.loc[index, "A6"] = 3
        case 'j':
            data.loc[index, "A6"] = 4
        case 'k':
            data.loc[index, "A6"] = 5
        case 'm':
            data.loc[index, "A6"] = 6
        case 'r':
            data.loc[index, "A6"] = 7   
        case 'q':
            data.loc[index, "A6"] = 8
        case 'w':
            data.loc[index, "A6"] = 9
        case 'x':
            data.loc[index, "A6"] = 10
        case 'e':
            data.loc[index, "A6"] = 11
        case 'aa':
            data.loc[index, "A6"] = 12
        case 'ff':
            data.loc[index, "A6"] = 13    
        case _:
            raise ValueError("Error")
        
    match data.at[index, "A7"]:
        case 'v':
            data.loc[index, "A7"] = 0
        case 'h':
            data.loc[index, "A7"] = 1
        case 'bb':
            data.loc[index, "A7"] = 2
        case 'j':
            data.loc[index, "A7"] = 3
        case 'n':
            data.loc[index, "A7"] = 4
        case 'z':
            data.loc[index, "A7"] = 5
        case 'dd':
            data.loc[index, "A7"] = 6
        case 'ff':
            data.loc[index, "A7"] = 7   
        case 'o':
            data.loc[index, "A7"] = 8
        case _:
            raise ValueError("Error")

    match data.at[index, "A9"]:
        case 't':
            data.loc[index, "A9"] = 0
        case 'f':
            data.loc[index, "A9"] = 1
        case _:
            raise ValueError("Error")
        
    match data.at[index, "A10"]:
        case 't':
            data.loc[index, "A10"] = 0
        case 'f':
            data.loc[index, "A10"] = 1
        case _:
            raise ValueError("Error")
        
    match data.at[index, "A12"]:
        case 't':
            data.loc[index, "A12"] = 0
        case 'f':
            data.loc[index, "A12"] = 1
        case _:
            raise ValueError("Error")
        
    match data.at[index, "A13"]:
        case 'g':
            data.loc[index, "A13"] = 0
        case 'p':
            data.loc[index, "A13"] = 1
        case 's':
            data.loc[index, "A13"] = 2
        case _:
            raise ValueError("Error")
    
    match data.at[index, "A16"]:
        case '+':
            data.loc[index, "A16"] = 1
        case '-':
            data.loc[index, "A16"] = 0
        
data = data.astype({'A1': 'int32', 'A2': 'float64','A3': 'int32', 'A4': 'int32', 'A5': 'int32', 
                                  'A6': 'int32', 'A7': 'int32', 'A9': 'int32', 'A10': 'int32', 'A11': 'int32', 
                                  'A12': 'int32', 'A13': 'int32', 'A14': 'int32', 'A16': 'boolean'})

print(data)
print(data.dtypes)

     A1     A2  A3  A4  A5  A6  A7    A8  A9  A10  A11  A12  A13  A14  A15  \
0     1  30.83   0   0   0   9   0  1.25   0    0    1    1    0  202    0   
1     0  58.67   4   0   0   8   1  3.04   0    0    6    1    0   43  560   
2     0  24.50   0   0   0   8   1  1.50   0    1    0    1    0  280  824   
3     1  27.83   1   0   0   9   0  3.75   0    0    5    0    0  100    3   
4     1  20.17   5   0   0   9   0  1.71   0    1    0    1    2  120    0   
..   ..    ...  ..  ..  ..  ..  ..   ...  ..  ...  ...  ...  ...  ...  ...   
685   1  21.08  10   1   1  11   1  1.25   1    1    0    1    0  260    0   
686   0  22.67   0   0   0   0   0  2.00   1    0    2    0    0  200  394   
687   0  25.25  13   1   1  13   7  2.00   1    0    1    0    0  200    1   
688   1  17.92   0   0   0  12   0  0.04   1    1    0    1    0  280  750   
689   1  35.00   3   0   0   0   1  8.29   1    1    0    0    0    0    0   

       A16  
0     True  
1     True  
2     True  
3     True 

In [74]:
#Split the data train vs val
credit_data = data.iloc[:, 0:15]
credit_approval = data["A16"]

credit_data_folds_indexs = []
credit_approval_folds_indexs = []

kFold = KFold(shuffle=True)
kFold.get_n_splits(credit_data)
for i, (train_index, test_index) in enumerate(kFold.split(credit_data)):
    credit_data_folds_indexs.append(train_index)
    credit_approval_folds_indexs.append(test_index)

credit_data_folds = []
credit_approval_folds = []

for fold_num in range(len(credit_approval_folds_indexs)):
    credit_data_fold = pd.DataFrame(columns=['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15'])
    for i in credit_data_folds_indexs[fold_num]:
        credit_data_fold.loc[-1] = credit_data.iloc[i]
        credit_data_fold.index += 1
        credit_data_fold = credit_data_fold.sort_index()
    credit_data_folds.append(credit_data_fold)

    credit_approval_fold = pd.DataFrame(columns=['A16'])
    for i in credit_approval_folds_indexs[fold_num]:
        credit_approval_fold.loc[-1] = credit_approval.iloc[i]
        credit_approval_fold.index += 1
        credit_approval_fold = credit_approval_fold.sort_index()
    credit_approval_folds.append(credit_approval_fold)

[      A1     A2    A3   A4   A5    A6   A7     A8   A9  A10  A11  A12  A13  \
0    0.0  25.25  13.0  1.0  1.0  13.0  7.0  2.000  1.0  0.0  1.0  0.0  0.0   
1    0.0  22.67   0.0  0.0  0.0   0.0  0.0  2.000  1.0  0.0  2.0  0.0  0.0   
2    1.0  40.58   3.0  0.0  0.0   6.0  0.0  3.500  1.0  1.0  0.0  0.0  2.0   
3    1.0  36.42   0.0  1.0  1.0   1.0  0.0  0.585  1.0  1.0  0.0  1.0  0.0   
4    1.0  27.83   1.0  1.0  1.0   1.0  1.0  3.000  1.0  1.0  0.0  1.0  0.0   
..   ...    ...   ...  ...  ...   ...  ...    ...  ...  ...  ...  ...  ...   
517  1.0  20.17   5.0  0.0  0.0   9.0  0.0  1.710  0.0  1.0  0.0  1.0  2.0   
518  1.0  27.83   1.0  0.0  0.0   9.0  0.0  3.750  0.0  0.0  5.0  0.0  0.0   
519  0.0  24.50   0.0  0.0  0.0   8.0  1.0  1.500  0.0  1.0  0.0  1.0  0.0   
520  0.0  58.67   4.0  0.0  0.0   8.0  1.0  3.040  0.0  0.0  6.0  1.0  0.0   
521  1.0  30.83   0.0  0.0  0.0   9.0  0.0  1.250  0.0  0.0  1.0  1.0  0.0   

       A14    A15  
0    200.0    1.0  
1    200.0  394.0  
2 

In [75]:
#Build the model



In [76]:
#Train the model