In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv("credit_train.csv")

In [3]:
numeric_col = ['Gender', 'Own_car', 'Own_property', 'Work_phone', 'Phone', 'Email', 'Unemployed', 'Num_children', 'Num_family', 'Account_length', 'Total_income', 'Age', 'Years_employed', 'Income_type', 'Education_type', 'Family_status', 'Housing_type', 'Occupation_type', 'Target']
corr = dataset.loc[:,numeric_col].corr()
print(corr)

                  Gender   Own_car  Own_property  Work_phone     Phone  \
Gender          1.000000  0.370909     -0.059527    0.064589 -0.018928   
Own_car         0.370909  1.000000     -0.014759    0.032819  0.012867   
Own_property   -0.059527 -0.014759      1.000000   -0.195787 -0.049429   
Work_phone      0.064589  0.032819     -0.195787    1.000000  0.278749   
Phone          -0.018928  0.012867     -0.049429    0.278749  1.000000   
Email           0.002085  0.013726      0.058535   -0.040322  0.012881   
Unemployed     -0.165863 -0.141824      0.096055   -0.241463 -0.002168   
Num_children    0.052749  0.092994     -0.007515    0.057026 -0.023793   
Num_family      0.077156  0.137278     -0.001529    0.068694 -0.013760   
Account_length -0.006500  0.027380     -0.003275    0.006799  0.015533   
Total_income    0.195058  0.227127      0.018070   -0.031986  0.025089   
Age            -0.174751 -0.131240      0.145752   -0.177178  0.040721   
Years_employed -0.039609  0.009837    

In [4]:
dataset.shape

(7767, 20)

In [5]:
dataset.head()

Unnamed: 0,id,Gender,Own_car,Own_property,Work_phone,Phone,Email,Unemployed,Num_children,Num_family,Account_length,Total_income,Age,Years_employed,Income_type,Education_type,Family_status,Housing_type,Occupation_type,Target
0,0,0,0,1,0,0,0,0,1,3,10,238500.0,33.731014,10.185014,Working,Secondary / secondary special,Married,House / apartment,Core staff,0
1,1,0,0,1,0,0,0,0,1,3,9,157500.0,43.149414,5.087031,Working,Secondary / secondary special,Married,Municipal apartment,Sales staff,1
2,2,0,0,1,0,0,1,1,0,1,11,67500.0,64.871969,0.0,Pensioner,Higher education,Single / not married,Municipal apartment,Other,0
3,3,1,0,1,0,0,0,0,0,1,1,148500.0,32.359323,2.633867,Working,Secondary / secondary special,Single / not married,House / apartment,Laborers,0
4,4,1,1,1,0,0,0,0,0,2,48,225000.0,36.833063,11.173398,Commercial associate,Higher education,Married,House / apartment,Laborers,0


In [6]:
dataset['Income_type'], uniques_It = pd.factorize(dataset['Income_type'])
dataset['Education_type'], uniques_Et = pd.factorize(dataset['Education_type'])
dataset['Family_status'],uniques_Fs = pd.factorize(dataset['Family_status'])
dataset['Housing_type'], uniques_Ht = pd.factorize(dataset['Housing_type'])
dataset['Occupation_type'], uniques_Ot = pd.factorize(dataset['Occupation_type'])

In [7]:
uniques_It

Index(['Working', 'Pensioner', 'Commercial associate', 'State servant',
       'Student'],
      dtype='object')

In [8]:
X = dataset.drop('Target', axis=1).drop('id', axis=1)
y = dataset['Target']

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [10]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

DecisionTreeClassifier()

In [11]:
y_pred = classifier.predict(X_test)

In [12]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average='weighted')

0.7605182402479701

In [13]:
from sklearn.model_selection import GridSearchCV
GSCV = GridSearchCV( classifier, {'max_depth': range(3,10), 'min_samples_leaf': range(1,10), 'min_samples_split': range(2, 10)}, cv = 3, scoring = 'neg_mean_absolute_error')
GSCV.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(3, 10),
                         'min_samples_leaf': range(1, 10),
                         'min_samples_split': range(2, 10)},
             scoring='neg_mean_absolute_error')

In [14]:
GSCV.best_estimator_

DecisionTreeClassifier(max_depth=3, min_samples_leaf=8)

In [15]:
classifier_1 = DecisionTreeClassifier(max_depth=4, min_samples_leaf=6)
classifier_1.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=4, min_samples_leaf=6)

In [16]:
y_pred_1 = classifier_1.predict(X_test)

In [17]:
f1_score(y_test, y_pred_1, average='weighted')

0.8043800871794743

In [18]:
import warnings
warnings.filterwarnings("ignore")

In [19]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_1))

              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1343
           1       1.00      0.01      0.02       211

    accuracy                           0.87      1554
   macro avg       0.93      0.50      0.47      1554
weighted avg       0.88      0.87      0.80      1554



In [20]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [21]:
y_pred_2 = model.predict(X_test)

In [22]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_2))

              precision    recall  f1-score   support

           0       0.86      1.00      0.93      1343
           1       0.33      0.00      0.01       211

    accuracy                           0.86      1554
   macro avg       0.60      0.50      0.47      1554
weighted avg       0.79      0.86      0.80      1554



In [23]:
from sklearn.model_selection import GridSearchCV
GSCV = GridSearchCV( model, {'max_depth': range(3,10), 'min_samples_leaf': range(1,10), 'min_samples_split': range(2, 10)}, cv = 3, scoring = 'neg_mean_absolute_error')
GSCV.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': range(3, 10),
                         'min_samples_leaf': range(1, 10),
                         'min_samples_split': range(2, 10)},
             scoring='neg_mean_absolute_error')

In [24]:
GSCV.best_estimator_

RandomForestClassifier(max_depth=3)

In [25]:
from sklearn.ensemble import RandomForestClassifier
model_1 = RandomForestClassifier(max_depth=5, min_samples_split=7)
model_1.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, min_samples_split=7)

In [26]:
y_pred_3 = model.predict(X_test)

In [27]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_3))

              precision    recall  f1-score   support

           0       0.86      1.00      0.93      1343
           1       0.33      0.00      0.01       211

    accuracy                           0.86      1554
   macro avg       0.60      0.50      0.47      1554
weighted avg       0.79      0.86      0.80      1554



In [28]:
f1_score(y_test, y_pred_3, average='micro')

0.8635778635778636

In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred_3)

0.8635778635778636

In [30]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

In [31]:
bdt = BaggingRegressor(DecisionTreeClassifier()).fit(X_train, y_train)
bdt_predict = bdt.predict(X_test)

In [32]:
for i in range(len(bdt_predict)):
    if bdt_predict[i] <= 0.9:
        bdt_predict[i] = 0
    else:
        bdt_predict[i] = 1
accuracy_score(y_test, bdt_predict)

0.8642213642213642

In [33]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

def pipeline_types():
    numeric_features = [
        'Gender', 
        'Own_car', 
        'Own_property',
        'Work_phone',
        'Phone', 
        'Email',
        'Unemployed', 
        'Num_children', 
        'Num_family',
        'Account_length',
        'Age',
        'Years_employed',
    ]
    categorical_features = [
        'Income_type',
        'Education_type',
        'Family_status',
        'Housing_type',
        'Occupation_type',
    ]

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent'))
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    transformer = ColumnTransformer(transformers=[
        ('numeric_data_preprocessing', numeric_transformer, numeric_features),
        ('categorical_data_preprocessing', categorical_transformer, categorical_features)
    ])
    return Pipeline(steps=[
        ('transformer', transformer),
        ('rf_estimator', RandomForestClassifier()),
    ])

In [34]:
dataset_1 = pd.read_csv("credit_train.csv")

In [35]:
X_d1 = dataset_1.drop('Target', axis=1).drop('id', axis=1)
y_d1 = dataset_1['Target']

In [36]:
X_train_d1, X_test_d1, y_train_d1, y_test_d1 = train_test_split(X_d1, y_d1, test_size=0.20)

In [37]:
pipeline = pipeline_types()

pipeline.fit(X_train_d1, y_train_d1)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('numeric_data_preprocessing',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent'))]),
                                                  ['Gender', 'Own_car',
                                                   'Own_property', 'Work_phone',
                                                   'Phone', 'Email',
                                                   'Unemployed', 'Num_children',
                                                   'Num_family',
                                                   'Account_length', 'Age',
                                                   'Years_employed']),
                                                 ('categorical_data_preprocessing',
                                                  Pipeline(steps=[('imputer',
                      

In [38]:
y_pred_s1 = model.predict(X_test_d1)

ValueError: could not convert string to float: 'State servant'

In [None]:
print(classification_report(y_test_d1, y_pred_s1))

In [None]:
f1_score(y_test_d1, y_pred_s1, average='micro')

In [None]:
test_input = pd.read_csv("credit_test.csv")

prediction = pipeline.predict(test_input)

result = pd.DataFrame()

result['Target'] = prediction
result.to_csv('result.csv')

In [None]:
prediction