In [2]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install datetime
!pip install sklearn
!pip install seaborn
!pip install scikit-learn

Collecting pandas
  Downloading pandas-2.1.4-cp39-cp39-macosx_10_9_x86_64.whl.metadata (18 kB)
Collecting numpy<2,>=1.22.4 (from pandas)
  Downloading numpy-1.26.3-cp39-cp39-macosx_10_9_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2023.3.post1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.4-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.1.4-cp39-cp39-macosx_10_9_x86_64.whl (11.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading numpy-1.26.3-cp39-cp39-macosx_10_9_x86_64.whl (20.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.6/20.6 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached pytz-2023.3.pos

In [3]:
%matplotlib inline

import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import datetime


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import sklearn.metrics
from sklearn.metrics import roc_auc_score


import warnings
warnings.filterwarnings('ignore')

Load training and test data

In [12]:
# Read the training and test tables from the current working directory.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Show the first rows of the training table.
df_train.head()

Unnamed: 0,Deal_id,Deal_date,First_deal_date,Secret_dwarf_info_1,Secret_dwarf_info_2,Secret_dwarf_info_3,First_default_date,Successful_deals_count,Region,Tavern,Hashed_deal_detail_1,Hashed_deal_detail_2,Hashed_deal_detail_3,Hashed_deal_detail_4,Hashed_deal_detail_5,Hashed_deal_detail_6,Age,Gender,Default
0,22487461,2015-11-05,2015-08-29,,,,,0.0,Tavern_district_3,7,2.5,-3,8,2.5,-3,5,36.0,Male,0
1,62494261,2016-08-26,2015-12-21,3.5,-2.0,5.0,2016-07-30,2.0,Tavern_district_4,7,2.5,-3,14,3.5,-3,5,29.0,Female,1
2,34822849,2016-02-18,2015-11-11,,,,,0.0,Tavern_district_6,7,2.5,-3,8,2.5,-3,5,56.0,Female,0
3,46893387,2016-04-30,2016-03-22,,,,,0.0,Tavern_district_2,13,2.5,-2,5,2.5,-3,5,27.0,Female,0
4,67128275,2016-09-19,2016-07-21,,,,,0.0,Tavern_district_4,39,2.5,-3,7,2.5,-3,5,37.0,Female,0


### Data preparation

In [13]:
### Let's write a function that fills in the gaps in the data

def fill_missing_data(data):
    """Fill missing values of ``data`` in place.

    The following columns are imputed, each from statistics of ``data`` itself:

    * ``Secret_dwarf_info_1..3`` -- column mean.
    * ``Region`` -- most frequent value (left untouched if the column has no
      non-missing values, because no mode exists then).
    * ``First_default_date`` -- the fixed sentinel string ``'2014-05-01'``.
    * ``Successful_deals_count`` -- ``0``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the columns listed above. It is modified in place.

    Returns
    -------
    None
    """
    ### Hashed information on gnomes - mean
    for feature in (1, 2, 3):
        col_name = f"Secret_dwarf_info_{feature}"
        data[col_name] = data[col_name].fillna(data[col_name].mean())

    ### Region - mode
    region_modes = data['Region'].mode()
    # mode() is empty when every Region is missing; there is nothing to fill with.
    if not region_modes.empty:
        data['Region'] = data['Region'].fillna(region_modes.iloc[0])

    ### The first default date is filled with an "outlier" date so that rows
    ### with a blank in this column are separated from the rest in feature space.
    ### The sentinel is fixed (2015-05-01 minus 365 days = 2014-05-01) and does
    ### not depend on the dates present in ``data``.
    date_for_missing_values = (
        datetime.date(2015, 5, 1) - datetime.timedelta(days=365)
    )
    data['First_default_date'] = data['First_default_date'].fillna(
        date_for_missing_values.isoformat()
    )

    ### Successful deals - 0
    data['Successful_deals_count'] = data['Successful_deals_count'].fillna(0)

    return None

Let's apply the function on training and test data and make sure there are no gaps in the data

In [14]:
# Impute both tables in place.
fill_missing_data(df_train)
fill_missing_data(df_test)

# Total number of remaining missing cells in (train, test).
tuple(table.isna().sum().sum() for table in (df_train, df_test))

(0, 0)

Let's write a function that generates new features based on the basic ones

In [15]:
def create_new_features(data):
    """Replace the three date columns of ``data`` with numeric features, in place.

    For each of ``First_deal_date``, ``Deal_date`` and ``First_default_date``
    the column is parsed with ``pd.to_datetime`` and split into
    ``<prefix>_year``, ``<prefix>_month`` and ``<prefix>_day``. A ``Difference``
    column holds the number of days from the first deal to the first default.
    The original date columns are dropped afterwards, so calling the function
    a second time on the same frame raises ``KeyError``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the three date columns. It is modified in place.

    Returns
    -------
    None
    """
    ### Let's parse dates into year-month-day
    # The prefix order fixes the order of the generated columns.
    for prefix in ('First_deal', 'Deal', 'First_default'):
        date_col = f'{prefix}_date'
        data[date_col] = pd.to_datetime(data[date_col])
        # Vectorised .dt accessors instead of a Python-level lambda per row.
        data[f'{prefix}_year'] = data[date_col].dt.year
        data[f'{prefix}_month'] = data[date_col].dt.month
        data[f'{prefix}_day'] = data[date_col].dt.day

    ### Let's create the feature "time from the first transaction to the first delay" in days
    data['Difference'] = (
        data['First_default_date'] - data['First_deal_date']
    ).dt.days

    ### Delete old columns with date
    data.drop(
        columns=['First_deal_date', 'Deal_date', 'First_default_date'],
        inplace=True,
    )

    return None

Let's apply the function and make sure that everything is transformed exactly as we planned

In [16]:
# Expand the date columns of both tables into numeric features (in place).
create_new_features(df_train)
create_new_features(df_test)

# Inspect the first five rows of the transformed training table.
df_train.head(5)

Unnamed: 0,Deal_id,Secret_dwarf_info_1,Secret_dwarf_info_2,Secret_dwarf_info_3,Successful_deals_count,Region,Tavern,Hashed_deal_detail_1,Hashed_deal_detail_2,Hashed_deal_detail_3,Hashed_deal_detail_4,Hashed_deal_detail_5,Hashed_deal_detail_6,Age,Gender,Default,First_deal_year,First_deal_month,First_deal_day,Deal_year,Deal_month,Deal_day,First_default_year,First_default_month,First_default_day,Difference
0,22487461,3.935514,-2.299065,5.26729,0.0,Tavern_district_3,7,2.5,-3,8,2.5,-3,5,36.0,Male,0,2015,8,29,2015,11,5,2014,5,1,-485
1,62494261,3.5,-2.0,5.0,2.0,Tavern_district_4,7,2.5,-3,14,3.5,-3,5,29.0,Female,1,2015,12,21,2016,8,26,2016,7,30,222
2,34822849,3.935514,-2.299065,5.26729,0.0,Tavern_district_6,7,2.5,-3,8,2.5,-3,5,56.0,Female,0,2015,11,11,2016,2,18,2014,5,1,-559
3,46893387,3.935514,-2.299065,5.26729,0.0,Tavern_district_2,13,2.5,-2,5,2.5,-3,5,27.0,Female,0,2016,3,22,2016,4,30,2014,5,1,-691
4,67128275,3.935514,-2.299065,5.26729,0.0,Tavern_district_4,39,2.5,-3,7,2.5,-3,5,37.0,Female,0,2016,7,21,2016,9,19,2014,5,1,-812


Let's write a function that transforms object columns into a real/discrete (numeric) format.
We won't transform the other columns that are numeric but categorical in nature, since we plan to use tree ensembles as models

Let's calculate the target averages on the training dataset only, because the target is not available in the test data

In [17]:
# Mean of the Default column per Gender value, computed on the training table.
mean_gender = df_train.groupby('Gender')['Default'].mean()

def transform_object_cols(data, means=mean_gender):
    """Encode the object columns ``Gender`` and ``Region`` of ``data`` in place.

    ``Gender`` is replaced by the value looked up in ``means``; values that
    are not present in the index of ``means`` become NaN. ``Region`` is
    replaced by dummy columns built from the categories present in ``data``
    (first category dropped), so two tables with different sets of regions
    end up with different columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``Gender`` and ``Region``. It is modified in place.
    means : pandas.Series, optional
        Mapping from a ``Gender`` value to its numeric encoding. Defaults to
        ``mean_gender`` as it was when this function was defined.

    Returns
    -------
    None
    """
    # Use the mapping passed by the caller rather than always reading the global.
    data['Gender'] = data['Gender'].map(means)

    dummy = pd.get_dummies(data['Region'], drop_first=True)
    data.drop(columns='Region', inplace=True)

    data[dummy.columns] = dummy

    return None

Let's apply the function and make sure that everything is transformed exactly as we planned

In [18]:
# Encode Gender and Region in both tables (in place).
transform_object_cols(df_train)
transform_object_cols(df_test)

# Inspect the first five rows of the encoded training table.
df_train.head(5)

Unnamed: 0,Deal_id,Secret_dwarf_info_1,Secret_dwarf_info_2,Secret_dwarf_info_3,Successful_deals_count,Tavern,Hashed_deal_detail_1,Hashed_deal_detail_2,Hashed_deal_detail_3,Hashed_deal_detail_4,Hashed_deal_detail_5,Hashed_deal_detail_6,Age,Gender,Default,First_deal_year,First_deal_month,First_deal_day,Deal_year,Deal_month,Deal_day,First_default_year,First_default_month,First_default_day,Difference,Tavern_district_1,Tavern_district_2,Tavern_district_3,Tavern_district_4,Tavern_district_5,Tavern_district_6,Tavern_district_7
0,22487461,3.935514,-2.299065,5.26729,0.0,7,2.5,-3,8,2.5,-3,5,36.0,0.168565,0,2015,8,29,2015,11,5,2014,5,1,-485,False,False,True,False,False,False,False
1,62494261,3.5,-2.0,5.0,2.0,7,2.5,-3,14,3.5,-3,5,29.0,0.101502,1,2015,12,21,2016,8,26,2016,7,30,222,False,False,False,True,False,False,False
2,34822849,3.935514,-2.299065,5.26729,0.0,7,2.5,-3,8,2.5,-3,5,56.0,0.101502,0,2015,11,11,2016,2,18,2014,5,1,-559,False,False,False,False,False,True,False
3,46893387,3.935514,-2.299065,5.26729,0.0,13,2.5,-2,5,2.5,-3,5,27.0,0.101502,0,2016,3,22,2016,4,30,2014,5,1,-691,False,True,False,False,False,False,False
4,67128275,3.935514,-2.299065,5.26729,0.0,39,2.5,-3,7,2.5,-3,5,37.0,0.101502,0,2016,7,21,2016,9,19,2014,5,1,-812,False,False,False,True,False,False,False


### Training model

In [19]:
### Let's separate targets and features

# Target vector taken from the training table.
Y_train = df_train['Default']

# Feature matrices: drop the deal identifier (and the target for the training table).
X_train = df_train.drop(columns=['Default', 'Deal_id'])
X_test = df_test.drop(columns=['Deal_id'])

In [20]:
from sklearn.ensemble import RandomForestClassifier

max_test_score = 0

# Random forest with library defaults; only the seed is fixed.
model = RandomForestClassifier(random_state=472)

# Fit on the full training table; the fitted estimator is the cell's output.
model.fit(X_train, Y_train)

Even such a simple model is enough to beat the baseline for this problem. `random_state` is the only Random Forest parameter that was selected using validation.

You can significantly improve the results by, for example, adding more features and by testing the remaining hyperparameters and other models.

### Saving the predictions to a CSV file in the required format

In [21]:
# One row per test deal: its id plus the second column of predict_proba.
submission = df_test[['Deal_id']].assign(
    Prediction=model.predict_proba(X_test)[:, 1]
)

submission.head()

Unnamed: 0,Deal_id,Prediction
0,72875713,0.1
1,75825544,0.09
2,81809181,0.24
3,87083256,0.19
4,84651519,0.29


In [22]:
# Write the submission table without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)