In [1]:
import random
import os

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity='all'

# Fixed variables (configuration)

In [2]:
class CFG:
    """Notebook-wide constants read by the cells below."""

    # Name of the label column the model is trained to predict.
    target = 'Prospect'
    # Seed passed to seed_everything() and to the classifier's random_state.
    user_seed = 42

# Fix the random seed

In [3]:
def seed_everything(seed):
    """Seed Python's and NumPy's global random generators with ``seed``.

    The seed is also exported, as a string, through the ``PYTHONHASHSEED``
    environment variable.

    Args:
        seed: Value handed unchanged to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
    # presumably only influences processes launched afterwards — confirm if
    # hash ordering matters for this notebook.
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)
    
# Seed the generators once, up front, with the configured seed.
seed_everything(CFG.user_seed)

# Load data

In [4]:
# Read the training and test splits from the local data/ directory
# (paths are relative to the notebook's working directory).
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [6]:
# Total number of missing cells across all columns of the training frame.
train.isnull().sum().sum()

0

In [7]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,ID,Age,Height,Weight,Position,PreferredFoot,AttackingWorkRate,DefensiveWorkRate,PaceTotal,ShootingTotal,...,CMRating,RMRating,LWBRating,CDMRating,RWBRating,LBRating,CBRating,RBRating,GKRating,Prospect
0,TRAIN_0000,21,182,82,CB,Right,Medium,Medium,4.6377,2.1053,...,4.9315,4.8649,6.1972,6.0,6.1972,6.4286,6.5278,6.4286,0.875,0
1,TRAIN_0001,21,160,75,ST,Left,High,Medium,7.3913,7.3684,...,5.8904,6.4865,5.2113,4.4286,5.2113,5.0,4.1667,5.0,1.25,0
2,TRAIN_0002,18,165,60,RB,Right,Medium,Medium,6.2319,3.2895,...,4.9315,5.2703,5.7746,5.1429,5.7746,5.8571,5.0,5.8571,0.75,0
3,TRAIN_0003,20,190,72,CB,Right,Medium,Medium,4.7826,1.0526,...,2.3288,2.2973,4.0845,3.7143,4.0845,4.5714,4.5833,4.5714,0.875,0
4,TRAIN_0004,21,181,77,ST,Right,High,Medium,8.2609,7.3684,...,6.3014,7.5676,5.2113,4.2857,5.2113,4.7143,3.3333,4.7143,1.125,0


In [8]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,ID,Age,Height,Weight,Position,PreferredFoot,AttackingWorkRate,DefensiveWorkRate,PaceTotal,ShootingTotal,...,LMRating,CMRating,RMRating,LWBRating,CDMRating,RWBRating,LBRating,CBRating,RBRating,GKRating
0,TEST_0000,17,184,75,GK,Right,Medium,Medium,3.7681,5.3947,...,0.1351,0.5479,0.1351,0.2817,0.4286,0.2817,0.2857,0.2778,0.2857,5.5
1,TEST_0001,19,188,80,ST,Right,High,Medium,4.9275,4.8684,...,4.1892,3.4247,4.1892,3.0986,2.7143,3.0986,3.0,3.0556,3.0,0.75
2,TEST_0002,17,173,68,CAM,Right,High,High,6.9565,6.3158,...,8.3784,8.6301,8.3784,8.3099,8.2857,8.3099,8.1429,7.0833,8.1429,1.125
3,TEST_0003,20,178,69,CAM,Right,Medium,Medium,6.8116,6.4474,...,7.1622,6.7123,7.1622,5.6338,5.1429,5.6338,5.2857,4.1667,5.2857,1.375
4,TEST_0004,19,176,72,CAM,Left,High,Low,5.5072,4.7368,...,5.9459,5.4795,5.9459,5.0704,4.7143,5.0704,4.8571,4.0278,4.8571,0.75


In [10]:
# Column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3019 entries, 0 to 3018
Data columns (total 66 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 3019 non-null   object 
 1   Age                3019 non-null   int64  
 2   Height             3019 non-null   int64  
 3   Weight             3019 non-null   int64  
 4   Position           3019 non-null   object 
 5   PreferredFoot      3019 non-null   object 
 6   AttackingWorkRate  3019 non-null   object 
 7   DefensiveWorkRate  3019 non-null   object 
 8   PaceTotal          3019 non-null   float64
 9   ShootingTotal      3019 non-null   float64
 10  PassingTotal       3019 non-null   float64
 11  DribblingTotal     3019 non-null   float64
 12  DefendingTotal     3019 non-null   float64
 13  PhysicalityTotal   3019 non-null   float64
 14  Crossing           3019 non-null   float64
 15  Finishing          3019 non-null   float64
 16  HeadingAccuracy    3019 

# Data preprocessing

## Label Encoding
- Convert the qualitative (categorical) columns to quantitative integer codes

In [11]:
# String-valued columns that are replaced, in place, by integer codes.
qual_col = ['Position', 'PreferredFoot', 'AttackingWorkRate', 'DefensiveWorkRate']

for column in qual_col:
    # The encoder learns its categories from the training split only.
    le = LabelEncoder().fit(train[column])
    train[column] = le.transform(train[column])

    # Categories that occur only in the test split are appended after the
    # fitted ones, in sorted order, so transform() below does not raise on them.
    # They receive codes the training frame never contains.
    known = set(le.classes_)
    unseen = [label for label in np.unique(test[column]) if label not in known]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)

    test[column] = le.transform(test[column])

print('Done.')

Done.


# X, y split

In [12]:
# Target vector: the label column named in CFG.
y = train[CFG.target]
# Feature matrix: every column except the row identifier and the label.
X = train.drop(columns=['ID', CFG.target])

# Classification model fit

In [13]:
# Random forest with the library's default hyperparameters; only the seed is pinned.
model = RandomForestClassifier(random_state=CFG.user_seed)

# Fit on the whole training frame — no rows are held out for validation here.
model.fit(X, y)

In [14]:
# Test features: the test frame minus its identifier column.
X_test = test.drop(columns='ID')

# Predicted label for every test row.
pred = model.predict(X_test)

# Submit

In [15]:
# Start from the provided submission template and fill in the predicted labels.
# NOTE(review): assumes the template rows are in the same order as
# data/test.csv — TODO confirm.
submit = pd.read_csv('data/sample_submission.csv')
submit[CFG.target] = pred

In [16]:
# Inspect the filled-in submission before writing it to disk.
submit

Unnamed: 0,ID,Prospect
0,TEST_0000,1
1,TEST_0001,1
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,1
...,...,...
1621,TEST_1621,1
1622,TEST_1622,0
1623,TEST_1623,0
1624,TEST_1624,1


In [17]:
# Write the submission file without the DataFrame index column.
submit.to_csv('data/baseline.csv', index=False)