In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## FIFA 이적료 예측 시 추가해야 할 부분
#### 평가 기준: RMSE
#### 특성이 많을수록 평가 성능이 올라감 → 다항 특성(PolynomialFeatures) 사용
#### 타깃값을 log로 변환해서 예측한 경우, RMSE로 평가하기 전에 지수 변환(exp)으로 원래 스케일로 되돌린 뒤 평가

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the train, test and submission CSV files from the attached dataset folder.
data_path = '/kaggle/input/fifadataset/'
fifa_train, fifa_test, submission = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('FIFA_train.csv', 'FIFA_test.csv', 'submission.csv')
)

In [None]:
# Stack the training rows on top of the test rows in a single frame with a
# fresh 0..n-1 index, so the preprocessing below is applied to both at once.
all_data = pd.concat(objs=[fifa_train, fifa_test], axis=0, ignore_index=True)

In [None]:
# Encode `contract_until` as a number. Year-only strings become that year;
# full dates become YYYY.MMDD so they order correctly inside their year.
# NOTE: Series.map returns NaN for every value that is not a key below, so
# this cell must be run exactly once, on the raw string column.
contract_until_codes = {
    '2018': 2018,
    '2019': 2019,
    '2020': 2020,
    '2021': 2021,
    '2022': 2022,
    '2023': 2023,
    '2024': 2024,
    '2025': 2025,
    '2026': 2026,
    'Jun 30, 2019': 2019.0630,
    'Dec 31, 2018': 2018.1231,
    'May 31, 2019': 2019.0531,
    'Jun 30, 2020': 2020.0630,
    'Jan 31, 2019': 2019.0131,  # January is month 01 (not 06)
    'Jan 1, 2019': 2019.0101,
    'May 31, 2020': 2020.0531,
    'Jan 12, 2019': 2019.0112,
    'Dec 31, 2019': 2019.1231,
    'Jun 1, 2019': 2019.0601,
}
all_data['contract_until'] = all_data['contract_until'].map(contract_until_codes)
all_data['contract_until']  # show the encoded contract-expiry column

In [None]:
# Show the column names of the combined train+test frame.
all_data.columns

In [None]:
mpl.rc('font', size=10)       # base font size
mpl.rc('axes', titlesize=10)  # subplot title size

# One grid row per feature:
#   (column, frame for the left `value` bar plot,
#    left title, left x-tick rotation, right title, right x-tick rotation)
# None means "do not set it; keep the plotting library's default".
# `contract_until` is plotted from all_data because only that frame carries
# the numeric encoding; every other left panel reads fifa_train.
panel_specs = [
    ('contract_until', all_data, 'fifa_train', 90, 'all_data', 90),
    ('continent', fifa_train, None, None, None, 0),
    ('position', fifa_train, None, None, None, None),
    ('reputation', fifa_train, None, None, None, None),
    ('stat_overall', fifa_train, None, 90, None, None),
    ('stat_potential', fifa_train, None, 90, None, None),
    ('stat_skill_moves', fifa_train, None, None, None, None),
]

# The row count follows the spec table instead of a hard-coded 7.
figure, axes = plt.subplots(nrows=len(panel_specs), ncols=2)
figure.set_size_inches(30, 30)
plt.subplots_adjust(hspace=0.8, wspace=0.2)

for (value_ax, count_ax), spec in zip(axes, panel_specs):
    column, value_frame, value_title, value_rot, count_title, count_rot = spec

    # Left panel: seaborn bar plot of `value` against the feature
    # (aggregated with seaborn's default estimator).
    sns.barplot(x=column, y='value', data=value_frame, ax=value_ax)
    if value_title is not None:
        value_ax.set_title(value_title)
    if value_rot is not None:
        value_ax.tick_params(axis='x', labelrotation=value_rot)

    # Right panel: number of rows per feature value over train + test.
    all_data[column].value_counts().plot(kind='bar', ax=count_ax)
    if count_title is not None:
        count_ax.set_title(count_title)
    if count_rot is not None:
        count_ax.tick_params(axis='x', labelrotation=count_rot)


In [None]:
# Display the combined train+test frame as it stands after the
# contract_until encoding.
all_data

### 제거 할 특성 선택
#### id, name, age, continent, position, prefer_foot

In [None]:
# Columns excluded from the model's feature set.
drop_feature = ['id', 'name', 'age', 'continent', 'position', 'prefer_foot']

# Rebind all_data to a frame without those columns and display it.
# Running this cell a second time raises KeyError: the columns are already gone.
all_data = all_data.drop(columns=drop_feature)
all_data

In [None]:
# Rows with a target value are training rows; rows without one are test rows.
has_target = all_data['value'].notna()

# Feature matrices: the target column is removed from both halves.
X_train = all_data[has_target].drop(columns='value')
X_test = all_data[~has_target].drop(columns='value')

In [None]:
# Target vector: the `value` column of the original training frame.
# NOTE(review): X_train is selected from all_data by non-null `value`, so the
# two line up row-for-row only if fifa_train has no missing `value` — confirm.
y = fifa_train['value']

### 다항 특성 생성

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Polynomial feature expansion of the training features. include_bias=False
# leaves out the constant column; the degree is scikit-learn's default.
poly = PolynomialFeatures(include_bias=False)

# fit_transform fits and transforms in one call, so no separate
# poly.fit(X_train) is needed beforehand.
X_train_p = poly.fit_transform(X_train)
print(X_train_p.shape)

In [None]:
# Apply the expansion that was fitted on X_train. A preprocessing step must
# only be transformed on test data, never re-fitted on it.
X_test_p = poly.transform(X_test)

### 평가식 작성

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred, convertExp=True):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Target values and predicted values.
    convertExp : bool, default True
        When true, np.exp is applied to both inputs before the error is
        computed, i.e. log-scale inputs are scored on the original scale.

    Returns
    -------
    float
        Square root of mean_squared_error(y_true, y_pred).
    """
    if convertExp:
        y_true, y_pred = np.exp(y_true), np.exp(y_pred)

    return np.sqrt(mean_squared_error(y_true, y_pred))

### 모델 생성(랜덤 포레스트)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [None]:
# Wrap rmse as a scikit-learn scorer. greater_is_better=False declares it a
# loss (smaller is better) for model selection. rmse is called with its
# default convertExp=True, so it expects log-scale targets and predictions.
rmse_scorer = metrics.make_scorer(rmse, greater_is_better=False)

In [None]:
# The model is trained on log-transformed targets; rmse_scorer exponentiates
# targets and predictions again before measuring the error.
log_y = np.log(y)

rf_model = RandomForestRegressor()

# Candidate forest sizes, all with the same fixed seed.
rf_params = {
    'random_state': [42],
    'n_estimators': [
        *range(100, 361, 20),    # 100, 120, ..., 360
        *range(400, 501, 20),    # 400, 420, ..., 500
        *range(600, 1001, 100),  # 600, 700, ..., 1000
    ],
}

# 5-fold cross-validated search over the grid, scored with rmse_scorer.
gridsearch_rf_model = GridSearchCV(rf_model, rf_params, scoring=rmse_scorer, cv=5)
gridsearch_rf_model.fit(X_train_p, log_y)
print('최적 하이퍼파라미터 :', gridsearch_rf_model.best_params_)

In [None]:
# 예측
preds = gridsearch_rf_model.best_estimator_.predict(X_train_p)

# 평가
print(f'랜덤 포레스트 회귀 RMSLE 값 : {rmse(log_y, preds, True):.4f}')

### 평가 전 그래프 분포도 확인

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Log-scale predictions for the test set from the best model of the search.
rf_preds = gridsearch_rf_model.best_estimator_.predict(X_test_p)

figure, axes = plt.subplots(ncols=2)
figure.set_size_inches(10, 4)

# Left: training targets. Right: test predictions converted back with exp.
histogram_panels = (
    (y, 'Train Data Distribution'),
    (np.exp(rf_preds), 'Predicted Test Data Distribution'),
)
for panel_ax, (panel_values, panel_title) in zip(axes, histogram_panels):
    sns.histplot(panel_values, bins=30, ax=panel_ax)
    panel_ax.set_title(panel_title)

In [None]:
# submission['value'] = np.exp(rf_preds)
# submission.to_csv('poly_submission01.csv',index=False)