In [1]:
import sys
sys.version

'3.6.7 |Anaconda, Inc.| (default, Oct 28 2018, 19:44:12) [MSC v.1915 64 bit (AMD64)]'

In [100]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm_notebook

In [67]:
data = pd.read_csv('./data/train_V2.csv')
train_df, test_df = train_test_split(data, train_size = 0.7)

In [68]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3112876 entries, 3300836 to 323582
Data columns (total 29 columns):
Id                 object
groupId            object
matchId            object
assists            int64
boosts             int64
damageDealt        float64
DBNOs              int64
headshotKills      int64
heals              int64
killPlace          int64
killPoints         int64
kills              int64
killStreaks        int64
longestKill        float64
matchDuration      int64
matchType          object
maxPlace           int64
numGroups          int64
rankPoints         int64
revives            int64
rideDistance       float64
roadKills          int64
swimDistance       float64
teamKills          int64
vehicleDestroys    int64
walkDistance       float64
weaponsAcquired    int64
winPoints          int64
winPlacePerc       float64
dtypes: float64(6), int64(19), object(4)
memory usage: 712.5+ MB


In [69]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1334090 entries, 534022 to 40222
Data columns (total 29 columns):
Id                 1334090 non-null object
groupId            1334090 non-null object
matchId            1334090 non-null object
assists            1334090 non-null int64
boosts             1334090 non-null int64
damageDealt        1334090 non-null float64
DBNOs              1334090 non-null int64
headshotKills      1334090 non-null int64
heals              1334090 non-null int64
killPlace          1334090 non-null int64
killPoints         1334090 non-null int64
kills              1334090 non-null int64
killStreaks        1334090 non-null int64
longestKill        1334090 non-null float64
matchDuration      1334090 non-null int64
matchType          1334090 non-null object
maxPlace           1334090 non-null int64
numGroups          1334090 non-null int64
rankPoints         1334090 non-null int64
revives            1334090 non-null int64
rideDistance       1334090 non-null 

In [70]:
# 회귀분석을 하기 위해 문자열을 범주값으로 변환
matchTyp = {'squad-fpp': 0, 'duo': 1, 'solo-fpp': 2, 'squad': 3, 'duo-fpp': 4, 'solo': 5,
       'normal-squad-fpp': 6, 'crashfpp': 7, 'flaretpp': 8, 'normal-solo-fpp': 9,
       'flarefpp': 10, 'normal-duo-fpp': 11, 'normal-duo': 12, 'normal-squad': 13,
       'crashtpp': 14, 'normal-solo': 15 }

train_df['matchType'] = train_df['matchType'].replace(matchTyp)
test_df['matchType'] = test_df['matchType'].replace(matchTyp)

In [74]:
# null값 확인 및 
# inplace=False로 하면 기존 혹은 새로운 변수에 할당해야 하고, inplace = True 하면 해당변수에 적용됨
# na가 포함된 행을 제거하는 것은 데이터 소실이 크기 때문에 inplace=False가 default
train_df.dropna(inplace = True)
print(train_df.isnull().any().any())
test_df.dropna(inplace = True)
print(test_df.isnull().any().any())

False
False


In [None]:
# drop함수와 dropna 함수는 데이터 소실 우려가 있기 때문에 할당하거나, inplace=True 매개변수를 가져야 정정되도록 default설정
noId_train_df = train_df.drop(['Id','groupId','matchId'], axis = 1)

In [92]:
noId_test_df = test_df.drop(['Id','groupId','matchId'], axis = 1)

In [16]:
# train_df.to_csv('./data/train_no_Id_groupId_machId.csv',index=False,header=False)

In [17]:
# test_df.to_csv('./data/test_no_Id_groupId_machId.csv',index=False,header=False)

### 문제
> - 예측변수인 winPlacePerc는 0~1 사이의 값인데 반해, x변수값 중에서는 단위가 큰 값이 존재하므로, 단위의 normalization or MinMaxSaler 함수를 적용해야 함

### 데이터 전 처리 없이 multi linear regression
> - min_max_scaler 를 사용해서 변수 normal 화
> - matchtype 은 0~16 값으로 범주화

In [101]:
def MinMaxScaler(data):
    numerator = data - np.min(data, 0)
    denominator = np.max(data, 0) - np.min(data, 0)
    # noise term prevents the zero division
    return numerator / (denominator + 1e-7)

train_np = np.array(noId_train_df)
test_np = np.array(noId_test_df)

# matchtype은 12번째 값
matchType_index = list(noId_train_df.columns).index('matchType')

x_train = train_np[:,0:-1]  # id, groupId, matchid 제외
x_train[:,0:matchType_index] = MinMaxScaler(x_train)[:,0:matchType_index] # matchtype 제외시키기 위해
x_train[:,matchType_index+1:] = MinMaxScaler(x_train)[:,matchType_index+1:] # matchtype 제외시키기 위해
y_train = train_np[:, [-1]]
print('pass')

x_test = test_np[:,:-1] # id, groupId, matchid 제외
x_test[:,0:16] = MinMaxScaler(x_test)[:,0:16] # matchtype 제외시키기 위해
x_test[:,17:] = MinMaxScaler(x_test)[:,17:] # matchtype 제외시키기 위해
y_test = test_np[:,[-1]]
print('pass')

# placeholders for a tensor that will be always fed.
X = tf.placeholder(tf.float32, shape=[None, 25]) # 총 29개 칼럼에서 3개 빼면 26, y값 까지 빼면 25
Y = tf.placeholder(tf.float32, shape=[None, 1])

W = tf.Variable(tf.random_normal([25, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

# Hypothesis(model)
hypothesis = tf.matmul(X, W) + b

# Simplified cost/loss function
cost = tf.reduce_mean(tf.square(hypothesis - Y))

# Minimize
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
train = optimizer.minimize(cost)

# Launch the graph in a session.
with tf.Session() as sess:
    # Initializes global variables in the graph.
    sess.run(tf.global_variables_initializer())

    for step in tqdm_notebook(range(5001)):
        cost_val, hy_val, _ = sess.run(
            [cost, hypothesis, train], feed_dict={X: x_train, Y: y_train})
        if step % 100 == 0:
            print(step, "Cost: ", cost_val, "\nPrediction:\n", hy_val)
    Weight = sess.run(W)
    print("W\t" , Weight)
    y_predict = sess.run(hypothesis, feed_dict={X:x_test})
    print("Predict\t", y_predict)

pass
pass


HBox(children=(IntProgress(value=0, max=5001), HTML(value='')))

0 Cost:  3.5584068 
Prediction:
 [[ 2.7079744 ]
 [-0.33543387]
 [-0.56882894]
 ...
 [-0.00630715]
 [ 1.9741169 ]
 [ 1.6060387 ]]
100 Cost:  0.32071665 
Prediction:
 [[ 0.8507526 ]
 [ 0.1836473 ]
 [-0.10411993]
 ...
 [ 0.3805655 ]
 [ 0.10063615]
 [ 0.53817827]]
200 Cost:  0.249158 
Prediction:
 [[0.83845574]
 [0.3389371 ]
 [0.05544251]
 ...
 [0.48381275]
 [0.14193141]
 [0.6446527 ]]
300 Cost:  0.20526671 
Prediction:
 [[0.82227355]
 [0.37044436]
 [0.10172993]
 ...
 [0.48594207]
 [0.18852133]
 [0.7017839 ]]
400 Cost:  0.17222376 
Prediction:
 [[0.8076446 ]
 [0.38042545]
 [0.12632072]
 ...
 [0.4727304 ]
 [0.23096752]
 [0.74456847]]
500 Cost:  0.14709465 
Prediction:
 [[0.7950967 ]
 [0.38635284]
 [0.14522737]
 ...
 [0.4586394 ]
 [0.26831257]
 [0.7799418 ]]
600 Cost:  0.12795874 
Prediction:
 [[0.7844067 ]
 [0.39122003]
 [0.16146559]
 ...
 [0.44593513]
 [0.30095595]
 [0.8098706 ]]
700 Cost:  0.11336343 
Prediction:
 [[0.7752832 ]
 [0.3955505 ]
 [0.17577738]
 ...
 [0.4348113 ]
 [0.3294518 ]


In [103]:
test_id = pd.DataFrame(test_df['Id'])
test_winPlacePerc = pd.DataFrame(test_df['winPlacePerc'])
y_predict_df = pd.DataFrame(y_predict,index = test_id.index, columns=['predict'])
diff = pd.DataFrame(test_df['winPlacePerc']-y_predict_df['predict'],
                   columns=['diff'])

mae = sum(abs(diff['diff']))/diff['diff'].count() # Mean absolute error

result = pd.DataFrame({'Id':test_id['Id'],'winPlacePerc':test_winPlacePerc['winPlacePerc'], 
                       'predict':y_predict_df['predict'],'diff':diff['diff'],'MAE':mae})
result.to_csv('./result/Multi_Linear_Regression_all_inputs_with_minmaxscaler.csv',sep=',', encoding='utf-8')
print(result.head())
print('Mae\t : ', mae)

                     Id  winPlacePerc   predict      diff       MAE
534022   78ae25b8983d41        0.3118  0.286951  0.024849  0.396951
1948751  cf91ae2fbee796        0.2391  0.311331 -0.072231  0.396951
1252082  dedb9cae765103        0.3448  0.433663 -0.088863  0.396951
4136213  62d734d663136d        0.1702  0.227381 -0.057181  0.396951
4288604  be92c16ed971aa        0.7500  0.802435 -0.052435  0.396951
Mae	 :  0.3969511155109651


### 회귀 계수를 살펴본 결과
 [ 0.1077079 ]	walkDistance      <br>
 [ 0.15412308]	maxPlace          <br>
 [ 0.30721468]	teamKills         <br>
 [ 0.5035227 ]	boosts            <br>
 [ 0.51933736]	roadKills         <br>
 [ 0.5360892 ]	kills             <br>
 [ 0.93974686]	heals             <br>
 [ 1.2828442 ]	longestKill       <br>
 [ 1.358886  ]	vehicleDestroys   <br>
 [ 1.4761748 ]	winPoints         <br>
 [ 1.4915845 ]	damageDealt       <br>
 [ 1.6038594 ]	headshotKills     <br>
 [ 1.8143547 ]	swimDistance      <br>
 [-0.01062036]	matchType         <br>
 [-0.11575591]	numGroups         <br>
 [-0.2275904 ]	matchDuration     <br>
 [-0.4334233 ]	killStreaks       <br>
 [-0.52530503]	weaponsAcquired   <br>
 [-0.55356896]	rankPoints        <br>
 [-0.5543242 ]	rideDistance      <br>
 [-0.5705474 ]	killPlace         <br>
 [-0.76339287]	DBNOs             <br>
 [-1.4283952 ]	revives           <br>
 [-2.1466136 ]	killPoints        <br>
 [ 0.3377959 ]	assists           <br>


### 의미가 없어보인다. 수영오래할 수록 승률이 가장 높아진다 말이 안된다...