# 線形モデル

## import

In [125]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import re
import pandas as pd

## 教師データ

In [126]:
datas = pd.read_csv("train_all.csv")
drop_list = [
    "id",
    "tv",
    "referee",
    "time",
    "home",
    "away",
    "home_01", "home_02", "home_03", "home_04", "home_05", "home_06", "home_07", "home_08", "home_09", "home_10", "home_11",
    "away_01", "away_02", "away_03", "away_04", "away_05", "away_06", "away_07", "away_08", "away_09", "away_10", "away_11",
    "address",
]
datas = datas.drop(drop_list, axis=1)

datas = datas.sort_values(by="y")
datas = datas[datas["y"] != 0]

# # datas.columns
datas.head(5)


Unnamed: 0,y,year,stage,match,gameday,stadium,home_score,away_score,weather,temperature,humidity,home_team,away_team,capa
1313,1104,2013,Ｊ２,第２２節第１日,07/03(水),ニンジニアスタジアム,1,1,雨,26.9,72%,愛媛ＦＣ,ロアッソ熊本,15576
506,1333,2012,Ｊ２,第１９節第１日,06/13(水),正田醤油スタジアム群馬,0,0,曇,19.4,70%,ザスパ草津,ロアッソ熊本,15135
696,1363,2012,Ｊ２,第３６節第１日,09/30(日),とりぎんバードスタジアム,2,0,雨,20.0,80%,ガイナーレ鳥取,ザスパ草津,16033
406,1447,2012,Ｊ２,第１０節第１日,04/27(金),正田醤油スタジアム群馬,0,2,曇,16.9,87%,ザスパ草津,ギラヴァンツ北九州,15135
1748,1450,2014,Ｊ２,第６節第１日,04/05(土),香川県立丸亀競技場,0,1,晴のち曇,15.5,57%,カマタマーレ讃岐,栃木ＳＣ,22338


### 欠損値の確認

In [127]:
missing_values = datas.isnull().sum()
missing_values

y              0
year           0
stage          0
match          0
gameday        0
stadium        0
home_score     0
away_score     0
weather        0
temperature    0
humidity       0
home_team      0
away_team      0
capa           0
dtype: int64

## 前処理

### 日付関係

In [128]:
datas["MONTH"] = datas["gameday"].apply(lambda x : x[0:2])
datas["DAY"] = datas["gameday"].apply(lambda x : x[3:5])
datas["WEEK"] = datas["gameday"].apply(lambda x : x[x.find("(")+1:x.find("(")+2])
datas = pd.get_dummies(datas, columns=["WEEK"], drop_first=True)
datas = datas.drop("gameday", axis=1)

### 試合？関係

In [129]:
datas["stage"] = datas["stage"].apply(lambda x : x[1])
datas["match_num"] = datas["match"].apply(lambda x : int(re.findall(r'\d+', x)[0]))
datas["match_day"] = datas["match"].apply(lambda x : int(re.findall(r'\d+', x)[1]))
datas = datas.drop("match", axis=1)

### チーム関係

In [130]:
datas = pd.get_dummies(datas, columns=["home_team"], drop_first=True)
datas = pd.get_dummies(datas, columns=["away_team"], drop_first=True)

In [131]:
datas.head(1)
datas.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1952 entries, 1313 to 1059
Columns: 104 entries, y to away_team_Ｖ・ファーレン長崎
dtypes: bool(90), float64(1), int64(7), object(6)
memory usage: 400.3+ KB


## 説明変数

In [132]:
# 説明変数を設定
not_need_cols = [
	"stadium",
	"weather",
	"humidity",
	"y",
]

In [133]:
X = datas.drop(not_need_cols, axis=1)
y = datas["y"]

In [134]:
X.head()

Unnamed: 0,year,stage,home_score,away_score,temperature,capa,MONTH,DAY,WEEK_日,WEEK_月,...,away_team_横浜ＦＣ,away_team_水戸ホーリーホック,away_team_浦和レッズ,away_team_清水エスパルス,away_team_湘南ベルマーレ,away_team_鹿島アントラーズ,away_team_ＦＣ岐阜,away_team_ＦＣ東京,away_team_ＦＣ町田ゼルビア,away_team_Ｖ・ファーレン長崎
1313,2013,２,1,1,26.9,15576,7,3,False,False,...,False,False,False,False,False,False,False,False,False,False
506,2012,２,0,0,19.4,15135,6,13,False,False,...,False,False,False,False,False,False,False,False,False,False
696,2012,２,2,0,20.0,16033,9,30,True,False,...,False,False,False,False,False,False,False,False,False,False
406,2012,２,0,2,16.9,15135,4,27,False,False,...,False,False,False,False,False,False,False,False,False,False
1748,2014,２,0,1,15.5,22338,4,5,False,False,...,False,False,False,False,False,False,False,False,False,False


## テスト用データにのみ教師データがあるのでテスト用データを訓練データとテストデータに分ける

In [135]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## 学習とスコアの表示

### Ridge回帰

In [136]:
ridge = Ridge().fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))

Training set score: 0.84
Test set score: 0.86


In [137]:
ridge10 = Ridge(alpha=10).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge10.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge10.score(X_test, y_test)))

Training set score: 0.82
Test set score: 0.84


In [138]:
ridge01 = Ridge(alpha=0.1).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge01.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge01.score(X_test, y_test)))

Training set score: 0.85
Test set score: 0.87


### 通常最小二乗法

In [139]:
lr = LinearRegression().fit(X_train, y_train)
print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))

Training set score: 0.85
Test set score: 0.87
