# 準備

## import

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import re
import pandas as pd

## データの読み込み

### 削除するカラム

In [23]:
drop_list = [
    "tv",
    "referee",
    "time",
    "home",
    "away",
    "home_01", "home_02", "home_03", "home_04", "home_05", "home_06", "home_07", "home_08", "home_09", "home_10", "home_11",
    "away_01", "away_02", "away_03", "away_04", "away_05", "away_06", "away_07", "away_08", "away_09", "away_10", "away_11",
    "address",
    "humidity",
]

### データ

In [24]:
datas = pd.read_csv("./input/train_all.csv")


datas = datas.drop(drop_list, axis=1)

datas = datas.sort_values(by="y")

datas.head(5)


Unnamed: 0,id,y,year,stage,match,gameday,stadium,home_score,away_score,weather,temperature,home_team,away_team,capa
1567,15699,0,2014,Ｊ１,第４節第１日,03/23(日),埼玉スタジアム２００２,1,1,晴,16.2,浦和レッズ,清水エスパルス,63700
1313,15381,1104,2013,Ｊ２,第２２節第１日,07/03(水),ニンジニアスタジアム,1,1,雨,26.9,愛媛ＦＣ,ロアッソ熊本,15576
506,14500,1333,2012,Ｊ２,第１９節第１日,06/13(水),正田醤油スタジアム群馬,0,0,曇,19.4,ザスパ草津,ロアッソ熊本,15135
696,14690,1363,2012,Ｊ２,第３６節第１日,09/30(日),とりぎんバードスタジアム,2,0,雨,20.0,ガイナーレ鳥取,ザスパ草津,16033
406,14400,1447,2012,Ｊ２,第１０節第１日,04/27(金),正田醤油スタジアム群馬,0,2,曇,16.9,ザスパ草津,ギラヴァンツ北九州,15135


## 外れ値の除外

In [25]:
datas = datas[datas["id"] != 15127]	# yの値が大きく外れたデータ
datas = datas[datas["id"] != 15699]	# yの値が0のデータ
datas = datas[datas["id"] != 14071]	# 木の中でyの値が外れたデータ
datas = datas[datas["id"] != 14911]	# 金の中でyの値が外れたデータ

## 前処理

### 日付関係

In [26]:
datas["MONTH"] = datas["gameday"].apply(lambda x : x[0:2])
datas["DAY"] = datas["gameday"].apply(lambda x : x[3:5])
datas["WEEK"] = datas["gameday"].apply(lambda x : x[x.find("(")+1:x.find("(")+2])
datas = pd.get_dummies(datas, columns=["WEEK"], drop_first=True)
datas = datas.drop("gameday", axis=1)

### 試合？関係

In [27]:
datas["stage"] = datas["stage"].apply(lambda x : x[1])
datas["match_num"] = datas["match"].apply(lambda x : int(re.findall(r'\d+', x)[0]))
datas["match_day"] = datas["match"].apply(lambda x : int(re.findall(r'\d+', x)[1]))
datas = datas.drop("match", axis=1)

### チーム関係

In [28]:
datas = pd.get_dummies(datas, columns=["home_team"], drop_first=True)
datas = pd.get_dummies(datas, columns=["away_team"], drop_first=True)

### スタジアム関係

In [29]:
datas = pd.get_dummies(datas, columns=["stadium"], drop_first=True)


## 説明変数

In [30]:
# 説明変数を設定
not_need_cols = [
	"id",
	"weather",
	"y",
    "MONTH",
    "DAY",
]

In [31]:
X = datas.drop(not_need_cols, axis=1)
y = datas["y"]

In [32]:
X.head()

Unnamed: 0,year,stage,home_score,away_score,temperature,capa,WEEK_日,WEEK_月,WEEK_木,WEEK_水,...,stadium_長崎市総合運動公園かきどまり陸上競技場,stadium_長崎県立総合運動公園陸上競技場,stadium_香川県立丸亀競技場,stadium_駒沢オリンピック公園総合運動場陸上競技場,stadium_鳴門・大塚スポーツパーク　ポカリスエットスタジアム,stadium_鹿児島県立鴨池陸上競技場,stadium_ＩＡＩスタジアム日本平,stadium_ＮＡＣＫ５スタジアム大宮,stadium_ＮＤソフトスタジアム山形,stadium_Ｓｈｏｎａｎ　ＢＭＷスタジアム平塚
1313,2013,２,1,1,26.9,15576,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
506,2012,２,0,0,19.4,15135,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
696,2012,２,2,0,20.0,16033,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
406,2012,２,0,2,16.9,15135,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1748,2014,２,0,1,15.5,22338,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


# 訓練・テスト

## 訓練データとテストデータに分ける

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## 学習とスコアの表示

### 通常最小二乗法

### Ridge回帰

In [34]:
ridge100 = Ridge(alpha=100).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge100.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge100.score(X_test, y_test)))

Training set score: 0.75
Test set score: 0.77


In [35]:
ridge10 = Ridge(alpha=10).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge10.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge10.score(X_test, y_test)))

Training set score: 0.85
Test set score: 0.85


In [36]:
ridge1 = Ridge().fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge1.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge1.score(X_test, y_test)))

Training set score: 0.86
Test set score: 0.85


In [37]:
ridge01 = Ridge(alpha=0.1).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge01.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge01.score(X_test, y_test)))

Training set score: 0.87
Test set score: 0.85


In [38]:
ridge001 = Ridge(alpha=0.01).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge001.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge001.score(X_test, y_test)))

Training set score: 0.87
Test set score: 0.84


In [39]:
lr = LinearRegression().fit(X_train, y_train)
print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))

Training set score: 0.87
Test set score: 0.84


### Lasso

In [40]:
lasso1 = Lasso().fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso1.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso1.score(X_test, y_test)))

Training set score: 0.87
Test set score: 0.85


  model = cd_fast.enet_coordinate_descent(


In [41]:
lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso1.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso1.score(X_test, y_test)))

Training set score: 0.87
Test set score: 0.85


  model = cd_fast.enet_coordinate_descent(


In [42]:
lasso00001 = Lasso(alpha=0.0001, max_iter=100000).fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso1.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso1.score(X_test, y_test)))

Training set score: 0.87
Test set score: 0.85


  model = cd_fast.enet_coordinate_descent(
