## 通し課題：あるクラウドファンデイングが成功するか(state)を事前に予測するモデルを構築する<BR>（データソース：kaggle / ks-projects-201801）

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.linear_model import SGDClassifier # ロジスティック回帰
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix ,precision_recall_fscore_support # モデルの評価、混同行列
from sklearn.model_selection import train_test_split # ホールドアウト法
from sklearn.model_selection import KFold # 交差検証法
from sklearn.preprocessing import StandardScaler, MinMaxScaler # 標準化
import nltk # 形態素解析
from nltk.corpus import stopwords
from sklearn.svm import SVC # SVM
from sklearn.model_selection import GridSearchCV # グリッドサーチ

In [2]:
# 全列を表示させる
# print(pd.get_option("display.max_columns"))
pd.set_option('display.max_columns', 100)
# print(pd.get_option("display.max_rows"))
pd.set_option('display.max_rows', 200)

### データの読み込み

In [3]:
df = pd.read_csv("../ks-projects-201801.csv")

display(df.shape)
# display(df.dtypes)
display(df.info()) # "name"と"usd pledged"に欠損値あり
display(df.head())

# 列の意味
# goal : Goal amount in project currency
# pledged : Pledged amount in the project currency
# usd_pledged: conversion in US dollars of the pledged column (conversion done by kickstarter) 米ドル換算 "not reliable enough"
# usd pledge real: conversion in US dollars of the pledged column (conversion from Fixer.io API) 為替レート変換API使用
# backers : "number of people who have pledged money to the project"

(378661, 15)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
ID                  378661 non-null int64
name                378657 non-null object
category            378661 non-null object
main_category       378661 non-null object
currency            378661 non-null object
deadline            378661 non-null object
goal                378661 non-null float64
launched            378661 non-null object
pledged             378661 non-null float64
state               378661 non-null object
backers             378661 non-null int64
country             378661 non-null object
usd pledged         374864 non-null float64
usd_pledged_real    378661 non-null float64
usd_goal_real       378661 non-null float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB


None

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [4]:
# ローンチからデッドライン迄の期間("days")を算出
df["launched"] = pd.to_datetime(df["launched"])
df["launched"] = df["launched"].dt.date # df[列名]で列名を指定すると、その列はpandas.Seriesとして取得される
df["deadline"] = pd.to_datetime(df["deadline"])
df["deadline"] = df["deadline"].dt.date
df["days"] = (df["deadline"] - df["launched"]).dt.days
# df["days"] = (df["deadline"] - df["launched"]) / datetime.timedelta(days=1)

# "state"が"canceled","undefined","live","suspended"のデータを除外 
#条件にマッチしたIndexを取得
drop_index = df.index[(df["state"] == "canceled") | (df["state"] == "undefined") | (df["state"] == "live") | (df["state"] == "suspended") | (df["name"].isnull())]
# display(drop_index)
#条件にマッチしたIndexを削除
df = df.drop(drop_index)
# display(df["state"].value_counts())
# print(df["state"] == "successful")

df.loc[df["state"]=="successful","state"] = 1
df.loc[df["state"]=="failed","state"] = 0

## 余力があれば外れ値を除く（"country"の"N,0"","launched"の1970年代など）

### nameに基づく説明変数1

In [5]:
# "successful"となったデータを抽出
df_success = df[df["state"] == 1]
df_success.head()

# "name"を纏める
word = ' '.join(df_success["name"])
# print(word[0:500])

# tokenize
tokens = nltk.word_tokenize(word)
# print(tokens_l[0:500])
tokens_l = [w.lower() for w in tokens]
text = nltk.Text(tokens_l)

# stopwords設定
stop_words = set(stopwords.words('english'))
filtered_sentence = [w for w in text if not w in stop_words]
filtered_sentence = []
for w in text:
    if w not in stop_words:
        filtered_sentence.append(w)

pos = nltk.pos_tag(filtered_sentence)

# 名詞だけ抽出
only_nn = [x for (x,y) in pos if y in ('NN')]
freq = nltk.FreqDist(only_nn)

In [6]:
top = freq.most_common(100)
labels, values = zip(*top)
print(labels)

# nameがtop100を含んでいたら"name_top"を1とする
pattern = '|'.join(labels)
# print(pattern)
df["name_top"] = df["name"].str.contains(pattern)
df["name_top"] = df["name_top"] * 1
# df.head(50)

('album', 'film', 'project', 'book', 'game', 'music', 'debut', 'art', 'world', 'help', 'record', 'cd', 'series', 'life', 'video', 'ep', 'story', 'studio', 'edition', 'dance', 'tour', 'show', 'city', 'man', 'card', 'collection', 'time', 'festival', 'feature', 'release', 'season', 'magazine', 'adventure', 'space', 'home', 'day', 'love', 'issue', 'food', 'volume', 'playing', 'movie', 'length', 'board', 'company', 'novel', 'fund', 'comedy', 'rpg', 'horror', 'play', 'vinyl', 'summer', 'wallet', 'community', 'design', 'production', 'night', 'coffee', 'part', 'journey', 'watch', 'way', 'band', 'road', 'house', 'dream', 'rock', 'exhibition', 'school', 'dice', 'year', 'heart', 'enamel', 'theatre', 'print', 'photography', 'let', 'documentary', 'calendar', 'war', 'pin', 'fantasy', 'support', 'system', 'box', 'history', 'party', 'family', 'fringe', 'solo', 'anthology', 'artist', 'fashion', 'campaign', 'york', 'dog', 'farm', 'power', 'performance')


### nameに基づく説明変数2

In [7]:
df["name_len"]=df["name"].str.len()

### カテゴリカルデータをダミー変数化

In [8]:
df_dummy = pd.get_dummies(df[["category","currency"]]) # SVMが重いため"main_category","country"を除いた
df2 = pd.concat([df,df_dummy],axis=1)
df2 = df2.drop(["ID","name","backers","main_category","category","country","currency","deadline","launched","usd_pledged_real","pledged","usd pledged","goal"],axis=1)
df2.info()
display(df2.head())
display(df2.describe())
# display(df2.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 331672 entries, 0 to 378660
Columns: 178 entries, state to currency_USD
dtypes: float64(1), int32(1), int64(3), uint8(173)
memory usage: 68.6 MB


Unnamed: 0,state,usd_goal_real,days,name_top,name_len,category_3D Printing,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,category_Anthologies,category_Apparel,category_Apps,category_Architecture,category_Art,category_Art Books,category_Audio,category_Bacon,category_Blues,category_Calendars,category_Camera Equipment,category_Candles,category_Ceramics,category_Children's Books,category_Childrenswear,category_Chiptune,category_Civic Design,category_Classical Music,category_Comedy,category_Comic Books,category_Comics,category_Community Gardens,category_Conceptual Art,category_Cookbooks,category_Country & Folk,category_Couture,category_Crafts,category_Crochet,category_DIY,category_DIY Electronics,category_Dance,category_Design,category_Digital Art,category_Documentary,category_Drama,category_Drinks,category_Electronic Music,category_Embroidery,category_Events,...,category_Restaurants,category_Robots,category_Rock,category_Romance,category_Science Fiction,category_Sculpture,category_Shorts,category_Small Batch,category_Software,category_Sound,category_Space Exploration,category_Spaces,category_Stationery,category_Tabletop Games,category_Taxidermy,category_Technology,category_Television,category_Textiles,category_Theater,category_Thrillers,category_Translations,category_Typography,category_Vegan,category_Video,category_Video Art,category_Video Games,category_Wearables,category_Weaving,category_Web,category_Webcomics,category_Webseries,category_Woodworking,category_Workshops,category_World Music,category_Young Adult,category_Zines,currency_AUD,currency_CAD,currency_CHF,currency_DKK,currency_EUR,currency_GBP,currency_HKD,currency_JPY,currency_MXN,currency_NOK,currency_NZD,currency_SEK,currency_SGD,currency_USD
0,0,1533.95,59,0,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,30000.0,60,1,45,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,45000.0,45,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,5000.0,30,1,49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5,1,50000.0,35,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


Unnamed: 0,state,usd_goal_real,days,name_top,name_len,category_3D Printing,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,category_Anthologies,category_Apparel,category_Apps,category_Architecture,category_Art,category_Art Books,category_Audio,category_Bacon,category_Blues,category_Calendars,category_Camera Equipment,category_Candles,category_Ceramics,category_Children's Books,category_Childrenswear,category_Chiptune,category_Civic Design,category_Classical Music,category_Comedy,category_Comic Books,category_Comics,category_Community Gardens,category_Conceptual Art,category_Cookbooks,category_Country & Folk,category_Couture,category_Crafts,category_Crochet,category_DIY,category_DIY Electronics,category_Dance,category_Design,category_Digital Art,category_Documentary,category_Drama,category_Drinks,category_Electronic Music,category_Embroidery,category_Events,...,category_Restaurants,category_Robots,category_Rock,category_Romance,category_Science Fiction,category_Sculpture,category_Shorts,category_Small Batch,category_Software,category_Sound,category_Space Exploration,category_Spaces,category_Stationery,category_Tabletop Games,category_Taxidermy,category_Technology,category_Television,category_Textiles,category_Theater,category_Thrillers,category_Translations,category_Typography,category_Vegan,category_Video,category_Video Art,category_Video Games,category_Wearables,category_Weaving,category_Web,category_Webcomics,category_Webseries,category_Woodworking,category_Workshops,category_World Music,category_Young Adult,category_Zines,currency_AUD,currency_CAD,currency_CHF,currency_DKK,currency_EUR,currency_GBP,currency_HKD,currency_JPY,currency_MXN,currency_NOK,currency_NZD,currency_SEK,currency_SGD,currency_USD
count,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,...,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0,331672.0
mean,0.403881,41509.66,33.954862,0.225931,34.220335,0.001713,0.002343,0.008261,0.001872,0.00069,0.006672,0.002222,0.018591,0.01594,0.001936,0.02257,0.007444,0.001094,0.000546,0.000706,0.000874,0.001055,0.00114,0.000829,0.018687,0.00126,9.9e-05,0.000802,0.007357,0.006235,0.007516,0.013812,0.000817,0.002804,0.001372,0.012702,0.000684,0.012518,0.000455,0.003066,0.002403,0.006687,0.010694,0.003531,0.043787,0.005768,0.006359,0.005738,0.000302,0.002132,...,0.007595,0.001495,0.018832,0.000491,0.001972,0.004996,0.034353,0.004887,0.007758,0.001661,0.000853,0.002276,0.000573,0.035408,3e-05,0.017264,0.002614,0.000754,0.020152,0.001927,0.000413,0.000292,0.001592,0.001161,0.000525,0.028121,0.002916,0.000247,0.012992,0.001806,0.015521,0.00319,0.000446,0.005849,0.002141,0.00107,0.019962,0.037311,0.001966,0.002801,0.04335,0.088868,0.001438,6.9e-05,0.004254,0.001761,0.003841,0.004553,0.001369,0.788457
std,0.490675,1108935.0,12.713282,0.418195,15.961502,0.041347,0.048345,0.090515,0.04323,0.026267,0.081411,0.047087,0.135074,0.125245,0.043953,0.14853,0.085958,0.033064,0.023354,0.026552,0.029557,0.032468,0.03374,0.028783,0.135418,0.035478,0.009974,0.028308,0.085455,0.078716,0.086371,0.11671,0.028573,0.052878,0.037013,0.111987,0.026152,0.111183,0.021332,0.055289,0.048961,0.081502,0.102859,0.059314,0.204622,0.075726,0.079488,0.075529,0.017361,0.04612,...,0.086817,0.038642,0.135931,0.022163,0.044362,0.070505,0.182135,0.069739,0.087735,0.040725,0.029198,0.047657,0.023928,0.18481,0.005491,0.130254,0.051061,0.027444,0.140522,0.043851,0.02032,0.017099,0.039867,0.034051,0.022898,0.165319,0.053917,0.015722,0.113239,0.042459,0.123614,0.056389,0.021119,0.076256,0.046218,0.032699,0.139872,0.189523,0.044294,0.05285,0.203644,0.284553,0.037896,0.008327,0.065085,0.041925,0.061858,0.06732,0.036972,0.408403
min,0.0,0.01,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2000.0,30.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,5000.0,30.0,0.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1.0,15000.0,36.0,0.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,166361400.0,92.0,1.0,85.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### データの標準化

In [9]:
# "country"を説明変数から除外しているので無相関化、白色化は実施せず
stdsc = StandardScaler()
df2["usd_goal_real"] = stdsc.fit_transform(df2[["usd_goal_real"]].values)
df2["days"] = stdsc.fit_transform(df2[["days"]].values)
df2["name_len"] = stdsc.fit_transform(df2[["name_len"]].values)
display(df2.head())



Unnamed: 0,state,usd_goal_real,days,name_top,name_len,category_3D Printing,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,category_Anthologies,category_Apparel,category_Apps,category_Architecture,category_Art,category_Art Books,category_Audio,category_Bacon,category_Blues,category_Calendars,category_Camera Equipment,category_Candles,category_Ceramics,category_Children's Books,category_Childrenswear,category_Chiptune,category_Civic Design,category_Classical Music,category_Comedy,category_Comic Books,category_Comics,category_Community Gardens,category_Conceptual Art,category_Cookbooks,category_Country & Folk,category_Couture,category_Crafts,category_Crochet,category_DIY,category_DIY Electronics,category_Dance,category_Design,category_Digital Art,category_Documentary,category_Drama,category_Drinks,category_Electronic Music,category_Embroidery,category_Events,...,category_Restaurants,category_Robots,category_Rock,category_Romance,category_Science Fiction,category_Sculpture,category_Shorts,category_Small Batch,category_Software,category_Sound,category_Space Exploration,category_Spaces,category_Stationery,category_Tabletop Games,category_Taxidermy,category_Technology,category_Television,category_Textiles,category_Theater,category_Thrillers,category_Translations,category_Typography,category_Vegan,category_Video,category_Video Art,category_Video Games,category_Wearables,category_Weaving,category_Web,category_Webcomics,category_Webseries,category_Woodworking,category_Workshops,category_World Music,category_Young Adult,category_Zines,currency_AUD,currency_CAD,currency_CHF,currency_DKK,currency_EUR,currency_GBP,currency_HKD,currency_JPY,currency_MXN,currency_NOK,currency_NZD,currency_SEK,currency_SGD,currency_USD
0,0,-0.036049,1.970001,0,-0.201757,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,-0.010379,2.048659,1,0.675355,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0.003147,0.868789,0,-1.266821,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,-0.032923,-0.311082,1,0.925958,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5,1,0.007656,0.082208,0,-0.890916,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


### （復習）DAY1：ロジスティック回帰（penalty無し）

In [10]:
y = df2["state"].values
X = df2.drop(["state"],axis=1).values # 前回は"state"を説明変数から除外し忘れていた
clf = SGDClassifier(loss='log', penalty='none', max_iter=10000, fit_intercept=True, random_state=1234, tol=1e-3)
clf.fit(X, y)
display(df2["state"].value_counts())

0    197716
1    133956
Name: state, dtype: int64

In [11]:
display(y.shape)
display(X.shape)

(331672,)

(331672, 177)

In [12]:
# ラベルを予測
y_est = clf.predict(X)

# 対数尤度を表示
print('対数尤度 = {:.3f}'.format(- log_loss(y, y_est)))


#予測値と正解のクロス集計
conf_mat = pd.DataFrame(confusion_matrix(y, y_est), 
                        index=['正解 = 成功', '正解 = 失敗'], 
                        columns=['予測 = 成功', '予測 = 失敗'])
display(conf_mat)

# 正答率を計算
accuracy =  accuracy_score(y, y_est)
print('正答率（Accuracy） = {:.3f}%'.format(100 * accuracy))

# Precision, Recall, F1-scoreを計算
precision, recall, f1_score, _ = precision_recall_fscore_support(y, y_est)
print('適合率（Precision） = {:.3f}%'.format(100 * precision[1]))
print('再現率（Recall） = {:.3f}%'.format(100 * recall[1]))
print('F1値（F1-score） = {:.3f}%'.format(100 * f1_score[1]))

対数尤度 = -11.355


Unnamed: 0,予測 = 成功,予測 = 失敗
正解 = 成功,153325,44391
正解 = 失敗,64646,69310


正答率（Accuracy） = 67.125%
適合率（Precision） = 60.958%
再現率（Recall） = 51.741%
F1値（F1-score） = 55.973%


### 汎化性能を確認（ホールドアウト法）

In [13]:
### ホールドアウト法
test_size = 0.7 # SVM対応で70%に減らした
Xh_train, Xh_test, yh_train, yh_test = train_test_split(X, y, test_size=test_size, random_state=1234)
clf = SGDClassifier(loss='log', penalty='none', max_iter=10000, fit_intercept=True, random_state=1234, tol=1e-3)
clf.fit(Xh_train, yh_train)

## 学習用データに対する予測を実行
yh_pred_train = clf.predict(Xh_train)

## 訓練誤差の評価
print("訓練誤差")
# 対数尤度を表示
print('対数尤度 = {:.3f}'.format(- log_loss(yh_train, yh_pred_train)))

#予測値と正解のクロス集計
conf_mat = pd.DataFrame(confusion_matrix(yh_train, yh_pred_train), 
                        index=['正解 = 成功', '正解 = 失敗'], 
                        columns=['予測 = 成功', '予測 = 失敗'])
display(conf_mat)

# 正答率を計算
accuracy =  accuracy_score(yh_train, yh_pred_train)
print('正答率（Accuracy） = {:.3f}%'.format(100 * accuracy))

# Precision, Recall, F1-scoreを計算
precision, recall, f1_score, _ = precision_recall_fscore_support(yh_train, yh_pred_train)
print('適合率（Precision） = {:.3f}%'.format(100 * precision[1]))
print('再現率（Recall） = {:.3f}%'.format(100 * recall[1]))
print('F1値（F1-score） = {:.3f}%'.format(100 * f1_score[1]))


## テストデータに対する予測を実行
yh_pred_test = clf.predict(Xh_test)

# 汎化誤差の評価
print("汎化誤差")
# 対数尤度を表示
print('対数尤度 = {:.3f}'.format(- log_loss(yh_test, yh_pred_test)))

#予測値と正解のクロス集計
conf_mat = pd.DataFrame(confusion_matrix(yh_test, yh_pred_test), 
                        index=['正解 = 成功', '正解 = 失敗'], 
                        columns=['予測 = 成功', '予測 = 失敗'])
display(conf_mat)

# 正答率を計算
accuracy =  accuracy_score(yh_test, yh_pred_test)
print('正答率（Accuracy） = {:.3f}%'.format(100 * accuracy))

# Precision, Recall, F1-scoreを計算
precision, recall, f1_score, _ = precision_recall_fscore_support(yh_test, yh_pred_test)
print('適合率（Precision） = {:.3f}%'.format(100 * precision[1]))
print('再現率（Recall） = {:.3f}%'.format(100 * recall[1]))
print('F1値（F1-score） = {:.3f}%'.format(100 * f1_score[1]))

訓練誤差
対数尤度 = -11.399


Unnamed: 0,予測 = 成功,予測 = 失敗
正解 = 成功,44466,14887
正解 = 失敗,17951,22197


正答率（Accuracy） = 66.997%
適合率（Precision） = 59.856%
再現率（Recall） = 55.288%
F1値（F1-score） = 57.481%
汎化誤差
対数尤度 = -11.501


Unnamed: 0,予測 = 成功,予測 = 失敗
正解 = 成功,103057,35306
正解 = 失敗,42006,51802


正答率（Accuracy） = 66.700%
適合率（Precision） = 59.469%
再現率（Recall） = 55.221%
F1値（F1-score） = 57.266%


### 汎化性能を確認（CV法）

In [14]:
## CV法
n_split = 5 # グループ数を設定（今回は5分割）
cross_valid_accuracy = 0
cross_valid_precision = 0
cross_valid_recall = 0
cross_valid_f1_score = 0
split_num = 1

# KFold(n_splits=5, *, shuffle=False, random_state=None)
# n_splits : Number of folds. Must be at least 2.
# shuffle : Whether to shuffle the data before splitting into batches. bool, default=False
# method  split(self, X, y=None, groups=None) ; Generate indices to split data into training and test set.
#  X:Training data
#  y:The target variable for supervised learning problems

# for 変数 in オブジェクト:
#     実行する処理

# テスト役を交代させながら学習と評価を繰り返す
for train_idx, test_idx in KFold(n_splits=n_split, shuffle=True, random_state=1234).split(X, y): # 返ってくるのはインデックス
    X_train, y_train = X[train_idx], y[train_idx] #学習用データ
    X_test, y_test = X[test_idx], y[test_idx]     #テスト用データ
    
    print(test_idx)
    
    # 学習用データを使ってロジスティック回帰モデルを学習
    clf = SGDClassifier(loss='log', penalty='none', max_iter=10000, fit_intercept=True, random_state=1234, tol=1e-3)
    clf.fit(X_train, y_train)

    # テストデータに対する予測を実行
    y_pred_test = clf.predict(X_test)

    print("Fold %s"%split_num) # %sは文字列として値を該当箇所に挿入するための記号。直後の %(変数名) は%s の場所に挿入する変数を表す

    accuracy =  accuracy_score(y_test, y_pred_test)
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred_test)
    
    print('正答率（Accuracy） = {:.3f}%'.format(100 * accuracy))
    print('適合率（Precision） = {:.3f}%'.format(100 * precision[1]))
    print('再現率（Recall） = {:.3f}%'.format(100 * recall[1]))
    print('F1値（F1-score） = {:.3f}%'.format(100 * f1_score[1]))
     
    cross_valid_accuracy += accuracy #後で平均を取るために加算
    cross_valid_precision += precision
    cross_valid_recall += recall
    cross_valid_f1_score += f1_score
    split_num += 1

# 平均値を最終的な汎化誤差値とする
final_accuracy = cross_valid_accuracy / n_split
final_precision = cross_valid_precision / n_split
final_recall = cross_valid_recall / n_split
final_f1_score = cross_valid_f1_score / n_split

print('平均 正答率（Accuracy） = {:.3f}%'.format(100 * final_accuracy))
print('平均 適合率（Precision） = {:.3f}%'.format(100 * final_precision[1]))
print('平均 再現率（Recall） = {:.3f}%'.format(100 * final_recall[1]))
print('平均 F1値（F1-score） = {:.3f}%'.format(100 * final_f1_score[1]))

[     9     19     22 ... 331654 331661 331670]
Fold 1
正答率（Accuracy） = 66.922%
適合率（Precision） = 60.170%
再現率（Recall） = 53.494%
F1値（F1-score） = 56.636%
[     0     12     13 ... 331662 331668 331671]
Fold 2
正答率（Accuracy） = 66.891%
適合率（Precision） = 60.987%
再現率（Recall） = 50.365%
F1値（F1-score） = 55.169%
[     4      7     10 ... 331621 331644 331669]
Fold 3
正答率（Accuracy） = 67.323%
適合率（Precision） = 62.019%
再現率（Recall） = 49.235%
F1値（F1-score） = 54.892%
[     2      3     17 ... 331664 331666 331667]
Fold 4
正答率（Accuracy） = 67.109%
適合率（Precision） = 61.079%
再現率（Recall） = 51.595%
F1値（F1-score） = 55.937%
[     1      5      6 ... 331659 331660 331665]
Fold 5
正答率（Accuracy） = 67.202%
適合率（Precision） = 63.366%
再現率（Recall） = 43.949%
F1値（F1-score） = 51.901%
平均 正答率（Accuracy） = 67.089%
平均 適合率（Precision） = 61.524%
平均 再現率（Recall） = 49.728%
平均 F1値（F1-score） = 54.907%


In [15]:
## ホールドアウト法 →　CV法

### 過学習への対処（L1正則化）

In [16]:
# L1(Lasso)で計算 →　Lassoの方がRidgeよりも若干良かった
## CV法
n_split = 5 # グループ数を設定（今回は5分割）
cross_valid_accuracy = 0
cross_valid_precision = 0
cross_valid_recall = 0
cross_valid_f1_score = 0
split_num = 1

# KFold(n_splits=5, *, shuffle=False, random_state=None)
# n_splits : Number of folds. Must be at least 2.
# shuffle : Whether to shuffle the data before splitting into batches. bool, default=False
# method  split(self, X, y=None, groups=None) ; Generate indices to split data into training and test set.
#  X:Training data
#  y:The target variable for supervised learning problems

# for 変数 in オブジェクト:
#     実行する処理

# テスト役を交代させながら学習と評価を繰り返す
for train_idx, test_idx in KFold(n_splits=n_split, shuffle=True, random_state=1234).split(X, y): # 返ってくるのはインデックス
    X_train, y_train = X[train_idx], y[train_idx] #学習用データ
    X_test, y_test = X[test_idx], y[test_idx]     #テスト用データ
    
    print(test_idx)
    
    # 学習用データを使ってロジスティック回帰モデルを学習
    clf = SGDClassifier(loss='log', penalty='l1', max_iter=10000, fit_intercept=True, random_state=1234, tol=1e-3)
    clf.fit(X_train, y_train)

    # テストデータに対する予測を実行
    y_pred_test = clf.predict(X_test)

    print("Fold %s"%split_num) # %sは文字列として値を該当箇所に挿入するための記号。直後の %(変数名) は%s の場所に挿入する変数を表す

    accuracy =  accuracy_score(y_test, y_pred_test)
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred_test)
    
    print('正答率（Accuracy） = {:.3f}%'.format(100 * accuracy))
    print('適合率（Precision） = {:.3f}%'.format(100 * precision[1]))
    print('再現率（Recall） = {:.3f}%'.format(100 * recall[1]))
    print('F1値（F1-score） = {:.3f}%'.format(100 * f1_score[1]))
     
    cross_valid_accuracy += accuracy #後で平均を取るために加算
    cross_valid_precision += precision
    cross_valid_recall += recall
    cross_valid_f1_score += f1_score
    split_num += 1

# 平均値を最終的な汎化誤差値とする
final_accuracy = cross_valid_accuracy / n_split
final_precision = cross_valid_precision / n_split
final_recall = cross_valid_recall / n_split
final_f1_score = cross_valid_f1_score / n_split

print('平均 正答率（Accuracy） = {:.3f}%'.format(100 * final_accuracy))
print('平均 適合率（Precision） = {:.3f}%'.format(100 * final_precision[1]))
print('平均 再現率（Recall） = {:.3f}%'.format(100 * final_recall[1]))
print('平均 F1値（F1-score） = {:.3f}%'.format(100 * final_f1_score[1]))

[     9     19     22 ... 331654 331661 331670]
Fold 1
正答率（Accuracy） = 66.760%
適合率（Precision） = 60.523%
再現率（Recall） = 50.847%
F1値（F1-score） = 55.265%
[     0     12     13 ... 331662 331668 331671]
Fold 2
正答率（Accuracy） = 66.761%
適合率（Precision） = 61.648%
再現率（Recall） = 47.171%
F1値（F1-score） = 53.447%
[     4      7     10 ... 331621 331644 331669]
Fold 3
正答率（Accuracy） = 67.021%
適合率（Precision） = 62.573%
再現率（Recall） = 45.629%
F1値（F1-score） = 52.774%
[     2      3     17 ... 331664 331666 331667]
Fold 4
正答率（Accuracy） = 66.851%
適合率（Precision） = 61.444%
再現率（Recall） = 48.536%
F1値（F1-score） = 54.232%
[     1      5      6 ... 331659 331660 331665]
Fold 5
正答率（Accuracy） = 66.838%
適合率（Precision） = 63.855%
再現率（Recall） = 40.640%
F1値（F1-score） = 49.668%
平均 正答率（Accuracy） = 66.846%
平均 適合率（Precision） = 62.008%
平均 再現率（Recall） = 46.565%
平均 F1値（F1-score） = 53.077%


### 過学習への対処（L2正則化）

In [17]:
# L2(Ridge)で計算
## CV法
n_split = 5 # グループ数を設定（今回は5分割）
cross_valid_accuracy = 0
cross_valid_precision = 0
cross_valid_recall = 0
cross_valid_f1_score = 0
split_num = 1

# KFold(n_splits=5, *, shuffle=False, random_state=None)
# n_splits : Number of folds. Must be at least 2.
# shuffle : Whether to shuffle the data before splitting into batches. bool, default=False
# method  split(self, X, y=None, groups=None) ; Generate indices to split data into training and test set.
#  X:Training data
#  y:The target variable for supervised learning problems

# for 変数 in オブジェクト:
#     実行する処理

# テスト役を交代させながら学習と評価を繰り返す
for train_idx, test_idx in KFold(n_splits=n_split, shuffle=True, random_state=1234).split(X, y): # 返ってくるのはインデックス
    X_train, y_train = X[train_idx], y[train_idx] #学習用データ
    X_test, y_test = X[test_idx], y[test_idx]     #テスト用データ
    
    print(test_idx)
    
    # 学習用データを使ってロジスティック回帰モデルを学習
    clf = SGDClassifier(loss='log', penalty='l2', max_iter=10000, fit_intercept=True, random_state=1234, tol=1e-3)
    clf.fit(X_train, y_train)

    # テストデータに対する予測を実行
    y_pred_test = clf.predict(X_test)

    print("Fold %s"%split_num) # %sは文字列として値を該当箇所に挿入するための記号。直後の %(変数名) は%s の場所に挿入する変数を表す

    accuracy =  accuracy_score(y_test, y_pred_test)
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred_test)
    
    print('正答率（Accuracy） = {:.3f}%'.format(100 * accuracy))
    print('適合率（Precision） = {:.3f}%'.format(100 * precision[1]))
    print('再現率（Recall） = {:.3f}%'.format(100 * recall[1]))
    print('F1値（F1-score） = {:.3f}%'.format(100 * f1_score[1]))
     
    cross_valid_accuracy += accuracy #後で平均を取るために加算
    cross_valid_precision += precision
    cross_valid_recall += recall
    cross_valid_f1_score += f1_score
    split_num += 1

# 平均値を最終的な汎化誤差値とする
final_accuracy = cross_valid_accuracy / n_split
final_precision = cross_valid_precision / n_split
final_recall = cross_valid_recall / n_split
final_f1_score = cross_valid_f1_score / n_split

print('平均 正答率（Accuracy） = {:.3f}%'.format(100 * final_accuracy))
print('平均 適合率（Precision） = {:.3f}%'.format(100 * final_precision[1]))
print('平均 再現率（Recall） = {:.3f}%'.format(100 * final_recall[1]))
print('平均 F1値（F1-score） = {:.3f}%'.format(100 * final_f1_score[1]))

[     9     19     22 ... 331654 331661 331670]
Fold 1
正答率（Accuracy） = 66.552%
適合率（Precision） = 60.383%
再現率（Recall） = 49.914%
F1値（F1-score） = 54.652%
[     0     12     13 ... 331662 331668 331671]
Fold 2
正答率（Accuracy） = 66.585%
適合率（Precision） = 61.368%
再現率（Recall） = 46.936%
F1値（F1-score） = 53.191%
[     4      7     10 ... 331621 331644 331669]
Fold 3
正答率（Accuracy） = 66.844%
適合率（Precision） = 62.329%
再現率（Recall） = 45.237%
F1値（F1-score） = 52.425%
[     2      3     17 ... 331664 331666 331667]
Fold 4
正答率（Accuracy） = 66.675%
適合率（Precision） = 61.336%
再現率（Recall） = 47.735%
F1値（F1-score） = 53.687%
[     1      5      6 ... 331659 331660 331665]
Fold 5
正答率（Accuracy） = 66.774%
適合率（Precision） = 63.946%
再現率（Recall） = 40.070%
F1値（F1-score） = 49.268%
平均 正答率（Accuracy） = 66.686%
平均 適合率（Precision） = 61.872%
平均 再現率（Recall） = 45.979%
平均 F1値（F1-score） = 52.645%


### DAY2：SVM

In [None]:
# SVMの学習実行
C = 5
kernel = "rbf"
gamma = 1
clf_SVM = SVC(C=C, kernel=kernel, gamma=gamma)
# SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, 
# class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)
# gamma は、例えばガウスカーネルのスケールパラメータ（正規分布の標準偏差に対応するもの）を指定する
clf_SVM.fit(Xh_train, yh_train)

# 未知のデータを識別する
yh_pred_test = clf_SVM.predict(Xh_test)

### ハイパーパラメータの探索

In [None]:
parameters = {'kernel':['linear', 'rbf'], 'C':[1, 5]} # ここを編集する
model = SVC(gamma="scale") # gamma=scaleにすると、gammaが自動で計算される
clf_SVM2 = GridSearchCV(model, parameters, cv=3,)
clf_SVM2.fit(Xh_train, yh_train)
print(clf_SVM2.best_params_, clf_SVM2.best_score_)