## 1. ライブラリのインストール

In [64]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix

## 2.データセットの読み込み

### パラメータ一覧
* ID
* name　プロジェクト名
* category/main_category　対象カテゴリー
* currency/country　国と通貨
* deadline/launched　募集開始日と終了日
* goal　目標金額
* backers  支援者数
* usd_pledged_real 誓約金(集まった金額？)
* usd_goal_real 目標金額をUSDに変換したもの

df_kick=pd.read_csv("../1_data/ks-projects-201801.csv")

In [65]:
#　大まかなデータの概要の把握
display(df_kick.head())
display(df_kick.describe())
df_kick.info()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real
count,378661.0,378661.0,378661.0,378661.0,374864.0,378661.0,378661.0
mean,1074731000.0,49080.79,9682.979,105.617476,7036.729,9058.924,45454.4
std,619086200.0,1183391.0,95636.01,907.185035,78639.75,90973.34,1152950.0
min,5971.0,0.01,0.0,0.0,0.0,0.0,0.01
25%,538263500.0,2000.0,30.0,2.0,16.98,31.0,2000.0
50%,1075276000.0,5200.0,620.0,12.0,394.72,624.33,5500.0
75%,1610149000.0,16000.0,4076.0,56.0,3034.09,4050.0,15500.0
max,2147476000.0,100000000.0,20338990.0,219382.0,20338990.0,20338990.0,166361400.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
ID                  378661 non-null int64
name                378657 non-null object
category            378661 non-null object
main_category       378661 non-null object
currency            378661 non-null object
deadline            378661 non-null object
goal                378661 non-null float64
launched            378661 non-null object
pledged             378661 non-null float64
state               378661 non-null object
backers             378661 non-null int64
country             378661 non-null object
usd pledged         374864 non-null float64
usd_pledged_real    378661 non-null float64
usd_goal_real       378661 non-null float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB


In [66]:
df_kick['state'].value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

In [67]:
df_kick['main_category'].value_counts()

Film & Video    63585
Music           51918
Publishing      39874
Games           35231
Technology      32569
Design          30070
Art             28153
Food            24602
Fashion         22816
Theater         10913
Comics          10819
Photography     10779
Crafts           8809
Journalism       4755
Dance            3768
Name: main_category, dtype: int64

## 3. 解析の方向性を考える

### 扱う説明変数
* deadline/launched(募集開始日と終了日) : 募集期間として算出しなおして、使用する
* usd_real_goal(目標金額)
* backers(支援者数)

### 学習に使用する目的変数
* 一旦、'state'が'successful'か'failed'になっているもので学習する

In [122]:
# 'state'が'successful'もしくは'failed'の行を抽出する
df_kick_analysis1=df_kick[(df_kick['state']=='successful')|(df_kick['state']=='failed')]

# successfulを1に、failedを0に変更する
df_kick_analysis1.loc[df_kick_analysis1['state']=='successful','state']=1
df_kick_analysis1.loc[df_kick_analysis1['state']=='failed','state']=0

# 使わない変数の列を削除する
# ID,name,category,currency,country,pledged,usd pledged,goal
drop_col=['ID','name','category','currency','country','pledged','usd pledged','goal','usd_pledged_real']
df_kick_analysis1_drop=df_kick_analysis1.drop(drop_col,axis=1)

# 募集期間の算出
dt1=pd.to_datetime(df_kick_analysis1_drop['deadline'])
dt2=pd.to_datetime(df_kick_analysis1_drop['launched'])
dt_terms=dt1-dt2

#'terms'として新規に列を挿入する
# dt.daysで日数だけを取り出した
df_kick_analysis1_drop['terms']=dt_terms.dt.days

drop_col2=['deadline','launched']
df_kick_anyl_vf=df_kick_analysis1_drop.drop(drop_col2,axis=1)
df_kick_anyl_vf

Unnamed: 0,main_category,state,backers,usd_goal_real,terms
0,Publishing,0,0,1533.95,58
1,Film & Video,0,15,30000.00,59
2,Film & Video,0,3,45000.00,44
3,Music,0,1,5000.00,29
5,Food,1,224,50000.00,34
...,...,...,...,...,...
378654,Food,0,4,6500.00,29
378657,Film & Video,0,5,1500.00,26
378658,Film & Video,0,1,15000.00,45
378659,Technology,0,6,15000.00,30


## 4. Scikit-learnで学習モデルを作成する

In [126]:
# Fitting
y=df_kick_anyl_vf['state']
X= df_kick_anyl_vf[['backers','usd_goal_real','terms']].values

clf = SGDClassifier(loss='log', penalty='none', max_iter=10000, fit_intercept=True, random_state=1234, tol=1e-3)
clf.fit(X, y)

# 重みを取得して表示する
w0 = clf.intercept_[0]
w1 = clf.coef_[0, 0]
w2 = clf.coef_[0, 1]
w3 = clf.coef_[0, 2]

print('w0 = {:.3f}, w1 = {:.3f}, w2 = {:.3f}, w3 = {:.3f}'.format(w0, w1, w2, w3))

w0 = 68.680, w1 = 1638886.608, w2 = -7779.981, w3 = -10724.033


In [127]:
# ラベルを予測
y_est = clf.predict(X)

# 対数尤度を表示
print('対数尤度 = {:.3f}'.format(- log_loss(y, y_est)))

# 正答率を表示
print('正答率 = {:.3f}%'.format(100 * accuracy_score(y, y_est)))

対数尤度 = -3.325
正答率 = 90.372%


In [129]:
# 予測値と正解のクロス集計
conf_mat = pd.DataFrame(confusion_matrix(y, y_est), 
                        index=['正解 = 失敗', '正解 = 成功'], 
                        columns=['予測 = 失敗', '予測 = 成功'])
conf_mat

Unnamed: 0,予測 = 失敗,予測 = 成功
正解 = 失敗,171817,25902
正解 = 成功,6031,127925
