In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input directory,
# in os.walk traversal order, and print them one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the two input tables in one pass: the customer table first, then the usage log.
customer, uselog = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/100honch4/customer_join.csv",
        "../input/100honch4/use_log.csv",
    )
)
# Preview the first rows of the customer table.
customer.head()

In [None]:
# Preview the first five rows of the usage log.
uselog.head(n=5)

In [None]:
# Count missing values per column of the usage log (isna is the same check as isnull).
uselog.isna().sum()

In [None]:
# Count missing values per column of the customer table (isna is the same check as isnull).
customer.isna().sum()

In [None]:
# Select the columns used as clustering features.
# .copy() makes this an independent frame: a later cell adds a "cluster" column to it,
# and doing that on a plain column subset of `customer` raises SettingWithCopyWarning.
customer_clustering = customer[["mean", "median", "max", "min", "membership_period"]].copy()
customer_clustering.head()

In [None]:
# Libraries needed for K-means clustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# StandardScaler standardizes each feature to mean 0 and variance 1.
# fit_transform computes the scaling parameters and applies the transformation in one call.
sc = StandardScaler()
customer_clustering_sc = sc.fit_transform(customer_clustering)

# Build the K-means model with 4 clusters.
# n_init is pinned explicitly because its default differs between scikit-learn versions
# (10 in older releases, "auto" in newer ones); random_state fixes the initialization.
kmeans = KMeans(n_clusters=4, random_state=0, n_init=10)

# Fit the model on the standardized data (fit returns the fitted estimator itself)
clusters = kmeans.fit(customer_clustering_sc)

# Attach each row's cluster label.
# assign() builds a new DataFrame instead of writing into the existing object, so no
# SettingWithCopyWarning is raised even if customer_clustering is a slice of `customer`.
customer_clustering = customer_clustering.assign(cluster=clusters.labels_)

print(customer_clustering["cluster"].unique())
customer_clustering.head()

In [None]:
# Replace the column labels with Japanese display names (positional: five features + cluster).
jp_column_names = ["月内平均値", "月内中央値", "月内最大値", "月内最小値", "会員期間", "cluster"]
customer_clustering.columns = jp_column_names

# count() tallies the number of non-null entries per column within each cluster
customer_clustering.groupby(by="cluster").count()

In [None]:
# Per-cluster mean of every column.
customer_clustering.groupby(by="cluster").mean()

In [None]:
# Library for principal component analysis
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the standardized features, then project the same data with it.
X = customer_clustering_sc
pca = PCA(n_components=2)
x_pca = pca.fit(X).transform(X)

# Wrap the projection in a DataFrame (columns 0 and 1) and attach the cluster labels,
# which pandas aligns on the index.
pca_df = pd.DataFrame(x_pca)
pca_df["cluster"] = customer_clustering["cluster"]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

for i in customer_clustering["cluster"].unique():
    tmp = pca_df.loc[pca_df["cluster"]==i]
    plt.scatter(tmp[0], tmp[1])

In [None]:
# Place the clustering frame and the full customer table side by side
# (column-wise concatenation, rows matched on the index).
customer_clustering = pd.concat(objs=[customer_clustering, customer], axis="columns")
customer_clustering.head()

In [None]:
# Row counts for every (cluster, is_deleted) combination; customer_id carries the count.
(
    customer_clustering
    .groupby(["cluster", "is_deleted"], as_index=False)
    .count()
    .loc[:, ["cluster", "is_deleted", "customer_id"]]
)

In [None]:
# Row counts for every (cluster, routine_flg) combination; customer_id carries the count.
(
    customer_clustering
    .groupby(["cluster", "routine_flg"], as_index=False)
    .count()
    .loc[:, ["cluster", "routine_flg", "customer_id"]]
)

In [None]:
# Prepare data for a supervised regression model that predicts usage counts

# Parse the usage dates and derive a "YYYYMM" year-month key for every log row.
uselog["usedate"] = pd.to_datetime(uselog["usedate"])
uselog["年月"] = uselog["usedate"].dt.strftime("%Y%m")

# Count log rows per (year-month, customer): the log_id count becomes "count",
# and the redundant usedate count is dropped.
uselog_months = (
    uselog.groupby(["年月", "customer_id"], as_index=False)
    .count()
    .rename(columns={"log_id": "count"})
    .drop(columns="usedate")
)
uselog_months.head()

In [None]:
# unique() keeps only the distinct year-month values, in order of appearance.
# The lag lookups below treat that order as chronological
# (uselog_months comes from a groupby, which sorts its keys by default).
year_months = list(uselog_months["年月"].unique())

# Number of preceding months used as explanatory variables (count_0 .. count_5).
n_lags = 6

# Use the most recent half of the months as prediction targets, but never start so early
# that a lag index (i - j) turns negative and silently wraps around to the newest months.
first_target = max((len(year_months) + 1) // 2, n_lags)

monthly_frames = []
for i in range(first_target, len(year_months)):

    # Usage count in the target month becomes the value to predict.
    # rename() without inplace returns a new frame, so the .loc slice is never mutated.
    tmp = uselog_months.loc[uselog_months["年月"] == year_months[i]].rename(
        columns={"count": "count_pred"}
    )

    # Attach the counts of the previous n_lags months: count_0 is one month back,
    # count_5 is six months back. A left merge leaves NaN for months without usage.
    for j in range(1, n_lags + 1):
        tmp_before = (
            uselog_months.loc[uselog_months["年月"] == year_months[i - j]]
            .drop(columns="年月")
            .rename(columns={"count": "count_{}".format(j - 1)})
        )
        tmp = pd.merge(tmp, tmp_before, on="customer_id", how="left")

    monthly_frames.append(tmp)

# Stack all target months vertically in a single concat instead of growing a frame in the loop.
predict_data = (
    pd.concat(monthly_frames, ignore_index=True) if monthly_frames else pd.DataFrame()
)

predict_data.head()

In [None]:
# Drop every row that contains a NaN, then renumber the index from 0.
predict_data = predict_data.dropna().reset_index(drop=True)
predict_data.head()

In [None]:
# Bring in each customer's start_date via a left join on customer_id.
predict_data = predict_data.merge(
    customer[["customer_id", "start_date"]],
    on="customer_id",
    how="left",
)
predict_data.head()

In [None]:
# First day of the target month, and the membership start date, both as datetimes.
predict_data["now_date"] = pd.to_datetime(predict_data["年月"], format="%Y%m")
predict_data["start_date"] = pd.to_datetime(predict_data["start_date"])

from dateutil.relativedelta import relativedelta  # calendar-aware difference expressed in years and months

# Membership period in whole months at each target month.
# The column is built in one assignment: the former per-row chained assignment
# (predict_data["period"][i] = ...) raises SettingWithCopyWarning and is silently ignored
# under pandas Copy-on-Write. It also looked rows up by label i, which only works
# when the index is exactly 0..n-1.
predict_data["period"] = pd.Series(
    [
        delta.years * 12 + delta.months
        for delta in map(relativedelta, predict_data["now_date"], predict_data["start_date"])
    ],
    index=predict_data.index,
    dtype="int64",
)

predict_data.head()

In [None]:
# Keep only the rows whose membership started on or after 2018-04-01.
predict_data = predict_data.loc[predict_data["start_date"] >= pd.to_datetime("20180401")]

from sklearn import linear_model  # provides the linear regression model
import sklearn.model_selection  # provides the train/test split helper

model = linear_model.LinearRegression()  # create the linear regression model

# Explanatory variables: the six lagged monthly counts plus the membership period.
X = predict_data[["count_0","count_1","count_2","count_3","count_4","count_5","period"]]
# Target variable: the usage count of the target month.
y = predict_data["count_pred"]

# Split into training and evaluation sets. random_state fixes the shuffle so the scores and
# coefficients reported below are identical on every run.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=0)
model.fit(X_train, y_train)  # fit the model on the training split

In [None]:
# Print the model's score (R^2 for a regressor) on the training split, then on the held-out split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(model.score(features, target))

In [None]:
# Tabulate each explanatory variable next to its fitted regression coefficient.
coef = pd.DataFrame(
    dict(feature_names=X.columns, coefficient=model.coef_)
)
coef

In [None]:
# Three hypothetical customers. Values follow the column order of X:
# count_0 .. count_5 (usage counts one to six months back), then period.
x1 = [3, 4, 4, 6, 8, 7, 8]
x2 = [2, 2, 3, 3, 4, 6, 8]
x3 = [0, 0, 0, 0, 0, 0, 0]

# Wrap the inputs in a DataFrame with the training column names. The model was fitted on a
# DataFrame, so predicting on bare lists makes scikit-learn warn about missing feature names
# and drops the link between each value and its feature.
x_pred = pd.DataFrame([x1, x2, x3], columns=X.columns)

model.predict(x_pred)

In [None]:
# Write the monthly usage counts to a CSV in the working directory, without the index column.
uselog_months.to_csv(path_or_buf="use_log_months.csv", index=False)