# 前提

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from keras.utils import np_utils 
import tqdm

In [11]:
pd.set_option('display.max_rows',10)
pd.set_option('display.max_columns', 10)

## モデル

In [2]:
class MatrixFactorization():
    def __init__(self,R,lamda,k=30, steps=200, alpha=0.001,threshold=0.001):
        self.R = R
        self.m = R.shape[0]#Rの列数
        self.n = R.shape[1]#Rの行数
        self.X=np.arange(0,self.m)
        self.Y=np.arange(0,self.n)
        self.k = k #潜在因子分析の次元
        # initializa U and V
        self.U = np.random.rand(self.m, self.k)#0.0から1の範囲でm×k行列の乱数を発生させる
        self.V = np.random.rand(self.k, self.n)#0.0から1の範囲でk×n行列の乱数を発生させる
        self.alpha = alpha #学習率
        self.lamda = lamda #正則化項の係数
        self.threshold = threshold #誤差の閾値
        self.steps = steps #シャッフルする回数
        
    def shuffle_in_unison_scary(self, a, b):
        rng_state = np.random.get_state()
        np.random.shuffle(a)
        np.random.set_state(rng_state)
        np.random.shuffle(b)


    def fit(self):
        for step in tqdm.tqdm(range(self.steps)):
            error = 0
            self.shuffle_in_unison_scary(self.X,self.Y)
            for i in self.X:
                for j in self.Y:
                    r_ij = self.R[i,j] #r_ij=R[i,j]とする
                    if r_ij != 0: #Rの要素が0以外の値をとる場合更新する
                        err_ij = r_ij - np.dot(self.U[i,:], self.V[:,j]) #誤差err_ijはr_ijからUのi-1行目とVのj-1列目をかけあわせたものを引いた値
                        for q in range(self.k):#潜在因子分析の次元数であるk回以下を繰り返す
                            delta_u = self.alpha * (err_ij * self.V[q, j] - self.lamda * self.U[i, q])
                            delta_v = self.alpha * (err_ij * self.U[i, q] - self.lamda * self.V[q, j])
                            self.U[i, q] += delta_u
                            self.V[q, j] += delta_v

            R_hat = np.dot(self.U, self.V) #UとVを掛け合わせたものがR_hatになる
            for i in self.X:
                for j in self.Y:
                    r_ij = self.R[i, j]
                    r_hat_ij = R_hat[i, j]
                    if r_ij > 0:
                        error += pow(r_ij - r_hat_ij,2)/2
            error += (self.lamda * np.power(self.U,2).sum()) / 2
            error += (self.lamda * np.power(self.V,2).sum()) / 2

            if error < self.threshold:
                break
        return self.U, self.V
    

In [3]:
def MSE(ar_original,ar_R_hat):#元の欠損する前の行列(array型),欠損した行列,潜在因子行列をかけた行列
    total=0
    x=ar_original.shape[0]
    y=ar_original.shape[1]
    count=x*y
    X=list(range(ar_original.shape[0]))
    Y=list(range(ar_original.shape[1]))
    for i in X:
        for j in Y:    
            total+=(ar_original[i,j]-ar_R_hat[i,j])**2
    score=total/count
    return score

## データ

In [4]:
df_U_re=pd.read_csv("/Users/kiyopippi/Desktop/卒業研究/データ/df_U1_DCALF-A.csv")
df_U_re_original=df_U_re.copy()
df_U_re=df_U_re.drop('Unnamed: 0', axis=1)
df_U_re

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,clust_label
0,-0.776194,0.437198,-0.103642,0.019767,0.634932,0.171638,0.533995,0.321378,0.127031,0.248172,...,0.426148,0.040006,1.031746,0.043912,-0.154125,-0.115832,-0.274640,0.478683,0.204875,0
1,-0.748278,0.090051,0.653877,0.444950,0.499740,-0.257221,0.676333,0.037940,-0.017507,0.210747,...,-0.054997,0.682403,0.057232,0.284580,-0.162979,-0.035921,-0.254125,-0.179457,-0.593319,0
2,-0.542917,0.299612,0.057694,1.027113,0.425275,-0.107820,0.559480,0.380643,0.245949,0.661377,...,0.207088,-0.106474,0.859294,-0.397227,-0.512248,-0.120949,0.117253,0.054528,0.091408,0
3,-0.298467,0.000543,0.716676,0.383240,0.187477,-0.137331,0.718066,0.272754,0.048956,1.007064,...,-0.029717,0.204901,0.113503,-0.367598,-0.396539,-0.257223,-0.369554,0.306784,0.059522,0
4,-0.463366,0.205048,0.688357,0.217038,0.566007,0.301679,0.918863,0.626184,0.215334,0.127751,...,0.145664,-0.497055,0.127842,0.052727,-0.042161,0.447632,-0.285111,0.422990,-0.551051,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.955974,-0.083020,0.033183,-0.152089,0.366250,0.506952,0.393748,0.567711,0.009113,-0.243024,...,-0.414320,-0.066834,-0.833512,0.045019,0.791632,0.431426,1.423838,0.140888,0.461703,1
96,0.879364,-1.048718,-0.643509,-0.322638,-0.586922,0.448621,0.105858,-0.088655,-0.489616,0.022569,...,0.020928,0.514098,-0.777845,1.011523,1.568625,0.393459,1.392581,0.191263,1.598570,1
97,0.691868,-0.350752,-0.780082,-0.059912,0.147426,1.010726,-0.211507,-0.293025,-0.094924,0.463143,...,0.347249,-0.058773,-0.364711,0.969961,0.163248,-0.332458,-0.030262,-0.495850,1.055778,1
98,1.334958,0.091285,0.082992,0.204670,-0.240532,0.268468,0.293798,-0.260933,0.193247,0.170408,...,-0.183126,-0.337216,-0.284936,0.797769,0.611305,0.052040,1.473372,0.592867,0.542448,1


In [5]:
df_V_re=pd.read_csv("/Users/kiyopippi/Desktop/卒業研究/データ/df_V1_DCALF-A.csv")
df_V_re_original=df_U_re.copy()
df_V_re=df_V_re.drop('Unnamed: 0', axis=1)
df_V_re

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,clust_label
0,-0.613753,0.072744,0.810574,0.049188,0.691440,0.015770,0.445961,-0.325498,0.197818,0.450871,...,0.286020,-0.186250,0.293778,-0.494226,-0.316027,0.006942,-0.723799,-0.022296,-0.523111,0
1,-0.947459,-0.039099,0.276624,0.242442,0.159543,0.145566,0.274417,-0.261109,-0.394931,0.599935,...,0.298961,-0.489198,0.677442,-0.451681,-0.290980,0.374063,-0.231210,0.063485,-0.401356,0
2,-0.785065,0.103358,0.297365,0.229406,0.488954,-0.005628,-0.148089,0.268033,0.241894,0.414650,...,0.372329,0.199757,1.066403,-0.211720,-0.210430,-0.005365,-0.571408,0.058975,-0.606770,0
3,-0.432110,-0.151818,0.314840,0.042047,0.112829,-0.255634,0.506116,-0.045288,-0.045316,0.250611,...,0.057193,0.377503,0.616692,-0.756610,-0.609701,-0.216942,-0.420093,0.510422,-0.690833,0
4,-0.494108,-0.153752,0.666719,0.228438,0.276342,-0.243740,0.134057,-0.136799,-0.083217,0.132662,...,0.141561,0.271044,0.546138,-0.522083,-0.232905,-0.031885,-0.362749,0.457788,-0.139153,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.549235,0.206286,0.257430,-0.869653,0.433207,-0.272453,-0.484729,0.419447,0.277163,0.135272,...,0.343337,0.584292,-0.539992,0.698753,0.939735,0.169263,0.957500,0.323821,1.381395,1
96,0.360736,0.329806,-0.030740,-0.044440,0.097199,-0.404124,-0.742100,-0.067524,0.701303,-0.290360,...,-0.013651,-0.220286,-0.404972,-0.262210,1.328578,0.933200,0.134247,-0.348241,-0.260691,1
97,1.834279,0.175760,-0.234187,0.275187,0.104996,0.202332,-1.156525,-0.450200,-0.560837,-1.228340,...,0.218936,0.288273,-0.151246,0.416205,1.178920,0.755391,0.472070,0.171996,0.624459,1
98,0.441643,0.131161,-0.241934,-0.252010,-0.882369,-0.400748,-0.696318,-0.090696,-0.105356,-0.355699,...,0.103083,0.697143,-0.221526,0.162132,1.016956,0.212997,-0.327472,0.540880,0.141378,1


In [6]:
df_R=pd.read_csv("/Users/kiyopippi/Desktop/卒業研究/データ/df_R_original.csv")
df_R=df_R.copy()
df_R=df_R.drop('Unnamed: 0', axis=1)
df_R

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.559714,0.812428,0.710206,0.651993,0.609536,0.617877,0.532940,0.652764,0.800127,0.564389,...,-0.761396,-0.846736,-0.750626,-0.739786,-0.656829,-0.811927,-0.697173,-0.530023,-0.796716,-0.698486
1,0.572032,0.764582,0.628221,0.704220,0.611889,0.508992,0.656912,0.672572,0.853526,0.603080,...,-0.732137,-0.830688,-0.646520,-0.766070,-0.651219,-0.687922,-0.877642,-0.569005,-0.814217,-0.617772
2,0.502967,0.802246,0.798070,0.826225,0.577499,0.674849,0.529169,0.704960,0.848397,0.700795,...,-0.734004,-0.807551,-0.746108,-0.817309,-0.597117,-0.691291,-0.713042,-0.610497,-0.778149,-0.668229
3,0.700987,0.821149,0.705587,0.649822,0.620625,0.658104,0.757899,0.810213,0.872576,0.745002,...,-0.799188,-0.742605,-0.767490,-0.731359,-0.628645,-0.696565,-0.753379,-0.603139,-0.851030,-0.712632
4,0.611890,0.763657,0.762339,0.757046,0.700803,0.614265,0.778005,0.723566,0.826377,0.707041,...,-0.806639,-0.780769,-0.693236,-0.698622,-0.667373,-0.724201,-0.775825,-0.650256,-0.877638,-0.688848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.526550,-0.699092,-0.626518,-0.675721,-0.539126,-0.498345,-0.516883,-0.634862,-0.754860,-0.490528,...,0.615802,0.724393,0.638425,0.675730,0.577910,0.634300,0.744576,0.488157,0.763653,0.666918
96,-0.476981,-0.640180,-0.531044,-0.595505,-0.520765,-0.434697,-0.518355,-0.522645,-0.727183,-0.504123,...,0.527862,0.572529,0.605004,0.515155,0.547704,0.524223,0.600272,0.464039,0.620292,0.542993
97,-0.499173,-0.585452,-0.621316,-0.646289,-0.548299,-0.459862,-0.554733,-0.489924,-0.567417,-0.521873,...,0.560967,0.527617,0.483696,0.622670,0.559625,0.620760,0.630617,0.518751,0.678182,0.463026
98,-0.454323,-0.705649,-0.647501,-0.634193,-0.610820,-0.564885,-0.606935,-0.607800,-0.735269,-0.504920,...,0.588897,0.677367,0.592426,0.629502,0.552602,0.687910,0.663248,0.554901,0.623403,0.572316


In [7]:
df_R_missing=pd.read_csv("/Users/kiyopippi/Desktop/卒業研究/データ/df_R__missing_original.csv")
df_R_missing=df_R_missing.drop('Unnamed: 0', axis=1)
df_R_missing_original=df_R_missing.copy()
df_R_missing

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,,0.710206,,,,,,,,...,,,,,-0.656829,,-0.697173,,-0.796716,
1,0.572032,,,,,,0.656912,,0.853526,0.603080,...,,-0.830688,,,,,,,,
2,,,,0.826225,,0.674849,,,,0.700795,...,-0.734004,,-0.746108,,,,-0.713042,,,
3,,,,,0.620625,,,0.810213,,0.745002,...,,,,,,,,-0.603139,-0.851030,-0.712632
4,0.611890,0.763657,,0.757046,,,,,0.826377,,...,,,,,-0.667373,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,-0.699092,,-0.675721,-0.539126,,-0.516883,-0.634862,,-0.490528,...,0.615802,,,,,,,,0.763653,0.666918
96,,,-0.531044,,,-0.434697,,,-0.727183,,...,,,0.605004,,0.547704,,,0.464039,,
97,,-0.585452,,-0.646289,,-0.459862,-0.554733,,-0.567417,-0.521873,...,0.560967,0.527617,,,0.559625,0.62076,0.630617,,,
98,,,,-0.634193,-0.610820,-0.564885,,,,-0.504920,...,,,,0.629502,,,,0.554901,,


# 分析

## ユーザーのクラスター

In [8]:
df_R_missing["clust_label"]=df_U_re["clust_label"]
df_R_missing

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,clust_label
0,,,0.710206,,,,,,,,...,,,,-0.656829,,-0.697173,,-0.796716,,0
1,0.572032,,,,,,0.656912,,0.853526,0.603080,...,-0.830688,,,,,,,,,0
2,,,,0.826225,,0.674849,,,,0.700795,...,,-0.746108,,,,-0.713042,,,,0
3,,,,,0.620625,,,0.810213,,0.745002,...,,,,,,,-0.603139,-0.851030,-0.712632,0
4,0.611890,0.763657,,0.757046,,,,,0.826377,,...,,,,-0.667373,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,-0.699092,,-0.675721,-0.539126,,-0.516883,-0.634862,,-0.490528,...,,,,,,,,0.763653,0.666918,1
96,,,-0.531044,,,-0.434697,,,-0.727183,,...,,0.605004,,0.547704,,,0.464039,,,1
97,,-0.585452,,-0.646289,,-0.459862,-0.554733,,-0.567417,-0.521873,...,0.527617,,,0.559625,0.62076,0.630617,,,,1
98,,,,-0.634193,-0.610820,-0.564885,,,,-0.504920,...,,,0.629502,,,,0.554901,,,1


In [12]:
df_clust0=df_R_missing[df_R_missing["clust_label"]==0]
df_clust0=df_clust0.iloc[:,:100]
df_clust0

Unnamed: 0,0,1,2,3,4,...,95,96,97,98,99
0,,,0.710206,,,...,,-0.697173,,-0.796716,
1,0.572032,,,,,...,,,,,
2,,,,0.826225,,...,,-0.713042,,,
3,,,,,0.620625,...,,,-0.603139,-0.851030,-0.712632
4,0.611890,0.763657,,0.757046,,...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
45,0.356379,,0.425630,,,...,,,,-0.627046,
46,,,,,,...,,-0.746277,,,-0.609819
47,,,,,0.498504,...,-0.576253,,-0.483290,,-0.622192
48,0.649502,,0.775132,,0.569830,...,-0.695583,,,,


In [13]:
df_clust1=df_R_missing[df_R_missing["clust_label"]==1]
df_clust1=df_clust1.iloc[:,:100]
df_clust1

Unnamed: 0,0,1,2,3,4,...,95,96,97,98,99
50,,-0.634819,,,,...,,0.658640,,,
51,,,,,,...,,0.724786,,0.663412,0.609895
52,,,,-0.783433,,...,,0.831822,0.554434,0.872717,
53,,,,-0.615423,,...,,,,,0.486473
54,,,,,-0.656588,...,,,,,0.547193
...,...,...,...,...,...,...,...,...,...,...,...
95,,-0.699092,,-0.675721,-0.539126,...,,,,0.763653,0.666918
96,,,-0.531044,,,...,,,0.464039,,
97,,-0.585452,,-0.646289,,...,0.62076,0.630617,,,
98,,,,-0.634193,-0.610820,...,,,0.554901,,


In [14]:
df_clust0=df_clust0.fillna(0) #欠損値を０で穴埋め
ar_clust0=df_clust0.values #データフレームをarray型に変換
df_clust1=df_clust1.fillna(0) #欠損値を０で穴埋め
ar_clust1=df_clust1.values #データフレームをarray型に変換
score_lis=[]
for lam in [0.1,0.01,0.001]:
    mf0 = MatrixFactorization(ar_clust0,lam)
    U0,V0=mf0.fit()
    R_hat0=pd.DataFrame(np.dot(U0,V0))
    R_hat0.index=df_clust0.index
    mf1 = MatrixFactorization(ar_clust1,lam) 
    U1,V1=mf1.fit()
    ar_Rhat1=np.dot(U1,V1)
    R_hat1=pd.DataFrame(np.dot(U1,V1))
    R_hat1.index=df_clust1.index
    R_hat=pd.concat([R_hat0,R_hat1])
    R_hat=R_hat.sort_index()
    ar_R=df_R.values
    ar_R_hat=R_hat.values
    score=MSE(ar_R,ar_R_hat)
    score_lis.append(score)
score_lis

100%|██████████| 200/200 [00:26<00:00,  7.46it/s]
100%|██████████| 200/200 [00:29<00:00,  6.83it/s]
100%|██████████| 200/200 [00:31<00:00,  6.41it/s]
100%|██████████| 200/200 [00:30<00:00,  6.51it/s]
100%|██████████| 200/200 [00:31<00:00,  6.29it/s]
100%|██████████| 200/200 [00:36<00:00,  5.53it/s]


[0.1819658578926847, 0.24609669270758192, 0.25190511139138183]

## サービスのクラスター

In [15]:
df_R_missing1=df_R_missing_original.T
df_R_missing1

Unnamed: 0,0,1,2,3,4,...,95,96,97,98,99
0,,0.572032,,,0.611890,...,,,,,
1,,,,,0.763657,...,-0.699092,,-0.585452,,-0.768439
2,0.710206,,,,,...,,-0.531044,,,-0.783827
3,,,0.826225,,0.757046,...,-0.675721,,-0.646289,-0.634193,
4,,,,0.620625,,...,-0.539126,,,-0.610820,
...,...,...,...,...,...,...,...,...,...,...,...
95,,,,,,...,,,0.620760,,
96,-0.697173,,-0.713042,,,...,,,0.630617,,0.805455
97,,,,-0.603139,,...,,0.464039,,0.554901,
98,-0.796716,,,-0.851030,,...,0.763653,,,,


In [16]:
df_R_missing1["clust_label"]=list(df_V_re["clust_label"])
df_R_missing1

Unnamed: 0,0,1,2,3,4,...,96,97,98,99,clust_label
0,,0.572032,,,0.611890,...,,,,,0
1,,,,,0.763657,...,,-0.585452,,-0.768439,0
2,0.710206,,,,,...,-0.531044,,,-0.783827,0
3,,,0.826225,,0.757046,...,,-0.646289,-0.634193,,0
4,,,,0.620625,,...,,,-0.610820,,0
...,...,...,...,...,...,...,...,...,...,...,...
95,,,,,,...,,0.620760,,,1
96,-0.697173,,-0.713042,,,...,,0.630617,,0.805455,1
97,,,,-0.603139,,...,0.464039,,0.554901,,1
98,-0.796716,,,-0.851030,,...,,,,,1


In [17]:
df_clust0=df_R_missing1[df_R_missing1["clust_label"]==0]
df_clust0=df_clust0.iloc[:,:100]
df_clust0

Unnamed: 0,0,1,2,3,4,...,95,96,97,98,99
0,,0.572032,,,0.611890,...,,,,,
1,,,,,0.763657,...,-0.699092,,-0.585452,,-0.768439
2,0.710206,,,,,...,,-0.531044,,,-0.783827
3,,,0.826225,,0.757046,...,-0.675721,,-0.646289,-0.634193,
4,,,,0.620625,,...,-0.539126,,,-0.610820,
...,...,...,...,...,...,...,...,...,...,...,...
45,,0.564108,,0.689892,,...,-0.520904,-0.502179,-0.453711,-0.546441,
46,,,,,,...,,,-0.549567,,
47,0.676955,,0.631003,,,...,-0.471605,-0.514629,,-0.616416,
48,,0.632221,0.649288,,,...,,,,,


In [18]:
df_clust1=df_R_missing1[df_R_missing1["clust_label"]==1]
df_clust1=df_clust1.iloc[:,:100]
df_clust1

Unnamed: 0,0,1,2,3,4,...,95,96,97,98,99
50,-0.731943,,-0.676416,,-0.692846,...,0.640735,,,0.618869,0.781424
51,-0.650715,-0.599435,-0.773042,,-0.741329,...,,0.408100,,,0.723448
52,,,,,-0.764549,...,0.654379,0.603206,,,0.670692
53,,,,-0.625984,,...,0.655123,,,,
54,-0.507079,,,-0.465597,-0.586124,...,,,,,0.482578
...,...,...,...,...,...,...,...,...,...,...,...
95,,,,,,...,,,0.620760,,
96,-0.697173,,-0.713042,,,...,,,0.630617,,0.805455
97,,,,-0.603139,,...,,0.464039,,0.554901,
98,-0.796716,,,-0.851030,,...,0.763653,,,,


In [19]:
df_clust0=df_clust0.fillna(0) #欠損値を０で穴埋め
ar_clust0=df_clust0.values #データフレームをarray型に変換
df_clust1=df_clust1.fillna(0) #欠損値を０で穴埋め
ar_clust1=df_clust1.values #データフレームをarray型に変換
score_lis=[]
for lam in [0.1,0.01,0.001]:
    mf0 = MatrixFactorization(ar_clust0,lam)
    U0,V0=mf0.fit()
    R_hat0=pd.DataFrame(np.dot(U0,V0))
    R_hat0.index=df_clust0.index
    mf1 = MatrixFactorization(ar_clust1,lam) 
    U1,V1=mf1.fit()
    ar_Rhat1=np.dot(U1,V1)
    R_hat1=pd.DataFrame(np.dot(U1,V1))
    R_hat1.index=df_clust1.index
    R_hat=pd.concat([R_hat0,R_hat1])
    R_hat=R_hat.sort_index()
    ar_R=df_R.values
    ar_R_hat=R_hat.values
    score=MSE(ar_R,ar_R_hat)
    score_lis.append(score)
score_lis

100%|██████████| 200/200 [00:28<00:00,  7.10it/s]
100%|██████████| 200/200 [00:30<00:00,  6.55it/s]
100%|██████████| 200/200 [00:31<00:00,  6.40it/s]
100%|██████████| 200/200 [00:31<00:00,  6.25it/s]
100%|██████████| 200/200 [00:31<00:00,  6.29it/s]
100%|██████████| 200/200 [00:31<00:00,  6.40it/s]


[0.2210038539421482, 0.3256227720516851, 0.364649974673905]