## 関数

In [94]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [95]:
pd.set_option('display.max_rows',10)
pd.set_option('display.max_columns', 10)

In [96]:
class MatrixFactorization():
    def __init__(self, R, k, steps=200, alpha=0.01, lamda=0.001,threshold=0.001):
        self.R = R
        self.m = R.shape[0]#Rの列数
        self.n = R.shape[1]#Rの行数
        self.X=np.arange(0,self.m)
        self.Y=np.arange(0,self.n)
        self.k = k #潜在因子分析の次元
        # initializa U and V
        self.U = np.random.rand(self.m, self.k)#0.0から1の範囲でm×k行列の乱数を発生させる
        self.V = np.random.rand(self.k, self.n)#0.0から1の範囲でk×n行列の乱数を発生させる
        self.alpha = alpha #学習率
        self.lamda = lamda #正則化項の係数
        self.threshold = threshold #誤差の閾値
        self.steps = steps #シャッフルする回数

    def fit(self):
        for step in range(self.steps):
            error = 0
            for i in self.X:
                for j in self.Y:
                    r_ij = self.R[i,j] #r_ij=R[i,j]とする
                    if r_ij != 0: #Rの要素が0以外の値をとる場合更新する
                        err_ij = r_ij - np.dot(self.U[i,:], self.V[:,j]) #誤差err_ijはr_ijからUのi-1行目とVのj-1列目をかけあわせたものを引いた値
                        for q in range(self.k):#潜在因子分析の次元数であるk回以下を繰り返す
                            self.U[i,q] += self.alpha * (err_ij * self.V[q, j] - self.lamda * self.U[i, q])
                            self.V[q, j] += self.alpha * (err_ij * self.U[i, q] - self.lamda * self.V[q, j])

            R_hat = np.dot(self.U, self.V) #UとVを掛け合わせたものがR_hatになる
            for i in self.X:
                for j in self.Y:
                    r_ij = self.R[i, j]
                    r_hat_ij = R_hat[i, j]
                    if r_ij > 0:
                        error += pow(r_ij - r_hat_ij,2)
            error += (self.lamda * np.power(self.U,2).sum()) / 2
            error += (self.lamda * np.power(self.V,2).sum()) / 2

            if error < self.threshold:
                break
        return self.U, self.V

In [98]:
def RMSE2(ar_original,ar_missing,ar_R_hat):#元の欠損する前の行列(array型),欠損した行列,潜在因子行列をかけた行列
    total=0
    count=0
    X=list(range(ar_original.shape[0]))
    Y=list(range(ar_original.shape[1]))
    for i in X:
        for j in Y:
            if ar_missing[i,j]==0:
                count+=1
                total+=(ar_original[i,j]-ar_R_hat[i,j])**2
    score=np.sqrt(total/count)
    return score,count

## データをインポート

In [117]:
df_U_re=pd.read_csv("/Users/kiyopippi/Desktop/卒業研究/データ/18st_data_clustering_result.csv")
df_U__re_original=df_U_re.copy()
df_U_re

Unnamed: 0.1,Unnamed: 0,X0,X1,X2,X3,...,X57,X58,X59,label,clust_label
0,1,0.352611,0.221457,0.278374,-0.189304,...,0.809577,0.246861,0.117991,0,1
1,2,0.594223,0.149320,-0.318934,0.203477,...,-0.315192,0.087283,0.264603,1,2
2,3,0.560403,0.179770,0.409796,0.435317,...,-0.085027,0.246789,0.157109,1,2
3,4,0.399179,0.605403,0.576510,0.323262,...,0.319544,0.042488,0.602797,1,2
4,5,0.026736,0.536513,0.265868,-0.131524,...,0.101072,-0.358740,0.457957,1,2
...,...,...,...,...,...,...,...,...,...,...,...
95,96,0.728603,0.202183,-0.000970,0.251470,...,0.198515,0.158944,0.186788,1,2
96,97,0.255674,0.233315,0.299031,0.753651,...,0.427993,0.042559,-0.167560,0,1
97,98,0.771718,0.622488,0.503788,0.459139,...,-0.175603,0.416238,-0.041832,0,1
98,99,0.729537,0.481352,0.123163,0.088107,...,0.261352,0.341643,0.440966,1,2


In [118]:
df_missing=pd.read_csv("/Users/kiyopippi/Desktop/卒業研究/データ/19st_data.csv")
df_missing_original=df_missing.copy()
df_missing

Unnamed: 0.1,Unnamed: 0,0,1,2,3,...,95,96,97,98,99
0,0,0.000000,4.624362,0.000000,0.000000,...,0.000000,5.095143,0.000000,0.000000,0.000000
1,1,0.000000,3.797731,0.000000,3.508350,...,0.000000,0.000000,0.000000,5.016462,3.607476
2,2,0.000000,4.642896,3.866896,3.578688,...,2.237827,0.000000,0.000000,5.972514,3.750053
3,3,2.602926,3.813785,0.000000,0.000000,...,2.276721,3.273679,5.073993,0.000000,0.000000
4,4,0.000000,0.000000,2.660976,2.935308,...,0.000000,2.379576,4.629793,0.000000,3.451964
...,...,...,...,...,...,...,...,...,...,...,...
95,95,3.654987,3.678589,0.000000,3.505338,...,1.827807,3.117743,5.220586,4.745609,0.000000
96,96,2.261376,3.917438,1.814948,2.653100,...,0.000000,4.738053,4.942063,1.222991,0.000000
97,97,0.000000,0.000000,0.000000,0.000000,...,1.420774,0.000000,4.587968,0.351126,3.852619
98,98,0.000000,3.144206,0.000000,0.000000,...,2.362820,2.133846,0.000000,0.000000,0.000000


In [121]:
#不要な列を削除
# df_missing=df_missing.drop('Unnamed: 0', axis=1)
#ラベルを付与
df_missing["clust_label"]=df_U_re["clust_label"]
df_missing

Unnamed: 0,0,1,2,3,4,...,96,97,98,99,clust_label
0,0.000000,4.624362,0.000000,0.000000,0.000000,...,5.095143,0.000000,0.000000,0.000000,1
1,0.000000,3.797731,0.000000,3.508350,1.906513,...,0.000000,0.000000,5.016462,3.607476,2
2,0.000000,4.642896,3.866896,3.578688,0.000000,...,0.000000,0.000000,5.972514,3.750053,2
3,2.602926,3.813785,0.000000,0.000000,0.000000,...,3.273679,5.073993,0.000000,0.000000,2
4,0.000000,0.000000,2.660976,2.935308,2.694547,...,2.379576,4.629793,0.000000,3.451964,2
...,...,...,...,...,...,...,...,...,...,...,...
95,3.654987,3.678589,0.000000,3.505338,1.615027,...,3.117743,5.220586,4.745609,0.000000,2
96,2.261376,3.917438,1.814948,2.653100,0.000000,...,4.738053,4.942063,1.222991,0.000000,1
97,0.000000,0.000000,0.000000,0.000000,1.306394,...,0.000000,4.587968,0.351126,3.852619,1
98,0.000000,3.144206,0.000000,0.000000,1.328637,...,2.133846,0.000000,0.000000,0.000000,2


In [122]:
#クラスが1のデータ
df_clust1=df_missing[df_missing["clust_label"]==1]
df_clust1

Unnamed: 0,0,1,2,3,4,...,96,97,98,99,clust_label
0,0.000000,4.624362,0.000000,0.000000,0.000000,...,5.095143,0.000000,0.000000,0.000000,1
5,0.985962,0.000000,2.265864,3.072070,1.789524,...,4.746790,0.000000,0.000000,0.000000,1
6,0.000000,4.350833,2.798831,2.287166,0.000000,...,0.000000,0.000000,0.000000,3.463071,1
10,1.698175,3.485206,1.668686,0.000000,1.375632,...,0.000000,0.000000,0.936818,0.000000,1
12,1.442775,4.499162,0.000000,2.537633,1.315037,...,4.412676,5.369317,-0.561408,3.818937,1
...,...,...,...,...,...,...,...,...,...,...,...
89,0.000000,3.564234,0.000000,0.000000,2.116862,...,4.991815,4.806631,0.000000,3.337899,1
91,0.825360,0.000000,1.633139,2.746113,1.015103,...,0.000000,0.000000,2.018394,4.032086,1
92,2.341381,0.000000,0.000000,0.000000,0.000000,...,4.773393,5.087180,0.268381,3.779137,1
96,2.261376,3.917438,1.814948,2.653100,0.000000,...,4.738053,4.942063,1.222991,0.000000,1


In [123]:
#クラスが2のデータ
df_clust2=df_missing[df_missing["clust_label"]==2]
df_clust2

Unnamed: 0,0,1,2,3,4,...,96,97,98,99,clust_label
1,0.000000,3.797731,0.000000,3.508350,1.906513,...,0.000000,0.000000,5.016462,3.607476,2
2,0.000000,4.642896,3.866896,3.578688,0.000000,...,0.000000,0.000000,5.972514,3.750053,2
3,2.602926,3.813785,0.000000,0.000000,0.000000,...,3.273679,5.073993,0.000000,0.000000,2
4,0.000000,0.000000,2.660976,2.935308,2.694547,...,2.379576,4.629793,0.000000,3.451964,2
7,1.709193,2.979889,0.000000,2.905718,0.000000,...,2.784138,0.000000,5.013505,4.003592,2
...,...,...,...,...,...,...,...,...,...,...,...
93,0.000000,3.331255,0.000000,0.000000,0.000000,...,0.000000,3.731903,4.748727,0.000000,2
94,0.000000,0.000000,3.240282,0.000000,2.650623,...,0.000000,5.359602,0.000000,0.000000,2
95,3.654987,3.678589,0.000000,3.505338,1.615027,...,3.117743,5.220586,4.745609,0.000000,2
98,0.000000,3.144206,0.000000,0.000000,1.328637,...,2.133846,0.000000,0.000000,0.000000,2


## クラスター1に対してLFA

In [125]:
# df_clust1=df_clust1.drop("clust_label",axis=1)#ラベルを落とす
df_clust1

Unnamed: 0,0,1,2,3,4,...,96,97,98,99,clust_label
0,0.000000,4.624362,0.000000,0.000000,0.000000,...,5.095143,0.000000,0.000000,0.000000,1
5,0.985962,0.000000,2.265864,3.072070,1.789524,...,4.746790,0.000000,0.000000,0.000000,1
6,0.000000,4.350833,2.798831,2.287166,0.000000,...,0.000000,0.000000,0.000000,3.463071,1
10,1.698175,3.485206,1.668686,0.000000,1.375632,...,0.000000,0.000000,0.936818,0.000000,1
12,1.442775,4.499162,0.000000,2.537633,1.315037,...,4.412676,5.369317,-0.561408,3.818937,1
...,...,...,...,...,...,...,...,...,...,...,...
89,0.000000,3.564234,0.000000,0.000000,2.116862,...,4.991815,4.806631,0.000000,3.337899,1
91,0.825360,0.000000,1.633139,2.746113,1.015103,...,0.000000,0.000000,2.018394,4.032086,1
92,2.341381,0.000000,0.000000,0.000000,0.000000,...,4.773393,5.087180,0.268381,3.779137,1
96,2.261376,3.917438,1.814948,2.653100,0.000000,...,4.738053,4.942063,1.222991,0.000000,1


In [126]:
#クラスが1のデータに対してLFAを行う

# df1_missing=df1_missing.fillna(0) #欠損値を０で穴埋め
ar_clust1=df_clust1.values #データフレームをarray型に変換
mf = MatrixFactorization(ar_clust1,k=30) #array型のデータ,行数,列数,LFA次元,
U,V=mf.fit()
df_U1=pd.DataFrame(U)
df_V1=pd.DataFrame(V)

In [127]:
df_U1

Unnamed: 0,0,1,2,3,4,...,25,26,27,28,29
0,0.079686,0.033289,0.492317,0.588064,0.541874,...,-0.044212,0.217256,0.557216,0.167111,-0.177194
1,0.628211,0.337055,0.145349,0.031329,0.688504,...,0.011696,-0.177089,0.521395,0.152235,0.503228
2,0.143063,0.726400,-0.209496,0.272020,0.295585,...,0.551448,0.373115,-0.288091,-0.097305,0.042125
3,-0.098091,0.033122,-0.251294,0.550607,0.816504,...,0.228577,0.548557,0.123104,0.201430,0.422675
4,-0.169169,0.390187,-0.016114,0.477682,0.373812,...,0.215352,-0.064319,0.401230,0.346571,0.219690
...,...,...,...,...,...,...,...,...,...,...,...
45,-0.158372,0.978210,0.045880,0.211159,0.051771,...,0.167194,0.478479,0.087263,0.828594,0.255207
46,0.361603,0.147831,0.679355,-0.036157,0.492776,...,0.263498,-0.017070,0.175161,-0.367879,1.010409
47,0.103218,0.485067,-0.070165,0.226946,0.533533,...,0.163701,-0.270376,-0.047583,0.222096,0.019902
48,0.506778,0.808827,-0.001230,-0.128204,-0.414561,...,0.580135,0.469881,0.556829,-0.173415,0.434061


In [78]:
df_V1

Unnamed: 0,0,1,2,3,4,...,95,96,97,98,99
0,0.852024,-0.172349,0.066924,0.481262,-0.062720,...,0.458836,0.257984,0.569142,0.590319,0.555624
1,0.576396,0.740439,0.324386,-0.036068,-0.100460,...,0.600651,-0.015471,0.915613,0.677277,0.922684
2,0.273946,0.071584,0.773985,0.406975,-0.113236,...,0.385036,0.131150,0.288410,0.492808,0.336021
3,0.217478,0.424994,0.868672,0.304581,0.410185,...,0.328611,0.219455,1.017378,0.634223,0.342658
4,0.213152,0.279229,0.737280,0.144443,0.197306,...,0.459559,0.415051,0.617319,0.695139,-0.057019
...,...,...,...,...,...,...,...,...,...,...,...
25,0.530172,0.553466,0.774217,0.182299,0.410614,...,0.100725,0.435123,0.636801,0.742553,0.627649
26,0.266963,0.678554,0.516353,0.280104,0.450579,...,0.622449,0.465914,0.505341,0.903103,0.864219
27,0.358606,0.466167,0.067857,0.339411,0.197607,...,0.520550,0.234193,0.588967,0.682257,0.471854
28,0.003060,0.272332,-0.062760,0.638342,-0.055052,...,0.588646,0.555026,0.462007,0.484884,0.546361


In [128]:
ar_U1=df_U1.values
ar_V1=df_V1.values
R_hat1=pd.DataFrame(np.dot(ar_U1,ar_V1))
R_hat1.index=df_clust1.index
R_hat1

Unnamed: 0,0,1,2,3,4,...,96,97,98,99,100
0,1.642146,4.567807,2.209979,2.846447,2.173229,...,5.101649,4.604472,1.624236,3.866327,1.079061
5,0.973382,3.386863,2.291680,3.033658,1.710169,...,4.809698,4.619391,1.432666,4.233173,1.008200
6,2.064236,4.338539,2.754747,2.302293,0.518131,...,4.848278,5.196579,0.878192,3.517165,1.015861
10,1.742176,3.413159,1.822352,3.414630,1.438969,...,4.436443,4.706503,0.973544,3.517785,0.997837
12,1.438501,4.522877,2.479079,2.554054,1.405479,...,4.397832,5.400191,-0.485218,3.795331,0.982063
...,...,...,...,...,...,...,...,...,...,...,...
89,2.272070,3.653214,2.157445,3.404059,2.025798,...,4.953016,4.794082,0.282970,3.390478,1.016802
91,0.849893,3.685934,1.686930,2.639942,1.063314,...,4.354237,5.315849,1.973649,4.004996,1.039892
92,2.257864,4.611080,3.321541,2.904839,0.650647,...,4.823932,5.112832,0.247735,3.725949,1.010925
96,2.273716,3.840665,1.802936,2.659612,1.196786,...,4.707952,4.905870,1.231113,4.008528,0.960266


## クラスター2に対してLFA

In [129]:
df_clust2=df_clust2.drop("clust_label",axis=1)#ラベルを落とす
df_clust2

Unnamed: 0,0,1,2,3,4,...,95,96,97,98,99
1,0.000000,3.797731,0.000000,3.508350,1.906513,...,0.000000,0.000000,0.000000,5.016462,3.607476
2,0.000000,4.642896,3.866896,3.578688,0.000000,...,2.237827,0.000000,0.000000,5.972514,3.750053
3,2.602926,3.813785,0.000000,0.000000,0.000000,...,2.276721,3.273679,5.073993,0.000000,0.000000
4,0.000000,0.000000,2.660976,2.935308,2.694547,...,0.000000,2.379576,4.629793,0.000000,3.451964
7,1.709193,2.979889,0.000000,2.905718,0.000000,...,0.000000,2.784138,0.000000,5.013505,4.003592
...,...,...,...,...,...,...,...,...,...,...,...
93,0.000000,3.331255,0.000000,0.000000,0.000000,...,0.000000,0.000000,3.731903,4.748727,0.000000
94,0.000000,0.000000,3.240282,0.000000,2.650623,...,3.313767,0.000000,5.359602,0.000000,0.000000
95,3.654987,3.678589,0.000000,3.505338,1.615027,...,1.827807,3.117743,5.220586,4.745609,0.000000
98,0.000000,3.144206,0.000000,0.000000,1.328637,...,2.362820,2.133846,0.000000,0.000000,0.000000


In [130]:
#クラスが2のデータに対してLFAを行う

# df1_missing=df1_missing.fillna(0) #欠損値を０で穴埋め
ar_clust2=df_clust2.values #データフレームをarray型に変換
mf = MatrixFactorization(ar_clust2,k=30) #array型のデータ,行数,列数,LFA次元,
U,V=mf.fit()
df_U2=pd.DataFrame(U)
df_V2=pd.DataFrame(V)

In [132]:
df_U2

Unnamed: 0,0,1,2,3,4,...,25,26,27,28,29
0,0.276915,0.063513,0.294051,0.274106,-0.328782,...,0.598729,0.055137,0.596448,0.438151,0.731079
1,1.151027,0.602653,0.345573,-0.027396,0.097234,...,0.016356,0.196667,-0.014737,0.068757,0.612784
2,0.124998,-0.173807,0.700050,-0.026241,0.572438,...,0.266936,0.432804,0.520457,0.721559,-0.430599
3,0.146234,0.320446,0.242571,-0.226430,0.439618,...,0.263302,-0.212002,0.120166,0.226907,0.301189
4,0.149532,0.681303,0.492158,0.263597,-0.203885,...,0.468525,-0.458832,0.565955,0.051897,0.224080
...,...,...,...,...,...,...,...,...,...,...,...
45,0.569860,-0.229343,0.726421,0.641547,0.046865,...,-0.010007,-0.184498,0.072084,-0.098496,0.558546
46,0.313030,0.058071,0.621999,0.044391,0.397682,...,0.227036,0.165553,0.085635,-0.114652,0.118671
47,0.537359,0.506303,-0.158054,0.123534,0.269140,...,0.061851,0.125374,0.621359,0.656192,0.672956
48,0.021787,0.531886,0.878955,0.028482,0.764731,...,0.730662,0.222253,0.468690,0.560460,0.292551


In [83]:
df_V2

Unnamed: 0,0,1,2,3,4,...,95,96,97,98,99
0,0.447521,0.575443,0.330277,0.448227,-0.244201,...,0.370217,0.436990,0.993854,-0.061010,-0.111398
1,0.250234,0.529699,0.143627,0.745566,-0.158905,...,0.030164,0.558719,0.256132,0.480681,0.319252
2,0.610340,0.319147,0.090187,0.460923,0.411520,...,0.466259,0.877983,0.855342,0.104079,0.675561
3,-0.094671,0.738350,0.064329,0.791248,0.153629,...,-0.032329,0.479518,0.323914,0.653902,0.637544
4,-0.090587,0.741774,0.124554,0.377003,-0.097888,...,0.182522,-0.191538,0.801187,-0.032267,0.507776
...,...,...,...,...,...,...,...,...,...,...,...
25,-0.156123,0.296492,0.280671,0.338991,0.235296,...,-0.248319,0.302253,0.319554,0.437571,0.343047
26,-0.004218,0.316873,0.055309,0.050769,0.140755,...,0.422473,1.071746,0.556079,-0.088825,0.282915
27,0.557415,0.668417,0.553532,0.238398,0.495680,...,0.320424,0.958606,0.661462,0.264281,0.043963
28,0.484719,0.767785,0.496602,0.749228,-0.164420,...,0.179568,0.532052,0.294874,0.541093,0.174879


In [133]:
ar_U2=df_U2.values
ar_V2=df_V2.values
R_hat2=pd.DataFrame(np.dot(ar_U2,ar_V2))
R_hat2.index=df_clust2.index
R_hat2

Unnamed: 0,0,1,2,3,4,...,95,96,97,98,99
1,3.287765,3.761080,3.461624,3.512891,1.937669,...,2.254739,3.606888,3.772741,5.031587,3.601894
2,2.598200,4.644520,3.839079,3.561535,2.596485,...,2.213937,3.927189,4.703095,5.977712,3.671568
3,2.618725,3.740805,2.591992,2.850372,2.329956,...,2.304150,3.326429,5.097155,5.145039,3.006108
4,1.115020,3.009419,2.667216,2.946292,2.678522,...,1.440612,2.420382,4.611425,5.593023,3.453438
7,1.745999,3.030759,3.969494,2.917716,1.194853,...,2.773033,2.819780,5.022578,4.984422,3.889840
...,...,...,...,...,...,...,...,...,...,...,...
93,2.883495,3.280966,3.308850,3.479576,2.203548,...,2.823518,3.008665,3.797912,4.752530,3.614719
94,1.966903,3.695437,3.227877,2.202155,2.624965,...,3.329883,2.668443,5.346430,3.663674,3.367783
95,3.637137,3.618847,3.731142,3.471723,1.615295,...,1.908612,3.083782,5.194587,4.844050,3.581496
98,3.300287,3.162984,3.197334,3.173961,1.404979,...,2.374366,2.147476,5.263527,3.728928,1.869331


## データを結合

In [134]:
R_hat=pd.concat([R_hat1,R_hat2])
R_hat

Unnamed: 0,0,1,2,3,4,...,96,97,98,99,100
0,1.642146,4.567807,2.209979,2.846447,2.173229,...,5.101649,4.604472,1.624236,3.866327,1.079061
5,0.973382,3.386863,2.291680,3.033658,1.710169,...,4.809698,4.619391,1.432666,4.233173,1.008200
6,2.064236,4.338539,2.754747,2.302293,0.518131,...,4.848278,5.196579,0.878192,3.517165,1.015861
10,1.742176,3.413159,1.822352,3.414630,1.438969,...,4.436443,4.706503,0.973544,3.517785,0.997837
12,1.438501,4.522877,2.479079,2.554054,1.405479,...,4.397832,5.400191,-0.485218,3.795331,0.982063
...,...,...,...,...,...,...,...,...,...,...,...
93,2.883495,3.280966,3.308850,3.479576,2.203548,...,3.008665,3.797912,4.752530,3.614719,
94,1.966903,3.695437,3.227877,2.202155,2.624965,...,2.668443,5.346430,3.663674,3.367783,
95,3.637137,3.618847,3.731142,3.471723,1.615295,...,3.083782,5.194587,4.844050,3.581496,
98,3.300287,3.162984,3.197334,3.173961,1.404979,...,2.147476,5.263527,3.728928,1.869331,


In [136]:
R_hat=R_hat.sort_index() 
R_hat

Unnamed: 0,0,1,2,3,4,...,96,97,98,99,100
0,1.642146,4.567807,2.209979,2.846447,2.173229,...,5.101649,4.604472,1.624236,3.866327,1.079061
1,3.287765,3.761080,3.461624,3.512891,1.937669,...,3.606888,3.772741,5.031587,3.601894,
2,2.598200,4.644520,3.839079,3.561535,2.596485,...,3.927189,4.703095,5.977712,3.671568,
3,2.618725,3.740805,2.591992,2.850372,2.329956,...,3.326429,5.097155,5.145039,3.006108,
4,1.115020,3.009419,2.667216,2.946292,2.678522,...,2.420382,4.611425,5.593023,3.453438,
...,...,...,...,...,...,...,...,...,...,...,...
95,3.637137,3.618847,3.731142,3.471723,1.615295,...,3.083782,5.194587,4.844050,3.581496,
96,2.273716,3.840665,1.802936,2.659612,1.196786,...,4.707952,4.905870,1.231113,4.008528,0.960266
97,2.214206,4.057497,1.998612,3.567397,1.293396,...,4.982110,4.620964,0.395024,3.846624,0.971960
98,3.300287,3.162984,3.197334,3.173961,1.404979,...,2.147476,5.263527,3.728928,1.869331,


## 評価

In [137]:
df_original=pd.read_csv("/Users/kiyopippi/Desktop/卒業研究/データ/20st_data.csv")

In [138]:
df_original=df_original.drop(['Unnamed: 0',"label"], axis=1)
df_original

Unnamed: 0,0,1,2,3,4,...,95,96,97,98,99
0,1.733201,4.624362,2.448859,2.928745,0.980263,...,0.851529,5.095143,4.169098,0.719596,3.332716
1,2.036672,3.797731,2.652985,3.508350,1.906513,...,2.146242,2.745636,4.170192,5.016462,3.607476
2,3.269514,4.642896,3.866896,3.578688,2.889829,...,2.237827,2.299335,5.014963,5.972514,3.750053
3,2.602926,3.813785,3.747876,3.367899,2.002133,...,2.276721,3.273679,5.073993,5.597362,2.672265
4,3.359984,4.144370,2.660976,2.935308,2.694547,...,2.668287,2.379576,4.629793,4.509458,3.451964
...,...,...,...,...,...,...,...,...,...,...,...
95,3.654987,3.678589,4.249877,3.505338,1.615027,...,1.827807,3.117743,5.220586,4.745609,3.660968
96,2.261376,3.917438,1.814948,2.653100,1.635291,...,1.218367,4.738053,4.942063,1.222991,4.668430
97,2.654306,2.566025,2.244125,4.055539,1.306394,...,1.420774,4.117496,4.587968,0.351126,3.852619
98,2.619232,3.144206,3.201024,4.031691,1.328637,...,2.362820,2.133846,5.062660,5.285915,4.872720


In [139]:
df_missing=df_missing.drop("clust_label", axis=1)
df_missing

Unnamed: 0,0,1,2,3,4,...,95,96,97,98,99
0,0.000000,4.624362,0.000000,0.000000,0.000000,...,0.000000,5.095143,0.000000,0.000000,0.000000
1,0.000000,3.797731,0.000000,3.508350,1.906513,...,0.000000,0.000000,0.000000,5.016462,3.607476
2,0.000000,4.642896,3.866896,3.578688,0.000000,...,2.237827,0.000000,0.000000,5.972514,3.750053
3,2.602926,3.813785,0.000000,0.000000,0.000000,...,2.276721,3.273679,5.073993,0.000000,0.000000
4,0.000000,0.000000,2.660976,2.935308,2.694547,...,0.000000,2.379576,4.629793,0.000000,3.451964
...,...,...,...,...,...,...,...,...,...,...,...
95,3.654987,3.678589,0.000000,3.505338,1.615027,...,1.827807,3.117743,5.220586,4.745609,0.000000
96,2.261376,3.917438,1.814948,2.653100,0.000000,...,0.000000,4.738053,4.942063,1.222991,0.000000
97,0.000000,0.000000,0.000000,0.000000,1.306394,...,1.420774,0.000000,4.587968,0.351126,3.852619
98,0.000000,3.144206,0.000000,0.000000,1.328637,...,2.362820,2.133846,0.000000,0.000000,0.000000


In [140]:
R_hat

Unnamed: 0,0,1,2,3,4,...,96,97,98,99,100
0,1.642146,4.567807,2.209979,2.846447,2.173229,...,5.101649,4.604472,1.624236,3.866327,1.079061
1,3.287765,3.761080,3.461624,3.512891,1.937669,...,3.606888,3.772741,5.031587,3.601894,
2,2.598200,4.644520,3.839079,3.561535,2.596485,...,3.927189,4.703095,5.977712,3.671568,
3,2.618725,3.740805,2.591992,2.850372,2.329956,...,3.326429,5.097155,5.145039,3.006108,
4,1.115020,3.009419,2.667216,2.946292,2.678522,...,2.420382,4.611425,5.593023,3.453438,
...,...,...,...,...,...,...,...,...,...,...,...
95,3.637137,3.618847,3.731142,3.471723,1.615295,...,3.083782,5.194587,4.844050,3.581496,
96,2.273716,3.840665,1.802936,2.659612,1.196786,...,4.707952,4.905870,1.231113,4.008528,0.960266
97,2.214206,4.057497,1.998612,3.567397,1.293396,...,4.982110,4.620964,0.395024,3.846624,0.971960
98,3.300287,3.162984,3.197334,3.173961,1.404979,...,2.147476,5.263527,3.728928,1.869331,


In [141]:
ar_original=df_original.values
ar_missing=df_missing.values
ar_R_hat=R_hat.values
score,count=RMSE2(ar_original,ar_missing,ar_R_hat)
print(score)

0.8034424858293303


In [63]:
print(count)

5000
