In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD
from IPython.display import display, Image

In [3]:
# Column names for the tab-separated ratings file; because names= is given,
# pandas treats the first line of the file as data rather than a header.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
# Relative path, matching how 'ml-100k/u.item' is loaded later in this
# notebook, so the notebook no longer depends on one machine's absolute
# Windows directory layout.
df = pd.read_csv('ml-100k/u.data', sep='\t', names=columns)
print(df.shape)
df.head()

(100000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Header for u.item: five descriptive fields followed by the genre columns.
columns = (
    ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL']
    + ['unknown', 'Action', 'Adventure', 'Animation', 'Childrens']
    + ['Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy']
    + ['Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance']
    + ['Sci-Fi', 'Thriller', 'War', 'Western']
)

# The file is pipe-delimited and is decoded as latin-1.
movies = pd.read_csv('ml-100k/u.item', names=columns, sep='|', encoding='latin-1')

# Keep only the id -> title mapping and attach the title to every rating row.
movie_names = movies[['item_id', 'movie title']]
c_movies_data = df.merge(movie_names, on='item_id')
print(c_movies_data.shape)
c_movies_data.head()

(100000, 5)


Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [5]:
# User x movie-title rating matrix; (user, title) pairs without a rating are
# filled with 0. pivot_table aggregates with the mean by default, so any
# repeated (user, title) pair is averaged.
rating_crosstab = pd.pivot_table(
    c_movies_data,
    index='user_id',
    columns='movie title',
    values='rating',
    fill_value=0,
)
rating_crosstab.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0


In [6]:
# Flip the matrix so that each row is a movie and each column is a user.
X = rating_crosstab.transpose()
print(X.shape)

(1664, 943)


In [7]:
# Reduce every movie (row of X) to 12 latent components; random_state is
# fixed so repeated runs give the same decomposition.
SVD = TruncatedSVD(random_state=5, n_components=12)
resultant_matrix = SVD.fit_transform(X)
resultant_matrix.shape

(1664, 12)

In [8]:
# Correlation coefficient between every pair of movies in the latent space;
# each row of resultant_matrix is treated as one variable.
corr_mat = np.corrcoef(resultant_matrix, rowvar=True)
print(corr_mat.shape)
corr_mat

(1664, 1664)


array([[ 1.        , -0.11677136,  0.50912612, ...,  0.38185536,
         0.1925393 ,  0.49767897],
       [-0.11677136,  1.        ,  0.05531997, ...,  0.16334854,
         0.52127277,  0.28134103],
       [ 0.50912612,  0.05531997,  1.        , ...,  0.76371825,
         0.43466401,  0.19050485],
       ...,
       [ 0.38185536,  0.16334854,  0.76371825, ...,  1.        ,
         0.18096315,  0.12145833],
       [ 0.1925393 ,  0.52127277,  0.43466401, ...,  0.18096315,
         1.        ,  0.20183785],
       [ 0.49767897,  0.28134103,  0.19050485, ...,  0.12145833,
         0.20183785,  1.        ]])

In [9]:
# Position of the title among the crosstab's movie columns.
rating_crosstab.columns.get_loc(key="Star Wars (1977)")

1398

In [10]:
# Pull out the row of the correlation matrix located at the position of
# "Star Wars (1977)": its correlation with every movie.
col_idx = rating_crosstab.columns.get_loc(key="Star Wars (1977)")
corr_specific = corr_mat[col_idx, :]
print(corr_specific.shape)

(1664,)


In [11]:
# Pair each correlation value with the movie title at the same position.
result = pd.DataFrame(
    dict(corr_specific=corr_specific, Movies=rating_crosstab.columns)
)
print(result.shape)
result.head()

(1664, 2)


Unnamed: 0,corr_specific,Movies
0,0.352088,'Til There Was You (1997)
1,0.419504,1-900 (1994)
2,0.594859,101 Dalmatians (1996)
3,0.720699,12 Angry Men (1957)
4,0.325237,187 (1997)


In [12]:
# Ten rows with the highest correlation to the selected title.
(
    result
    .sort_values(by='corr_specific', ascending=False)
    .head(10)
)

Unnamed: 0,corr_specific,Movies
1398,1.0,Star Wars (1977)
1234,0.98809,Return of the Jedi (1983)
1460,0.942415,Terminator 2: Judgment Day (1991)
1523,0.932997,Toy Story (1995)
1461,0.931761,"Terminator, The (1984)"
1205,0.92504,Raiders of the Lost Ark (1981)
456,0.923516,"Empire Strikes Back, The (1980)"
570,0.915802,"Fugitive, The (1993)"
414,0.914633,Die Hard (1988)
44,0.89327,Aliens (1986)


In [13]:
# Position of the title among the crosstab's movie columns.
rating_crosstab.columns.get_loc(key="Young Guns II (1990)")

1659

In [14]:
# Pull out the row of the correlation matrix located at the position of
# "Young Guns II (1990)": its correlation with every movie.
col_idx_y = rating_crosstab.columns.get_loc(key="Young Guns II (1990)")
corr_specific_y = corr_mat[col_idx_y, :]
print(corr_specific_y.shape)

(1664,)


In [15]:
# Pair each correlation value with the movie title at the same position.
result_y = pd.DataFrame(
    dict(corr_specific=corr_specific_y, Movies=rating_crosstab.columns)
)
print(result_y.shape)
result_y.head()

(1664, 2)


Unnamed: 0,corr_specific,Movies
0,0.449675,'Til There Was You (1997)
1,0.226579,1-900 (1994)
2,0.564496,101 Dalmatians (1996)
3,0.411637,12 Angry Men (1957)
4,0.365342,187 (1997)


In [17]:
# Twenty rows with the highest correlation to the selected title.
(
    result_y
    .sort_values(by='corr_specific', ascending=False)
    .head(20)
)

Unnamed: 0,corr_specific,Movies
1659,1.0,Young Guns II (1990)
1658,0.968831,Young Guns (1988)
355,0.959225,"Crow, The (1994)"
406,0.947714,Desperado (1995)
17,0.94641,Ace Ventura: Pet Detective (1994)
402,0.933038,Demolition Man (1993)
994,0.931419,Money Train (1995)
1586,0.930088,Virtuosity (1995)
1344,0.929833,Sliver (1993)
593,0.929801,"Getaway, The (1994)"


In [18]:
# Position of the title among the crosstab's movie columns.
rating_crosstab.columns.get_loc(key="Yankee Zulu (1994)")

1654

In [27]:
# Pull out the row of the correlation matrix located at the position of
# "Yankee Zulu (1994)": its correlation with every movie.
col_idx_z = rating_crosstab.columns.get_loc(key="Yankee Zulu (1994)")
corr_specific_z = corr_mat[col_idx_z, :]
print(corr_specific_z.shape)

(1664,)


In [28]:
# Pair each correlation value with the movie title at the same position.
result_z = pd.DataFrame(
    dict(corr_specific=corr_specific_z, Movies=rating_crosstab.columns)
)
print(result_z.shape)
result_z.head()

(1664, 2)


Unnamed: 0,corr_specific,Movies
0,0.362138,'Til There Was You (1997)
1,0.229361,1-900 (1994)
2,0.237806,101 Dalmatians (1996)
3,0.375035,12 Angry Men (1957)
4,0.272416,187 (1997)


In [30]:
# Thirty rows with the highest correlation to the selected title.
# NOTE(review): the recorded output lists many titles tied at exactly 1.0,
# presumably very sparsely rated movies whose latent vectors are
# proportional to each other. Consider filtering movies by a minimum number
# of ratings before trusting this ranking; confirm against the data.
(
    result_z
    .sort_values(by='corr_specific', ascending=False)
    .head(30)
)

Unnamed: 0,corr_specific,Movies
699,1.0,Hostile Intentions (1994)
1503,1.0,To Cross the Rubicon (1991)
1446,1.0,"Symphonie pastorale, La (1946)"
1522,1.0,Touki Bouki (Journey of the Hyena) (1973)
1311,1.0,Shadows (Cienie) (1988)
1447,1.0,T-Men (1947)
135,1.0,Baton Rouge (1988)
714,1.0,"Hungarian Fairy Tale, A (1987)"
371,1.0,Daens (1992)
1186,1.0,"Promise, The (Versprechen, Das) (1994)"
