> Igor Sorochan DSU-31
## Домашнее задание по теме «Рекомендации на основе содержания»


In [1]:
import os
import numpy as np
import pandas as pd


make_dataset.py, clean.py, build_features.py, train_model.py, predict_model.py и evaluate_model.py

In [2]:
def load_data(dir_path: str) -> tuple:
    """Load data from csv files"""
    os.chdir(path=dir_path)
    os.getcwd()
    links = pd.read_csv("links.csv")
    movies = pd.read_csv("movies.csv")
    ratings = pd.read_csv("ratings.csv")
    tags = pd.read_csv("tags.csv")
    return links, movies, ratings, tags

In [3]:
def clean_data(
    links: pd.DataFrame, movies: pd.DataFrame, ratings: pd.DataFrame, tags: pd.DataFrame
) -> tuple:
    """Clean data"""
    # Remove NaN values
    links = links.dropna()
    movies = movies.dropna()
    ratings = ratings.dropna()
    tags = tags.dropna()
    # Remove duplicates
    links = links.drop_duplicates()
    movies = movies.drop_duplicates()
    ratings = ratings.drop_duplicates()
    tags = tags.drop_duplicates()
    return links, movies, ratings, tags

In [4]:
def build_features(
    links: pd.DataFrame, movies: pd.DataFrame, ratings: pd.DataFrame, tags: pd.DataFrame
) -> tuple:
    """Build features"""
    # Create a new feature 'year' from 'title'
    movies["year"] = movies["title"].str.extract(r"(\d{4})")
    # Create a new feature 'genre' from 'genres'
    movies["genre"] = (
        movies["genres"].str.split("|").apply(lambda lst: " ".join(map(str, lst)))
    )
    movies.drop(columns=["genres"], inplace=True)
    # Create a new feature 'tag' from 'tag'
    tags["tag"] = tags["tag"].str.split(" ").apply(lambda lst: " ".join(map(str, lst)))
    tags.drop(columns=["timestamp"], inplace=True)
    return links, movies, ratings, tags

In [21]:
links, movies, ratings, tags = build_features(
    *clean_data(
        *load_data(
            dir_path="/Users/velo1/SynologyDrive/GIT_syno/data/MovieLens _ml-latest-small"
        )
    )
)

In [26]:
print(f"Ratings shape: {ratings.shape}")
print(f"Movies shape: {movies.shape}")
print(f"Tags shape: {tags.shape}")
print(f"Links shape: {links.shape}")

Ratings shape: (100004, 4)
Movies shape: (9125, 4)
Tags shape: (1296, 3)
Links shape: (9112, 3)


`Left join` movies.csv and ratings.csv on movieId, delete `timestamp` column.

In [29]:
movie_ratings = movies.join(ratings.set_index("movieId"), on="movieId").drop(
    "timestamp", axis=1
).dropna()
movie_ratings.shape

(100001, 6)

In [31]:
# movie_ratings[(movie_ratings["userId"] == 15) & (movie_ratings["movieId"] == 7478)]  

Additional right join with tags.csv on  
`userId + movieId`, delete `timestamp` column.

In [34]:
# tags[(tags["movieId"] == 7478) & (tags["userId"] == 15)]


What user watched and rated, what tags he put on the movie.

In [33]:
# tags.groupby(["userId","movieId"]).agg({'userId':'first',
#                             'tag': lambda lst: ' '.join(map(str, lst)),#.replace('nan',""),
# })

In [32]:
# movie_ratings[movie_ratings["movieId"] == 7478]

In [36]:
joined = movie_ratings.join(
    tags.set_index(["userId","movieId"]), on=["userId","movieId"], how="left", rsuffix="_tag"
)#.drop("tag_tag", axis=1)
# joined = joined.join(
#     tags.set_index(["userId"]), on=["userId"], how="right", rsuffix="_tag")#.drtag", axis=1)
joined

Unnamed: 0,movieId,title,year,genre,userId,rating,tag
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy,7.0,3.0,
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy,9.0,4.0,
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy,13.0,5.0,
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy,15.0,2.0,
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy,19.0,3.0,
...,...,...,...,...,...,...,...
9116,161918,Sharknado 4: The 4th Awakens (2016),2016,Action Adventure Horror Sci-Fi,624.0,1.5,
9117,161944,The Last Brickmaker in America (2001),2001,Drama,287.0,5.0,
9119,162542,Rustom (2016),2016,Romance Thriller,611.0,5.0,
9120,162672,Mohenjo Daro (2016),2016,Adventure Drama Romance,611.0,3.0,


|films|users|
|---|---|
|movieId|userId|
|title||
|genre|film_genre|
|year|film_year|
|user_rating|film_rating_mean|
|user_tag|film_tag_count|

In [37]:
high_rates = joined[joined['rating']>4]
high_rates.shape

(23064, 7)

In [39]:
user_pref = high_rates.groupby(['userId']).agg({'userId':'first',
                                                     'tag': lambda lst: ' '.join(map(str, lst)).replace('nan',""),
                                                    #  'rating':('mean', 'count'),
                                                     'movieId': lambda lst: ' '.join(map(str, lst)),
                                                    #  'title': lambda lst: ' '.join(map(str, lst)),
                                                     'year': lambda lst: ' '.join(map(str, lst)),
                                                     'genre': lambda lst: ' '.join(map(str, lst))})
# .sort_values(by= ('rating','count'),ascending=False)
print(user_pref.shape)
user_pref

(665, 5)


Unnamed: 0_level_0,userId,tag,movieId,year,genre
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2.0,2.0,,17 39 150 222 265 266 551 585 589 590 592,1995 1995 1995 1995 1992 1994 1993 1995 1991 1...,Drama Romance Comedy Romance Adventure Drama I...
3.0,3.0,,296 318 356 1197 1721 2959 3949 48783 50068,1994 1994 1994 1987 1997 1999 2000 2006 2006,Comedy Crime Drama Thriller Crime Drama Comedy...
4.0,4.0,...,34 112 141 260 296 349 356 357 364 480 541 588...,1995 1995 1996 1977 1994 1994 1994 1994 1994 1...,Children Drama Action Adventure Comedy Crime C...
5.0,5.0,,277 500 597 1035 1380 1485 1784 1923 2081 2694...,1994 1993 1990 1965 1978 1997 1997 1998 1989 1...,Drama Comedy Drama Comedy Romance Musical Roma...
6.0,6.0,,293 1204 1250 1259 1276 1285 2761 5952 7153 8874,1994 1962 1957 1986 1967 1989 1999 2002 2003 2004,Action Crime Drama Thriller Adventure Drama Wa...
...,...,...,...,...,...
667.0,667.0,,32 41 144 272 296 307 345 501 608 745 1148 1199,1995 1995 1995 1994 1994 1993 1994 1993 1996 1...,Mystery Sci-Fi Thriller Drama War Comedy Comed...
668.0,668.0,,296 593 608 1213 1221 2324 2908 2997,1994 1991 1996 1990 1974 1997 1999 1999,Comedy Crime Drama Thriller Crime Horror Thril...
669.0,669.0,,260 913 1304 2959,1977 1941 1969 1999,Action Adventure Sci-Fi Film-Noir Mystery Acti...
670.0,670.0,,25 47 50 318 527 593 608 1183 2683 2759 2912,1995 1995 1995 1994 1993 1991 1996 1996 1999 1...,Drama Romance Mystery Thriller Crime Mystery T...


In [12]:
# user_pref[user_pref['userId','first']==547]['tag','<lambda>']

In [13]:
# user_pref[user_pref['userId','first']==547]['movieId']['<lambda>']

In [20]:
high_rates[(high_rates['userId'] == 547) & ( high_rates['tag'].notnull())]

Unnamed: 0,movieId,title,year,genre,userId,rating,tag
774,954,Mr. Smith Goes to Washington (1939),1939,Drama,547.0,4.5,afi
931,1172,Cinema Paradiso (Nuovo cinema Paradiso) (1989),1989,Drama,547.0,5.0,holes80s
2428,3022,"General, The (1926)",1926,Comedy War,547.0,4.5,afi
6241,37741,Capote (2005),2005,Crime Drama,547.0,4.5,toplist05
6249,38304,No Direction Home: Bob Dylan (2005),2005,Documentary,547.0,4.5,toplist05
...,...,...,...,...,...,...,...
8824,128360,The Hateful Eight (2015),2015,Western,547.0,4.5,toplist15
8828,128606,45 Years (2015),2015,Drama,547.0,4.5,toplist15
8902,134130,The Martian (2015),2015,Adventure Drama Sci-Fi,547.0,4.5,toplist15
9046,148626,"Big Short, The (2015)",2015,Drama,547.0,4.5,toplist15


In [47]:
df = user_pref[['tag','movieId','year','genre']]
df

Unnamed: 0_level_0,tag,movieId,year,genre
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2.0,,17 39 150 222 265 266 551 585 589 590 592,1995 1995 1995 1995 1992 1994 1993 1995 1991 1...,Drama Romance Comedy Romance Adventure Drama I...
3.0,,296 318 356 1197 1721 2959 3949 48783 50068,1994 1994 1994 1987 1997 1999 2000 2006 2006,Comedy Crime Drama Thriller Crime Drama Comedy...
4.0,...,34 112 141 260 296 349 356 357 364 480 541 588...,1995 1995 1996 1977 1994 1994 1994 1994 1994 1...,Children Drama Action Adventure Comedy Crime C...
5.0,,277 500 597 1035 1380 1485 1784 1923 2081 2694...,1994 1993 1990 1965 1978 1997 1997 1998 1989 1...,Drama Comedy Drama Comedy Romance Musical Roma...
6.0,,293 1204 1250 1259 1276 1285 2761 5952 7153 8874,1994 1962 1957 1986 1967 1989 1999 2002 2003 2004,Action Crime Drama Thriller Adventure Drama Wa...
...,...,...,...,...
667.0,,32 41 144 272 296 307 345 501 608 745 1148 1199,1995 1995 1995 1994 1994 1993 1994 1993 1996 1...,Mystery Sci-Fi Thriller Drama War Comedy Comed...
668.0,,296 593 608 1213 1221 2324 2908 2997,1994 1991 1996 1990 1974 1997 1999 1999,Comedy Crime Drama Thriller Crime Horror Thril...
669.0,,260 913 1304 2959,1977 1941 1969 1999,Action Adventure Sci-Fi Film-Noir Mystery Acti...
670.0,,25 47 50 318 527 593 608 1183 2683 2759 2912,1995 1995 1995 1994 1993 1991 1996 1996 1999 1...,Drama Romance Mystery Thriller Crime Mystery T...


In [54]:
df.to_numpy().flatten()

array(['          ', '17 39 150 222 265 266 551 585 589 590 592',
       '1995 1995 1995 1995 1992 1994 1993 1995 1991 1990 1989', ...,
       '1 50 260 318 356 551 589 1035 1136 1148 1196 1198 2291 2571 2804 2997 3114 4034 4306 4886 4896 4963 4973 4993 5445 5952 5991',
       '1995 1995 1977 1994 1994 1993 1991 1965 1975 1993 1980 1981 1990 1999 1983 1999 1999 2000 2001 2001 2001 2001 2001 2001 2002 2002 2002',
       'Adventure Animation Children Comedy Fantasy Crime Mystery Thriller Action Adventure Sci-Fi Crime Drama Comedy Drama Romance War Animation Children Fantasy Musical Action Sci-Fi Musical Romance Adventure Comedy Fantasy Animation Children Comedy Crime Action Adventure Sci-Fi Action Adventure Drama Fantasy Romance Action Sci-Fi Thriller Children Comedy Comedy Drama Fantasy Adventure Animation Children Comedy Fantasy Crime Drama Thriller Adventure Animation Children Comedy Fantasy Romance Adventure Animation Children Comedy Fantasy Adventure Children Fantasy Crime Thriller 

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0, stop_words='english'
                        , max_features=10000)
X_train_tfidf_tag = tfidf.fit_transform(df.to_numpy().flatten())
                                            
X_train_tfidf_tag

<2660x4435 sparse matrix of type '<class 'numpy.float64'>'
	with 42621 stored elements in Compressed Sparse Row format>

In [69]:
neigh_tag = NearestNeighbors(n_neighbors=7, metric='euclidean') 
neigh_tag.fit(X_train_tfidf_tag)

In [60]:
pd.DataFrame(X_train_tfidf_tag.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,10,100,1000,100106,100383,1005,100553,100556,100714,100843,...,war,warmerdam,washington,weird,western,whimsical,world,wrongful,york,zombies
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.113562,0.0,0.0,0.0,0.289537,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.212013,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


Is input for inference we should get a list of films and tags.

In [104]:
to_predict = tfidf.transform(['pixar'])
res = neigh_tag.kneighbors(to_predict, return_distance=True)
index = res[1][0]

movies.loc[index]

Unnamed: 0,movieId,title,year,genre
8,9,Sudden Death (1995),1995,Action
4,5,Father of the Bride Part II (1995),1995,Comedy
16,17,Sense and Sensibility (1995),1995,Drama Romance
12,13,Balto (1995),1995,Adventure Animation Children
20,21,Get Shorty (1995),1995,Comedy Crime Thriller
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy
24,25,Leaving Las Vegas (1995),1995,Drama Romance
