> Igor Sorochan DSU-31
## Домашнее задание по теме «Рекомендации на основе содержания»


In [36]:
import os
import numpy as np
import pandas as pd


make_dataset.py, clean.py, build_features.py, train_model.py, predict_model.py и evaluate_model.py

In [37]:
def load_data(dir_path: str) -> tuple:
    """Load data from csv files"""
    os.chdir(path=dir_path)
    os.getcwd()
    links = pd.read_csv("links.csv")
    movies = pd.read_csv("movies.csv")
    ratings = pd.read_csv("ratings.csv")
    tags = pd.read_csv("tags.csv")
    return links, movies, ratings, tags

In [38]:
def clean_data(
    links: pd.DataFrame, movies: pd.DataFrame, ratings: pd.DataFrame, tags: pd.DataFrame
) -> tuple:
    """Clean data"""
    # Remove NaN values
    links = links.dropna()
    movies = movies.dropna()
    ratings = ratings.dropna()
    tags = tags.dropna()
    # Remove duplicates
    links = links.drop_duplicates()
    movies = movies.drop_duplicates()
    ratings = ratings.drop_duplicates()
    tags = tags.drop_duplicates()
    return links, movies, ratings, tags

In [39]:
def build_features(
    links: pd.DataFrame, movies: pd.DataFrame, ratings: pd.DataFrame, tags: pd.DataFrame
) -> tuple:
    """Build features"""
    # Create a new feature 'year' from 'title'
    movies["year"] = movies["title"].str.extract(r"(\d{4})")
    movies["genre"] = (
        movies["genres"]
        .str.split("|")
        .apply(lambda lst: " ".join(lst))
    )
    movies.drop(columns=["genres"], inplace=True)
    # Create a new feature 'tag' from 'tag'
    tags["tag"] = (
        tags["tag"].str.split(" ").apply(lambda lst: " ".join(lst))
    )
    tags.drop(columns=["timestamp"], inplace=True)
    return links, movies, ratings, tags

In [40]:
links, movies, ratings, tags = build_features(
    *clean_data(
        *load_data(
            dir_path="/Users/velo1/SynologyDrive/GIT_syno/data/MovieLens _ml-latest-small"
        )
    )
)

In [41]:
print(f"Ratings shape: {ratings.shape}")
print(f"Movies shape: {movies.shape}")
print(f"Tags shape: {tags.shape}")
print(f"Links shape: {links.shape}")

Ratings shape: (100004, 4)
Movies shape: (9125, 4)
Tags shape: (1296, 3)
Links shape: (9112, 3)


`Left join` movies.csv and ratings.csv on movieId, delete `timestamp` column.

In [42]:
movie_ratings = movies.join(ratings.set_index("movieId"), on="movieId").drop(
    "timestamp", axis=1
).dropna()
movie_ratings.shape

(100001, 6)

In [43]:
joined = movie_ratings.join(
    tags.set_index(["movieId"]), on=["movieId"], how="left", rsuffix="_tag"
)#.drop(["userId_tag","userId", "title"], axis=1)

joined

Unnamed: 0,movieId,title,year,genre,userId,rating,userId_tag,tag
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy,7.0,3.0,501.0,Pixar
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy,9.0,4.0,501.0,Pixar
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy,13.0,5.0,501.0,Pixar
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy,15.0,2.0,501.0,Pixar
0,1,Toy Story (1995),1995,Adventure Animation Children Comedy Fantasy,19.0,3.0,501.0,Pixar
...,...,...,...,...,...,...,...,...
9116,161918,Sharknado 4: The 4th Awakens (2016),2016,Action Adventure Horror Sci-Fi,624.0,1.5,,
9117,161944,The Last Brickmaker in America (2001),2001,Drama,287.0,5.0,,
9119,162542,Rustom (2016),2016,Romance Thriller,611.0,5.0,,
9120,162672,Mohenjo Daro (2016),2016,Adventure Drama Romance,611.0,3.0,,


In [44]:
movies_grouped = joined.groupby(['movieId']).agg({
    # list of aggregations per column (whitespace separated unique strings)
    'tag': lambda lst: list(set(' '.join(map(str, lst)).replace('nan',"").split(" "))),
    'rating':('mean', 'count'),
    # retrieve the first value of the groupby object
    'title': 'first',
    'year': 'first',
    # list of aggregations per column (whitespace separated unique strings)
    "genre": lambda lst: list(set(' '.join(map(str, lst)).split(" ")))
    }).sort_values(by= ('rating','count'),ascending=False)
print(movies_grouped.shape)
# set new column names
movies_grouped.columns = ['tag', 'rating_mean', 'rating_count', 'title', 'year', 'genre']
# convert year to int
movies_grouped['year'] = movies_grouped['year'].astype(int)
# convert genre list to string
movies_grouped["genre"] = movies_grouped["genre"].apply(lambda x: ' '.join(x) )
# convert tag list to string
movies_grouped["tag"] = movies_grouped["tag"].apply(lambda x: ' '.join(x) )

movies_grouped.head(10)

(9063, 6)


Unnamed: 0_level_0,tag,rating_mean,rating_count,title,year,genre
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
260,supernatural Lucas nerdy sci-fi powers starwar...,4.221649,7275,Star Wars: Episode IV - A New Hope (1977),1977,Sci-Fi Adventure Action
318,imprisonment Tim Robbins escape Phenomenal! re...,4.487138,2799,"Shawshank Redemption, The (1994)",1994,Crime Drama
1210,Star starship war action Ford sequel Lucas pil...,4.059908,2170,Star Wars: Episode VI - Return of the Jedi (1983),1983,Sci-Fi Adventure Action
47,crime killer horror serial disturbing biblical...,4.034826,2010,Seven (a.k.a. Se7en) (1995),1995,Thriller Mystery
1265,Murray love self comedy reality funny alternat...,3.839394,1650,Groundhog Day (1993),1993,Romance Comedy Fantasy
296,Quentin comedy tarantino Tarantino dark r:viol...,4.256173,1620,Pulp Fiction (1994),1994,Crime Comedy Drama Thriller
4973,love comedy drama notable soundtrack filmed qu...,4.096,1500,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",2001,Romance Comedy
6539,I magic five it it! depp sword comedy pirates ...,3.85461,1410,Pirates of the Caribbean: The Curse of the Bla...,2003,Adventure Fantasy Comedy Action
1732,dialogue Frontal) Bridges comedy (Full Steve N...,3.995833,1200,"Big Lebowski, The (1998)",1998,Crime Comedy
1258,Kubrick Frontal) (Full nicholson Nudity film c...,4.029703,1111,"Shining, The (1980)",1980,Horror


In [45]:
X = movies_grouped[['tag', 'rating_mean', 'rating_count', 'year', 'genre']]
X

Unnamed: 0_level_0,tag,rating_mean,rating_count,year,genre
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
260,supernatural Lucas nerdy sci-fi powers starwar...,4.221649,7275,1977,Sci-Fi Adventure Action
318,imprisonment Tim Robbins escape Phenomenal! re...,4.487138,2799,1994,Crime Drama
1210,Star starship war action Ford sequel Lucas pil...,4.059908,2170,1983,Sci-Fi Adventure Action
47,crime killer horror serial disturbing biblical...,4.034826,2010,1995,Thriller Mystery
1265,Murray love self comedy reality funny alternat...,3.839394,1650,1993,Romance Comedy Fantasy
...,...,...,...,...,...
5356,,0.500000,1,1975,Sci-Fi Horror
54220,,4.000000,1,1969,Drama Comedy
54251,,5.000000,1,2004,Comedy
5353,,2.000000,1,1972,Drama Comedy


In [46]:
# data = pd.get_dummies(X, dtype=int)

In [47]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error


In [87]:
# lr = ridge_regression()
tfidf = TfidfVectorizer()
data = tfidf.fit_transform(X[['tag','genre']].to_numpy().flatten())
data = pd.DataFrame(data.toarray(), columns=tfidf.get_feature_names_out())
data = data.join(X[['rating_mean', 'rating_count', 'year']])
data

Unnamed: 0,1940,80,abigail,acclaimed,accurate,acting,action,activist,adam,adaptation,...,wrongful,ww2,york,you,your,zombies,zooey,rating_mean,rating_count,year
0,0.0,0.0,0.0,0.171721,0.0,0.0,0.058754,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,,,
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.445943,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,3.872470,247.0,1995.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.293721,0.0,0.0,0.0,0.0,0.0,0.0,3.401869,107.0,1995.0
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,3.161017,59.0,1995.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.103094,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,2.384615,13.0,1995.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18121,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,,,
18122,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,,,
18123,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,,,
18124,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop("rating_mean", axis=1),
    data["rating_mean"],
    test_size=0.2,
    random_state=42,
)
rr = Ridge(alpha=5)
rr

In [73]:
rr.fit(X_train, y_train)
y_pred = rr.predict(X_test)
print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)}")
print(f"LinearRegression score: {rr.score(X_test, y_test)}")

RMSE: 0.8706675764124412
LinearRegression score: 0.0615614908181662


In [74]:
rr.coef_

array([ 0.00097808, -0.00100365, -0.30090198, ..., -0.45885132,
       -0.09266944,  0.31766443])

In [75]:
results = pd.DataFrame({'y_pred':y_pred, 'y_test':y_test})
results['delta'] = results['y_pred'] - results['y_test']
results.sort_values(by='delta', ascending=False)

Unnamed: 0_level_0,y_pred,y_test,delta
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
260,10.405806,4.221649,6.184157
53038,3.600544,0.500000,3.100544
60990,3.571438,0.500000,3.071438
7204,3.565416,0.500000,3.065416
26188,3.564412,0.500000,3.064412
...,...,...,...
2650,2.878020,5.000000,-2.121980
7564,2.855940,5.000000,-2.144060
3021,2.838878,5.000000,-2.161122
1771,2.822819,5.000000,-2.177181


In [76]:
# user_pref[user_pref['userId','first']==547]['tag','<lambda>']

In [77]:
# user_pref[user_pref['userId','first']==547]['movieId']['<lambda>']

In [78]:
# high_rates[(high_rates['userId'] == 547) & ( high_rates['tag'].notnull())]

In [79]:
df = user_pref[['tag','movieId','year','genre','ratings']]
df

NameError: name 'user_pref' is not defined

In [None]:
df.to_numpy().flatten()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0, stop_words='english'
                        , max_features=10000)
X_train_tfidf_tag = tfidf.fit_transform(df.to_numpy().flatten())
                                            
X_train_tfidf_tag

In [None]:
neigh_tag = NearestNeighbors(n_neighbors=7, metric='euclidean') 
neigh_tag.fit(X_train_tfidf_tag)

In [None]:
pd.DataFrame(X_train_tfidf_tag.toarray(), columns=tfidf.get_feature_names_out())

Is input for inference we should get a list of films and tags.

In [None]:
to_predict = tfidf.transform(['pixar'])
res = neigh_tag.kneighbors(to_predict, return_distance=True)
index = res[1][0]

movies.loc[index]