In [1]:
# import dependencies
import pandas as pd
import nltk

# Uncomment the following line the first time you run this notebook:
# nltk.download('punkt')

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Column labels for the ratings file; they are passed via `names`, so the
# file is read without treating any row as a header.
r_cols = ["user id", "item id", "rating", "timestamp"]
ratings = pd.read_csv(
    '../ml-100k/u.data',
    sep='\t',
    header=None,
    names=r_cols,
    encoding='latin-1',
)
ratings.head()

Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
# Column labels for the pipe-separated user table; they are passed via
# `names`, so no row of the file is treated as a header.
u_cols = ["user id", "age", "gender", "occupation", "zip code"]
users = pd.read_csv(
    '../ml-100k/u.user',
    sep='|',
    header=None,
    names=u_cols,
    encoding='latin-1',
)
users.head()

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
# Summary of the occupation column: row count, number of distinct values,
# and the most frequent value with its frequency.
users["occupation"].describe()

count         943
unique         21
top       student
freq          196
Name: occupation, dtype: object

In [5]:
# One-hot encode the gender column: one indicator column per gender value.
# The guard makes the cell safe to re-run; once `gender` has been replaced by
# its indicator columns there is nothing left to encode.
if "gender" in users.columns:
    gender = pd.get_dummies(users["gender"])

    # Swap the raw column for its indicators, building a new frame instead of
    # mutating `users` in place.
    users = pd.concat([users.drop(columns=["gender"]), gender], axis=1)

users.head()

Unnamed: 0,user id,age,occupation,zip code,F,M
0,1,24,technician,85711,0,1
1,2,53,other,94043,1,0
2,3,23,writer,32067,0,1
3,4,24,technician,43537,0,1
4,5,33,other,15213,1,0


In [None]:
# Column labels for the pipe-separated item table: five descriptive fields
# followed by one flag column per genre.
i_cols = [
    "item id", "movie title", "release date", "video release date", "IMDb URL",
    "unknown", "action", "adventure", "animation", "children's", "comedy",
    "crime", "documentary", "drama", "fantasy", "film-noir", "horror",
    "musical", "mystery", "romance", "sci-fi", "thriller", "war", "western",
]
items = pd.read_csv(
    '../ml-100k/u.item',
    sep='|',
    header=None,
    names=i_cols,
    encoding='latin-1',
)
items.head()

In [None]:
# Make each user id a unique row?

# merge movies and ratings
# The item table loaded above is named `items` (no `movies` frame exists), and
# its columns are "item id" / "movie title" plus one flag column per genre.
# The cells that follow read `userId`, `movieId`, `title` and a pipe-delimited
# `genres` string, so build that view of the data before merging.
genre_flags = items.loc[:, "unknown":"western"]
movies = pd.DataFrame({
    "movieId": items["item id"],
    "title": items["movie title"],
    # Join the names of the genres whose flag is set, e.g. "action|thriller".
    "genres": genre_flags.apply(
        lambda flags: "|".join(flags[flags == 1].index), axis=1
    ),
})

df = pd.merge(
    ratings.rename(columns={"user id": "userId", "item id": "movieId"}),
    movies,
    on="movieId",
)
df.sort_values("userId").head()

In [None]:
# Copy the trailing parenthesised part of each title (presumably the release
# year) into a new `year` column; titles without parentheses yield NaN.
# The pattern is a raw string because "\(" is not a valid Python string
# escape and triggers a warning in a plain literal. `expand=False` returns a
# Series rather than a one-column DataFrame.
df['year'] = df['title'].str.extract(r'.*\((.*)\).*', expand=False)
df.head()

In [None]:
# prep the year column to convert to int
df = df.replace(to_replace='2006–2007', value='2007')
df = df.fillna(0)
df.head()

In [None]:
# Lower-case the pipe-delimited genre string, then break it into a list of
# individual genre names.
df['genres'] = df['genres'].str.lower().str.split("|")

In [None]:
# Collapse each list of genre names back into one space-separated string.
df['genres'] = df['genres'].map(" ".join)

In [None]:
# Turn the space-separated genre strings into TF-IDF vectors.
# Tokens are split on whitespace only: the vectorizer's default token pattern
# treats punctuation as a separator, which would break "sci-fi" and
# "film-noir" into two unrelated terms and truncate "children's".
v = TfidfVectorizer(token_pattern=r"\S+")
x = v.fit_transform(df['genres'])

# Keep each row's dense vector alongside the row it was computed from.
df['genresVect'] = list(x.toarray())
df.head()

In [None]:
# TODO: consider converting the tags or the genres to numeric features using hashing?

In [None]:
# The rating is the label to predict; cast it to an integer type first.
df = df.astype({'rating': int})
target = df["rating"]

In [None]:
# Cast the identifier columns and the extracted year to integers in one pass.
df = df.astype({'userId': int, 'movieId': int, 'year': int})

In [None]:
# Build the model's feature matrix from the genre TF-IDF vectors.
# `genresVect` holds one array per row; selecting it as a single column gives
# an object-dtype frame that the classifier cannot convert to a 2-D float
# matrix. Expand it into one numeric column per TF-IDF term instead.
# (userId / movieId / year are left out for now, as in the original cell.)
genre_terms = sorted(v.vocabulary_, key=v.vocabulary_.get)  # ordered by column index
data = pd.DataFrame(
    np.vstack(df["genresVect"].tolist()),
    columns=genre_terms,
    index=df.index,
)
feature_names = data.columns
data.head()

In [None]:
# Split features and labels into train and evaluation sets; the fixed seed
# keeps the partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=42,
)

In [None]:
# Fit a 200-tree random forest and report its accuracy on the held-out split.
# The forest is seeded with the same value as the train/test split so the
# reported score is reproducible from a fresh kernel.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Keep the fitted forest's feature importance scores for the ranking and plot
# below, and display them.
importances = rf.feature_importances_
importances

In [None]:
# Pair each importance score with its feature name and list the pairs from
# most to least important.
ranked = list(zip(rf.feature_importances_, feature_names))
ranked.sort(reverse=True)
ranked

In [None]:
# Horizontal bar chart of feature importances, one bar per feature.
# No legend is drawn: the bars carry no labelled artists, so a legend call
# would only emit a "no artists with labels" warning.
fig, ax = plt.subplots()
sns.barplot(x=importances, y=feature_names, ax=ax)
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Features by Importance")
plt.show()