<a href="https://colab.research.google.com/github/Harry-Turner/Python/blob/main/ML_UserProfiles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas

# Column labels for the ratings file, in the order the fields appear on each row.
ratings_features = ["userId", "movieId", "rating", "timestamp"]

# The file is tab-separated; the labels above are used as the column names.
ratings_dataframe = pandas.read_csv(
    "u.data",
    sep="\t",
    names=ratings_features,
    encoding="ISO-8859-1",
)


In [2]:
# Column labels for the movie file: five descriptive fields followed by one
# 0/1 flag column per genre.
movie_features = (
    ["movieId", "movieTitle", "Release", "videoRelease", "IMDB URL"]
    + ["unknown", "Action", "Adventure", "Animation", "Childrens", "Comedy",
       "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
       "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
)

# The file is pipe-separated; the labels above are used as the column names.
movies_dataframe = pandas.read_csv(
    "u.item",
    sep="|",
    names=movie_features,
    encoding="ISO-8859-1",
)

movies_dataframe

Unnamed: 0,movieId,movieTitle,Release,videoRelease,IMDB URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Column labels for the pipe-separated user file.
user_features = ["userId", "age", "gender", "occupation", "zipCode"]

# zipCode is pinned to str so that leading zeros (e.g. "02215") are kept
# explicitly instead of depending on what type inference picks for the column.
users_dataframe = pandas.read_csv("u.user", sep = "|", names = user_features,
                                  dtype = {"zipCode": str},
                                  encoding = "ISO-8859-1")

users_dataframe

Unnamed: 0,userId,age,gender,occupation,zipCode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [4]:
# Number of quantile-based age buckets to create.
NUMBER_OF_GROUPS = 5

# Assign every user to one of NUMBER_OF_GROUPS quantile bins of "age";
# precision = 0 sets the precision used for the bin edge labels.
users_dataframe["age group"] = pandas.qcut(
    users_dataframe["age"],
    NUMBER_OF_GROUPS,
    precision=0,
)

users_dataframe

Unnamed: 0,userId,age,gender,occupation,zipCode,age group
0,1,24,M,technician,85711,"(23.0, 29.0]"
1,2,53,F,other,94043,"(46.0, 73.0]"
2,3,23,M,writer,32067,"(6.0, 23.0]"
3,4,24,M,technician,43537,"(23.0, 29.0]"
4,5,33,F,other,15213,"(29.0, 35.0]"
...,...,...,...,...,...,...
938,939,26,F,student,33319,"(23.0, 29.0]"
939,940,32,M,administrator,02215,"(29.0, 35.0]"
940,941,20,M,student,97229,"(6.0, 23.0]"
941,942,48,F,librarian,78209,"(46.0, 73.0]"


In [11]:
# Attach the selected user attributes to every rating (matched on userId),
# then attach the movie columns (matched on movieId). Both are left joins, so
# every row of ratings_dataframe is kept.
merged_dataframes = (
    ratings_dataframe
    .merge(users_dataframe[["userId", "age group", "gender", "occupation"]],
           on="userId",
           how="left")
    .merge(movies_dataframe,
           on="movieId",
           how="left")
)
merged_dataframes

Unnamed: 0,userId,movieId,rating,timestamp,age group,gender,occupation,movieTitle,Release,videoRelease,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,"(46.0, 73.0]",M,writer,Kolya (1996),24-Jan-1997,,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,"(35.0, 46.0]",F,executive,L.A. Confidential (1997),01-Jan-1997,,...,0,1,0,0,1,0,0,1,0,0
2,22,377,1,878887116,"(23.0, 29.0]",M,writer,Heavyweights (1994),01-Jan-1994,,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,880606923,"(23.0, 29.0]",M,technician,Legends of the Fall (1994),01-Jan-1994,,...,0,0,0,0,0,1,0,0,1,1
4,166,346,1,886397596,"(46.0, 73.0]",M,educator,Jackie Brown (1997),01-Jan-1997,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,880,476,3,880175444,"(6.0, 23.0]",M,student,"First Wives Club, The (1996)",14-Sep-1996,,...,0,0,0,0,0,0,0,0,0,0
99996,716,204,5,879795543,"(35.0, 46.0]",F,administrator,Back to the Future (1985),01-Jan-1985,,...,0,0,0,0,0,0,1,0,0,0
99997,276,1090,1,874795795,"(6.0, 23.0]",M,student,Sliver (1993),01-Jan-1993,,...,0,0,0,0,0,0,0,1,0,0
99998,13,225,2,882399156,"(46.0, 73.0]",M,educator,101 Dalmatians (1996),27-Nov-1996,,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Drop columns that are not useful for the ML model; extra information that
# isn't needed can affect the results.
# The result is assigned instead of dropping in place, and labels that are
# already gone are skipped, so re-running this cell does not raise a KeyError.
merged_dataframes = merged_dataframes.drop(
    columns=["movieId", "movieTitle", "Release", "timestamp", "unknown",
             "IMDB URL", "videoRelease"],
    errors="ignore",
)

merged_dataframes

Unnamed: 0,userId,rating,age group,gender,occupation,Action,Adventure,Animation,Childrens,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,3,"(46.0, 73.0]",M,writer,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,186,3,"(35.0, 46.0]",F,executive,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
2,22,1,"(23.0, 29.0]",M,writer,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,244,2,"(23.0, 29.0]",M,technician,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
4,166,1,"(46.0, 73.0]",M,educator,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,880,3,"(6.0, 23.0]",M,student,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
99996,716,5,"(35.0, 46.0]",F,administrator,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
99997,276,1,"(6.0, 23.0]",M,student,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99998,13,2,"(46.0, 73.0]",M,educator,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Store the age-group labels as a pandas Categorical column.
merged_dataframes["age group"] = pandas.Categorical(
    merged_dataframes["age group"]
)

merged_dataframes

Unnamed: 0,userId,rating,age group,gender,occupation,Action,Adventure,Animation,Childrens,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,3,"(46.0, 73.0]",M,writer,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,186,3,"(35.0, 46.0]",F,executive,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
2,22,1,"(23.0, 29.0]",M,writer,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,244,2,"(23.0, 29.0]",M,technician,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
4,166,1,"(46.0, 73.0]",M,educator,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,880,3,"(6.0, 23.0]",M,student,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
99996,716,5,"(35.0, 46.0]",F,administrator,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
99997,276,1,"(6.0, 23.0]",M,student,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99998,13,2,"(46.0, 73.0]",M,educator,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# One indicator column per age group. dtype is pinned to uint8 so the columns
# hold 0/1 on every pandas version (pandas 2.0 switched the default to bool).
age_group_dummies = pandas.get_dummies(merged_dataframes["age group"],
                                       dtype = "uint8")

# The age groups come from pandas.qcut, so the dummy column labels are Interval
# objects. Convert them to plain strings (e.g. "(6.0, 23.0]") so the labels are
# the same type as the other, string-named, columns they are later combined with.
age_group_dummies.columns = [str(label) for label in age_group_dummies.columns]

age_group_dummies

Unnamed: 0,"(6.0, 23.0]","(23.0, 29.0]","(29.0, 35.0]","(35.0, 46.0]","(46.0, 73.0]"
0,0,0,0,0,1
1,0,0,0,1,0
2,0,1,0,0,0
3,0,1,0,0,0
4,0,0,0,0,1
...,...,...,...,...,...
99995,1,0,0,0,0
99996,0,0,0,1,0
99997,1,0,0,0,0
99998,0,0,0,0,1


In [16]:
# Store gender as a Categorical column before encoding it.
merged_dataframes["gender"] = pandas.Categorical(merged_dataframes["gender"])

# One indicator column per gender value. dtype is pinned to uint8 so the
# columns hold 0/1 on every pandas version (pandas 2.0 switched the default
# to bool).
gender_dummies = pandas.get_dummies(merged_dataframes["gender"],
                                    dtype = "uint8")

gender_dummies

Unnamed: 0,F,M
0,0,1
1,1,0
2,0,1
3,0,1
4,0,1
...,...,...
99995,0,1
99996,1,0
99997,0,1
99998,0,1


In [22]:
# Store occupation as a Categorical column before encoding it.
merged_dataframes["occupation"] = pandas.Categorical(merged_dataframes["occupation"])

# One indicator column per occupation. dtype is pinned to uint8 so the columns
# hold 0/1 on every pandas version (pandas 2.0 switched the default to bool).
occupation_dummies = pandas.get_dummies(merged_dataframes["occupation"],
                                        dtype = "uint8")

occupation_dummies

Unnamed: 0,administrator,artist,doctor,educator,engineer,entertainment,executive,healthcare,homemaker,lawyer,...,marketing,none,other,programmer,retired,salesman,scientist,student,technician,writer
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99998,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Append the three sets of indicator columns to the right of the existing
# columns; rows are matched on the index.
merged_dataframes = pandas.concat(
    [merged_dataframes, age_group_dummies, gender_dummies, occupation_dummies],
    axis="columns",
)

merged_dataframes

Unnamed: 0,userId,rating,age group,gender,occupation,Action,Adventure,Animation,Childrens,Comedy,...,marketing,none,other,programmer,retired,salesman,scientist,student,technician,writer
0,196,3,"(46.0, 73.0]",M,writer,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,186,3,"(35.0, 46.0]",F,executive,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,1,"(23.0, 29.0]",M,writer,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
3,244,2,"(23.0, 29.0]",M,technician,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,166,1,"(46.0, 73.0]",M,educator,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,880,3,"(6.0, 23.0]",M,student,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
99996,716,5,"(35.0, 46.0]",F,administrator,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
99997,276,1,"(6.0, 23.0]",M,student,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99998,13,2,"(46.0, 73.0]",M,educator,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Remove the original categorical columns now that their indicator columns
# have been appended.
# The result is assigned instead of dropping in place, and labels that are
# already gone are skipped, so re-running this cell does not raise a KeyError.
merged_dataframes = merged_dataframes.drop(
    columns=["age group", "gender", "occupation"],
    errors="ignore",
)

merged_dataframes

Unnamed: 0,userId,rating,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,...,marketing,none,other,programmer,retired,salesman,scientist,student,technician,writer
0,196,3,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,186,3,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,244,2,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,166,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,880,3,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99996,716,5,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99997,276,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99998,13,2,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Count the missing values in each column to confirm that none are present.
merged_dataframes.isna().sum()

userId           0
rating           0
Action           0
Adventure        0
Animation        0
Childrens        0
Comedy           0
Crime            0
Documentary      0
Drama            0
Fantasy          0
Film-Noir        0
Horror           0
Musical          0
Mystery          0
Romance          0
Sci-Fi           0
Thriller         0
War              0
Western          0
(6.0, 23.0]      0
(23.0, 29.0]     0
(29.0, 35.0]     0
(35.0, 46.0]     0
(46.0, 73.0]     0
F                0
M                0
administrator    0
artist           0
doctor           0
educator         0
engineer         0
entertainment    0
executive        0
healthcare       0
homemaker        0
lawyer           0
librarian        0
marketing        0
none             0
other            0
programmer       0
retired          0
salesman         0
scientist        0
student          0
technician       0
writer           0
dtype: int64