In [30]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the basic information about the dataset
u_info = pd.read_csv('../data/raw/ml-100k/u.info', sep=' ', names=['count', 'type'])

u_info

Unnamed: 0,count,type
0,943,users
1,1682,items
2,100000,ratings


In [37]:
# Displaying the list of genres and their codes
u_genre = pd.read_csv('../data/raw/ml-100k/u.genre', sep='|', names=['genre', 'code'])
u_genre

Unnamed: 0,genre,code
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4
5,Comedy,5
6,Crime,6
7,Documentary,7
8,Drama,8
9,Fantasy,9


In [39]:
# Displaying the list of occupations
u_occupation = pd.read_csv('../data/raw/ml-100k/u.occupation')
print(u_occupation)

    administrator
0          artist
1          doctor
2        educator
3        engineer
4   entertainment
5       executive
6      healthcare
7       homemaker
8          lawyer
9       librarian
10      marketing
11           none
12          other
13     programmer
14        retired
15       salesman
16      scientist
17        student
18     technician
19         writer


# Invistigating the Data file

In [40]:
# Read the data files
u_base = pd.read_csv('../data/raw/ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
print(u_base.head())

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


In [41]:
u_base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   item_id    100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [42]:
# Check if there is null values
u_base.isnull().sum()

user_id      0
item_id      0
rating       0
timestamp    0
dtype: int64

# Invistigating the Items Data

In [25]:
columns_u_item = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDB_URL',
                  'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
                  'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                  'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

u_item = pd.read_csv('../data/raw/ml-100k/u.item', sep='|', names=columns_u_item, encoding='latin-1')
u_item.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,IMDB_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [26]:
u_item.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie_id            1682 non-null   int64  
 1   title               1682 non-null   object 
 2   release_date        1681 non-null   object 
 3   video_release_date  0 non-null      float64
 4   IMDB_URL            1679 non-null   object 
 5   unknown             1682 non-null   int64  
 6   Action              1682 non-null   int64  
 7   Adventure           1682 non-null   int64  
 8   Animation           1682 non-null   int64  
 9   Children            1682 non-null   int64  
 10  Comedy              1682 non-null   int64  
 11  Crime               1682 non-null   int64  
 12  Documentary         1682 non-null   int64  
 13  Drama               1682 non-null   int64  
 14  Fantasy             1682 non-null   int64  
 15  Film-Noir           1682 non-null   int64  
 16  Horror

In [27]:
# Converting the release_date column to datetime
u_item['release_date'] = pd.to_datetime(u_item['release_date'])

# Print the type of the release_date column
print(u_item['release_date'].dtype)

datetime64[ns]


In [28]:
# Print the shape of the items dataset and the number of null values in each column
print(u_item.shape)
print(u_item.isnull().sum())

(1682, 24)
movie_id                 0
title                    0
release_date             1
video_release_date    1682
IMDB_URL                 3
unknown                  0
Action                   0
Adventure                0
Animation                0
Children                 0
Comedy                   0
Crime                    0
Documentary              0
Drama                    0
Fantasy                  0
Film-Noir                0
Horror                   0
Musical                  0
Mystery                  0
Romance                  0
Sci-Fi                   0
Thriller                 0
War                      0
Western                  0
dtype: int64


In [29]:
# Drop the video release date column since it has no values
u_item.drop('video_release_date', axis=1, inplace=True)

# Fill the null values in the release date column with the mode
u_item['release_date'].fillna(u_item['release_date'].mean(), inplace=True)

# Replace the Nan values in the IMDB URL column with an empty string
u_item['IMDB_URL'].fillna('', inplace=True)

# Making sure there is no null values remaining
print("Null Values count: ", u_item.isnull().sum().sum())

Null Values count:  0


# Invistigating the Users Data

In [34]:
columns_u_user = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
u_user = pd.read_csv('../data/raw/ml-100k/u.user', sep='|', names=columns_u_user)
u_user.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [35]:
u_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     943 non-null    int64 
 1   age         943 non-null    int64 
 2   gender      943 non-null    object
 3   occupation  943 non-null    object
 4   zip_code    943 non-null    object
dtypes: int64(2), object(3)
memory usage: 37.0+ KB


In [36]:
# Checking for null values
u_user.isnull().sum()

user_id       0
age           0
gender        0
occupation    0
zip_code      0
dtype: int64