# Data Preprocessing 

# Data Preprocessing on given csv files from the MovieLens 25m dataset (https://grouplens.org/datasets/movielens/25m/):

In [1]:
import pandas as pd

In [1]:
import pickle

In [2]:
movies_data = pd.read_csv(r"movies.csv")
print(movies_data.shape)
movies_data.info()
movies_data.head(5)

(62423, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings_data = pd.read_csv(r"ratings.csv")
print(ratings_data.shape)
ratings_data.info()
ratings_data.head(5)

(25000095, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [4]:
tags = pd.read_csv(r"tags.csv")
print(tags.shape)
tags.info()
tags.head(5)

(1093360, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1093360 entries, 0 to 1093359
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   userId     1093360 non-null  int64 
 1   movieId    1093360 non-null  int64 
 2   tag        1093344 non-null  object
 3   timestamp  1093360 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 33.4+ MB


Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [6]:
links = pd.read_csv(r"links.csv")
print(links.shape)
links.info()
links.head(5)

(62423, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  62423 non-null  int64  
 1   imdbId   62423 non-null  int64  
 2   tmdbId   62316 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.4 MB


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [8]:
genome_tags = pd.read_csv(r"genome-tags.csv")
print(genome_tags.shape)
genome_tags.info()
genome_tags.head(5)

(1128, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tagId   1128 non-null   int64 
 1   tag     1128 non-null   object
dtypes: int64(1), object(1)
memory usage: 17.8+ KB


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [9]:
genome_scores = pd.read_csv(r"genome-scores.csv")
print(genome_scores.shape)
genome_scores.info()
genome_scores.head(5)

(15584448, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15584448 entries, 0 to 15584447
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   tagId      int64  
 2   relevance  float64
dtypes: float64(1), int64(2)
memory usage: 356.7 MB


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075


# DataFrames Manipulations

In [None]:
movies_data = movies_data.sort_values(by=['title'])
ratings_data['movieId']=ratings_data.movieId.astype('category')

In [None]:
data = pd.merge(movies_data,ratings_data.groupby('movieId').rating.mean(),on="movieId")
data = pd.merge(data,ratings_data.groupby("movieId").userId.count(),on="movieId")
data.rename(columns={'rating':'avg_rating'},inplace=True)
data.rename(columns={'userId':'num_ratings'},inplace=True)

In [None]:
links.drop(['imdbId'],axis=1,inplace=True)

# Task 3 - Giving top 10 best rated movies - Laplace's rule of succession

### We have a dilemma here, there are a lot of movies that have a perfect 5 rating but only rated by one or few users. While others have a rating like 4.5 but rated by thousands of user. So while suggesting the best rated movies which option should we go for?

### Here comes in the Laplace's rule of succession: Absolute High Rating vs The confidence gained by more data

#### For a movie with perfect 5 rating but only 1 rater. We assume that there are 2 more users who have given a 5 and 1 rating. Now the average rating falls down to (5+5+1)/3 = 3.66. This process is repeated for every other movie and the results according to this new ratings are fetched 

### To our average ratings table we will add one more column at which will calculate the laplace's average:

###  current avg rating + (5+1)/num_ratings

In [57]:
data['lap_avg'] = ((data.avg_rating*data.num_ratings) + 6)/(data.num_ratings+2)

In [58]:
data.head() 

Unnamed: 0,movieId,title,genres,avg_rating,num_ratings,lap_avg
0,208297,"""BLOW THE NIGHT!"" Let's Spend the Night Togeth...",Documentary|Drama,3.0,1,3.0
1,51372,"""Great Performances"" Cats (1998)",Musical,2.896648,179,2.89779
2,136604,#1 Cheerleader Camp (2010),Comedy|Drama,2.111111,9,2.272727
3,183901,#Captured (2017),Horror,3.75,2,3.375
4,195955,#Female Pleasure (2018),Documentary,3.666667,3,3.4


In [39]:
tags.drop(['timestamp'],axis=1,inplace=True)
tags.head()

Unnamed: 0,userId,movieId,tag
0,3,260,classic
1,3,260,sci-fi
2,4,1732,dark comedy
3,4,1732,great dialogue
4,4,7569,so bad it's good


# Pickle the necessary dataframes for future use - App Build and DataBase Creation

In [59]:
pickle.dump(movies_data,open('movies_data.pkl','wb'))
pickle.dump(data,open('avg_ratings.pkl','wb'))
pickle.dump(links,open('links.pkl','wb'))

# Clustering users according to their favourite Genre

In [33]:
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype   
---  ------     -----   
 0   userId     int64   
 1   movieId    category
 2   rating     float64 
 3   timestamp  int64   
dtypes: category(1), float64(1), int64(2)
memory usage: 670.0 MB


In [35]:
ratings_data.nunique()

userId         162541
movieId         59047
rating             10
timestamp    20115267
dtype: int64

## Selecting only those users who have liked a movie because while making favourite genre clusters we dont care about the users who have given a bad rating


In [None]:
ratings_data_good = ratings_data[ratings_data.rating>3.5]

In [40]:
ratings_data_good.nunique()

userId         162342
movieId         40858
rating              3
timestamp    10523451
dtype: int64

In [41]:
ratings_data_good.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
5,1,1088,4.0,1147868495
8,1,1237,5.0,1147868839


In [6]:
ratings_data_good.drop(['rating','timestamp'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [48]:
ratings_data_good.head()

Unnamed: 0,userId,movieId
0,1,296
2,1,307
3,1,665
5,1,1088
8,1,1237


# Grouping the movies in a list which were liked by users

In [7]:
movie_grps = ratings_data_good.groupby('userId')['movieId'].apply(list).reset_index(name='movies')

In [52]:
movie_grps.head(10)

Unnamed: 0,userId,movies
0,1,"[296, 307, 665, 1088, 1237, 1250, 1653, 2351, ..."
1,2,"[110, 150, 151, 236, 260, 318, 333, 349, 356, ..."
2,3,"[1, 29, 32, 50, 111, 172, 214, 260, 293, 296, ..."
3,4,"[296, 541, 589, 924, 1036, 1136, 1196, 1197, 1..."
4,5,"[1, 19, 32, 36, 47, 50, 88, 104, 141, 147, 150..."
5,6,"[260, 318, 527, 593, 608, 858, 902, 912, 913, ..."
6,7,"[17, 28, 58, 150, 232, 265, 296, 306, 307, 308..."
7,8,"[1, 3, 10, 16, 18, 47, 50, 62, 69, 95, 104, 11..."
8,9,"[2, 10, 61, 105, 116, 150, 158, 161, 165, 168,..."
9,10,"[32, 50, 110, 150, 260, 296, 318, 356, 480, 58..."


In [53]:
movie_grps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162342 entries, 0 to 162341
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   userId  162342 non-null  int64 
 1   movies  162342 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.5+ MB


In [63]:
# Movies liked by the user with userId1
movie_grps.movies.iloc[0]

[296,
 307,
 665,
 1088,
 1237,
 1250,
 1653,
 2351,
 2573,
 2632,
 2692,
 2843,
 3448,
 3569,
 3949,
 4144,
 4325,
 4703,
 4973,
 5147,
 5767,
 5878,
 5952,
 6016,
 6370,
 6377,
 6711,
 7209,
 7234,
 7361,
 7365,
 7940,
 8154,
 8327,
 8360,
 8786,
 8973,
 27266,
 32591]

In [126]:
movie_grps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162342 entries, 0 to 162341
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   userId  162342 non-null  int64 
 1   movies  162342 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.5+ MB


In [8]:
total = pd.merge(ratings_data_good,movies_data,on='movieId',how='left')

# Finding the List of all the movies that were liked by a user

In [9]:
total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12452811 entries, 0 to 12452810
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   userId   int64 
 1   movieId  int64 
 2   title    object
 3   genres   object
dtypes: int64(2), object(2)
memory usage: 475.0+ MB


In [11]:
total.drop(['rating','timestamp'],axis=1,inplace=True)

In [10]:
movies_by_users = total.drop(['movieId','genres'],axis=1)
genres_by_users = total.drop(['movieId','title'],axis=1)

In [19]:
genres_by_users.head()

Unnamed: 0,userId,genres
0,1,Comedy|Crime|Drama|Thriller
1,1,Drama
2,1,Comedy|Drama|War
3,1,Drama|Musical|Romance
4,1,Drama


In [11]:
movies_by_users_grouped = movies_by_users.groupby('userId')['title'].apply(list).reset_index(name='movies')

In [None]:
genres_by_users_grouped = genres_by_users.groupby('userId')['genres'].apply(list).reset_index(name='genres')

# All the Movies Liked by a user

In [21]:
movies_by_users_grouped.head()

Unnamed: 0,userId,movies
0,1,"[Pulp Fiction (1994), Three Colors: Blue (Troi..."
1,2,"[Braveheart (1995), Apollo 13 (1995), Rob Roy ..."
2,3,"[Toy Story (1995), City of Lost Children, The ..."
3,4,"[Pulp Fiction (1994), Blade Runner (1982), Ter..."
4,5,"[Toy Story (1995), Ace Ventura: When Nature Ca..."


In [21]:
genres_by_users_grouped.head()

Unnamed: 0,userId,genres
0,1,"[Comedy,Crime,Drama,Thriller, Drama, Comedy,Dr..."
1,2,"[Action,Drama,War, Adventure,Drama,IMAX, Actio..."
2,3,"[Adventure,Animation,Children,Comedy,Fantasy, ..."
3,4,"[Comedy,Crime,Drama,Thriller, Action,Sci-Fi,Th..."
4,5,"[Adventure,Animation,Children,Comedy,Fantasy, ..."


In [14]:
genres_by_users_grouped['genres'] = genres_by_users_grouped['genres'].apply(lambda x:[i.replace("|",",") for i in x])

In [39]:
genres_by_users_grouped['genres'] = genres_by_users_grouped['genres'].apply(lambda x:[i.replace(" ","") for i in x])

In [45]:
genres_by_users_grouped.head()

Unnamed: 0,userId,genres,fav_genre
0,1,"[Comedy,Crime,Drama,Thriller, Drama, Comedy,Dr...",Drama
1,2,"[Action,Drama,War, Adventure,Drama,IMAX, Actio...",Drama
2,3,"[Adventure,Animation,Children,Comedy,Fantasy, ...",Drama
3,4,"[Comedy,Crime,Drama,Thriller, Action,Sci-Fi,Th...",Comedy
4,5,"[Adventure,Animation,Children,Comedy,Fantasy, ...",Comedy


In [50]:
movies_favGenre_data = pd.merge(movies_by_users_grouped,genres_by_users_grouped,on = 'userId')

In [52]:
movies_favGenre_data.drop(['genres'],axis=1,inplace=True)

In [53]:
movies_favGenre_data.head()

Unnamed: 0,userId,movies,fav_genre
0,1,"[Pulp Fiction (1994), Three Colors: Blue (Troi...",Drama
1,2,"[Braveheart (1995), Apollo 13 (1995), Rob Roy ...",Drama
2,3,"[Toy Story (1995), City of Lost Children, The ...",Drama
3,4,"[Pulp Fiction (1994), Blade Runner (1982), Ter...",Comedy
4,5,"[Toy Story (1995), Ace Ventura: When Nature Ca...",Comedy


# Finding the Favourite Genre of a User

In [36]:
from collections import Counter
 
def most_frequent(List):
    occurence_count = Counter(List)
    return occurence_count.most_common(1)[0][0]

In [40]:
#genres_by_users_grouped['fav_genre'] = genres_by_users_grouped['genres'].apply(lambda x:max(set(x), key = x.count))
genres_by_users_grouped['fav_genre'] = genres_by_users_grouped['genres'].apply(most_frequent)

In [43]:
genres_by_users_grouped.head()

Unnamed: 0,userId,genres,fav_genre
0,1,"[Comedy,Crime,Drama,Thriller, Drama, Comedy,Dr...",Drama
1,2,"[Action,Drama,War, Adventure,Drama,IMAX, Actio...",Drama
2,3,"[Adventure,Animation,Children,Comedy,Fantasy, ...",Drama
3,4,"[Comedy,Crime,Drama,Thriller, Action,Sci-Fi,Th...",Comedy
4,5,"[Adventure,Animation,Children,Comedy,Fantasy, ...",Comedy


### Making the Genre Cluster

In [46]:
fav_genre_grps = genres_by_users_grouped.drop(['genres'],axis=1)

In [47]:
fav_genre_grps = fav_genre_grps.groupby('fav_genre')['userId'].apply(list).reset_index(name='genre_cluster')

In [61]:
fav_genre_grps.head(20)

Unnamed: 0,fav_genre,genre_cluster
0,(nogenreslisted),"[5921, 51483, 90439, 103148]"
1,Action,"[1793, 1906, 7072, 10861, 13639, 15387, 19375,..."
2,"Action,Adventure","[706, 4413, 5344, 6076, 6782, 6912, 7049, 7076..."
3,"Action,Adventure,Animation","[37180, 111427]"
4,"Action,Adventure,Animation,Children,Comedy","[2925, 8362, 19440, 26990, 30508, 40257, 41434..."
5,"Action,Adventure,Animation,Children,Fantasy","[46376, 131404]"
6,"Action,Adventure,Animation,Drama,Fantasy","[9795, 22698, 25290, 49480, 75664, 117741]"
7,"Action,Adventure,Animation,Horror,Sci-Fi","[16191, 34142, 54776, 59640, 69997, 75472, 768..."
8,"Action,Adventure,Animation,Sci-Fi","[50049, 65746, 73687, 117329, 135976]"
9,"Action,Adventure,Children,Comedy,Fantasy",[121050]


In [60]:
fav_genre_grps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   fav_genre      383 non-null    object
 1   genre_cluster  383 non-null    object
dtypes: object(2)
memory usage: 6.1+ KB


# Pickling the Necessary DataFrames - The structure of these files wont allow to be saved as a SQL table. Also it's not wise to normalize this data because it will only add to data latency

In [54]:
import pickle
pickle.dump(fav_genre_grps,open('fav_genre_usergrps.pkl','wb'))
pickle.dump(movies_favGenre_data,open('movies_favGenre_data.pkl','wb'))

# Creating Data for User Sentiment analysis

In [24]:
tags_data = pd.merge(movies_by_users_grouped,genres_by_users_grouped,on = 'userId')


In [26]:
tags_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162342 entries, 0 to 162341
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   userId  162342 non-null  int64 
 1   movies  162342 non-null  object
 2   genres  162342 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.0+ MB


In [30]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1093360 entries, 0 to 1093359
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   userId   1093360 non-null  int64 
 1   movieId  1093360 non-null  int64 
 2   tag      1093344 non-null  object
dtypes: int64(2), object(1)
memory usage: 25.0+ MB


In [31]:
tags.insert(1, "combo", 0)

In [34]:
tags.head()

Unnamed: 0,userId,combo,movieId,tag
0,3,263,260,classic
1,3,263,260,sci-fi
2,4,1736,1732,dark comedy
3,4,1736,1732,great dialogue
4,4,7573,7569,so bad it's good


In [33]:
tags['combo'] = tags['userId']+tags['movieId']

In [35]:
tracker = tags.drop(['tag','movieId'],axis=1)

In [36]:
new_tags = tags.drop(['userId','movieId'],axis=1)

In [37]:
comments_by_movieid = new_tags.groupby('combo')['tag'].apply(list).reset_index(name='comments')

In [38]:
comments_by_movieid.head()

Unnamed: 0,combo,comments
0,213,[funny]
1,250,[chick flick]
2,263,"[classic, sci-fi]"
3,306,"[bittersweet, boring, conversation, dialogue d..."
4,370,"[action, adventure, fantasy, space adventure]"


In [41]:
final_tags = pd.merge(comments_by_movieid,tracker,on='combo',how='left')

In [42]:
final_tags.insert(2,"movieId",0)
final_tags['movieId'] = final_tags.combo - final_tags.userId
final_tags.head()

Unnamed: 0,combo,comments,movieId,userId
0,213,[funny],3,210
1,250,[chick flick],17,233
2,263,"[classic, sci-fi]",260,3
3,263,"[classic, sci-fi]",260,3
4,306,"[bittersweet, boring, conversation, dialogue d...",215,91


In [43]:
final_tags.drop(['combo'],axis=1,inplace=True)

In [44]:
final_tags.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1093360 entries, 0 to 1093359
Data columns (total 3 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   comments  1093360 non-null  object
 1   movieId   1093360 non-null  int64 
 2   userId    1093360 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 33.4+ MB


In [45]:
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype   
---  ------     -----   
 0   userId     int64   
 1   movieId    category
 2   rating     float64 
 3   timestamp  int64   
dtypes: category(1), float64(1), int64(2)
memory usage: 670.0 MB


In [46]:
tags_data.head()

Unnamed: 0,userId,movies,genres
0,1,"[Pulp Fiction (1994), Three Colors: Blue (Troi...","[Comedy|Crime|Drama|Thriller, Drama, Comedy|Dr..."
1,2,"[Braveheart (1995), Apollo 13 (1995), Rob Roy ...","[Action|Drama|War, Adventure|Drama|IMAX, Actio..."
2,3,"[Toy Story (1995), City of Lost Children, The ...","[Adventure|Animation|Children|Comedy|Fantasy, ..."
3,4,"[Pulp Fiction (1994), Blade Runner (1982), Ter...","[Comedy|Crime|Drama|Thriller, Action|Sci-Fi|Th..."
4,5,"[Toy Story (1995), Ace Ventura: When Nature Ca...","[Adventure|Animation|Children|Comedy|Fantasy, ..."


In [47]:
tags_data['genres'] = tags_data['genres'].apply(lambda x:[i.replace(" ","") for i in x])

In [48]:
tags_data['movies'] = tags_data['movies'].apply(lambda x:[i.replace(" ","") for i in x])

In [51]:
tags_data['tags'] = tags_data.movies + tags_data.genres

In [53]:
new_df = tags_data[['userId','tags']]

### Tags created for NLP, Not required for the project related tasks..

In [54]:
new_df.head()

Unnamed: 0,userId,tags
0,1,"[PulpFiction(1994), ThreeColors:Blue(Troiscoul..."
1,2,"[Braveheart(1995), Apollo13(1995), RobRoy(1995..."
2,3,"[ToyStory(1995), CityofLostChildren,The(Citéde..."
3,4,"[PulpFiction(1994), BladeRunner(1982), Termina..."
4,5,"[ToyStory(1995), AceVentura:WhenNatureCalls(19..."


In [None]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

In [58]:
import pickle
pickle.dump(new_df,open('tags_data.pkl','wb'))

In [62]:
ratings_data.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [66]:
ratings_data.rating[(ratings_data.userId==1) & (ratings_data.movieId==306)][1]

3.5

In [68]:
pickle.dump(ratings_data,open('ratings.pkl','wb'))

In [80]:
movies_data.movieId[movies_data.title=="#Captured (2017)"]

51787    183901
Name: movieId, dtype: int64

In [86]:
z = movies_data[movies_data.title=="#Captured (2017)"].index[0]
movies_data.movieId[z]

183901

In [88]:
movies_data.loc[movies_data.movieId==296]

Unnamed: 0,movieId,title,genres
292,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
