In [1]:
import pandas as pd

##### READING THE MOVIES.CSV FILE

In [4]:
# Load the raw movie catalogue; the preview below shows the columns movieId, title and genres.
df = pd.read_csv("movies.csv")
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


##### REMOVING MOVIES WITH NO GENRES LISTED

In [18]:
# Preview of the rows that have at least one genre listed.
# The result is only displayed, not assigned, so df itself still contains the
# "(no genres listed)" rows after this cell.
df.loc[df["genres"]!="(no genres listed)"]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62417,209155,Santosh Subramaniam (2008),Action|Comedy|Romance
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama


##### REMOVING MOVIES WHOSE NAMES CANNOT BE TYPED ON THE KEYBOARD
THIS MOSTLY REMOVES THE RUSSIAN, CHINESE AND JAPANESE MOVIES, WHOSE TITLES CANNOT BE TYPED ON A STANDARD KEYBOARD

In [19]:
import re

In [59]:
# Keep only the movies whose title consists solely of characters that can be
# typed on a standard keyboard (plus a handful of explicitly allowed symbols).
#
# Any character outside this whitelist marks a title as untypable. The pattern
# is a raw string so the regex escapes are not parsed as (invalid) Python
# string escapes.
untypable_char = r"""[^a-zA-Z0-9()'",:.\-!&/?éàè+³#%*½ ]"""

# Vectorised test over the whole title column instead of appending surviving
# rows one at a time: DataFrame.append copied the frame on every call and was
# removed in pandas 2.0. The column is addressed by name rather than by
# position so an extra leading column in the CSV cannot redirect the check.
# na=True treats a missing title as untypable, so such rows are dropped.
has_untypable = df["title"].str.contains(untypable_char, regex=True, na=True)

# Original index labels and column dtypes are preserved; .copy() makes
# cleaned_df independent of df.
cleaned_df = df.loc[~has_untypable].copy()

In [60]:
# (rows, columns) left after the title filter.
cleaned_df.shape

(60680, 3)

In [65]:
# Drop the movies whose genres field is the "(no genres listed)" placeholder.
has_genres = cleaned_df["genres"].ne("(no genres listed)")
cleaned_df = cleaned_df.loc[has_genres]

In [66]:
# (rows, columns) left after also removing movies without genres.
cleaned_df.shape

(55804, 3)

##### SAVING THE CLEANED DATAFRAME

In [67]:
# Write the cleaned movie table to disk; index=False keeps the row index out of the file.
cleaned_df.to_csv("cleaned_movie.csv",index=False)

##### READING THE RATINGS.CSV FILE

In [68]:
# Load the raw ratings table (the shape cell below reports roughly 25 million rows).
ratings = pd.read_csv("ratings.csv")

In [69]:
# (rows, columns) of the raw ratings table.
ratings.shape

(25000095, 4)

In [70]:
# Column names available in the ratings table.
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [71]:
# Distinct user ids that appear in the ratings.
ratings["userId"].unique()

array([     1,      2,      3, ..., 162539, 162540, 162541], dtype=int64)

##### DROPPING THE TIMESTAMP COLUMN AS IT DOESN'T HELP IN MAKING A RECOMMENDATION

In [73]:
# The timestamp is not used for recommendations, so carry on without it.
# Rebinding the name replaces the in-place mutation of the loaded frame.
ratings = ratings.drop(columns=["timestamp"])
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [77]:
# movieIds of the movies that survived cleaning, as a plain Python list.
mov_id = cleaned_df["movieId"].tolist()

##### REMOVING RATINGS OF MOVIES WHICH ARE NOT IN THE CLEANED MOVIES DATAFRAME

In [79]:
# Keep only the ratings whose movieId is present in the cleaned movie list.
in_cleaned_movies = ratings["movieId"].isin(mov_id)
ratings = ratings.loc[in_cleaned_movies]

In [81]:
# Write the filtered ratings to disk; index=False keeps the row index out of the file.
ratings.to_csv("cleaned_ratings.csv",index=False)

In [2]:
# Load the movieId -> external id table; the preview below shows the columns movieId, imdbId and tmdbId.
links = pd.read_csv("links.csv")
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [3]:
# Only the IMDb id is needed to look up posters, so continue without tmdbId.
# Rebinding the name replaces the in-place mutation of the loaded frame.
links = links.drop(columns=["tmdbId"])
links.head()

Unnamed: 0,movieId,imdbId
0,1,114709
1,2,113497
2,3,113228
3,4,114885
4,5,113041


In [4]:
from bs4 import BeautifulSoup
import requests

def get_poster(imdb_id, timeout=10):
    """Return the poster image URL scraped from a title's IMDb page.

    Parameters
    ----------
    imdb_id : int or str
        Numeric IMDb id without the ``tt`` prefix. It is converted to a string
        and left-padded with zeros to at least seven characters.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 10).
        Without a timeout a single stalled connection blocks forever.

    Returns
    -------
    str
        The ``src`` of the first ``<img>`` found inside an ``<a>`` inside a
        ``<div class="poster">``, or ``''`` when the page cannot be fetched or
        contains no such element.
    """
    url = "https://www.imdb.com/title/tt" + str(imdb_id).zfill(7) + "/"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # A title that cannot be fetched is reported like a page without a
        # poster, so one bad request does not abort a long scraping run.
        return ''
    soup = BeautifulSoup(response.text, "lxml")
    for div in soup.find_all("div", class_="poster"):
        a_tag = div.find('a')
        # Either lookup may return None; skip such a div instead of raising
        # AttributeError on the missing tag.
        img_tag = a_tag.find('img') if a_tag is not None else None
        if img_tag is not None and img_tag.get("src"):
            return img_tag["src"]
    return ''

In [5]:
# Load movies.csv for the poster-scraping stage.
# NOTE(review): the tail displayed for this cell ends at index 9999, while the
# movies.csv read at the top of the notebook had rows up to index 62420 --
# presumably a different, smaller file; confirm which file this cell expects.
movies = pd.read_csv("movies.csv")
movies.tail()

Unnamed: 0,movieId,title,genres
9995,203519,Fast & Furious Presents: Hobbs & Shaw (2019),Action
9996,204352,Ad Astra (2019),Adventure|Drama|Mystery|Sci-Fi|Thriller
9997,204542,It: Chapter Two (2019),Horror
9998,204698,Joker (2019),Crime|Drama|Thriller
9999,205383,El Camino: A Breaking Bad Movie (2019),Crime|Drama|Thriller


In [8]:
# movieIds of the movies loaded for poster scraping, as a plain Python list
# (rebinds the mov_id name used earlier for the ratings filter).
mov_id = movies["movieId"].tolist()

In [11]:
# Restrict links to the movies being scraped and renumber the rows from 0.
links = (
    links.loc[links["movieId"].isin(mov_id)]
    .reset_index(drop=True)
)
links.tail()

Unnamed: 0,movieId,imdbId
9995,203519,6806448
9996,204352,2935510
9997,204542,7349950
9998,204698,7286456
9999,205383,9243946


In [24]:
# Scrape one poster URL per row of links, in row order, so the list lines up
# with the frame when it is attached as a column.
# The list is named poster_links because that is the name the later
# `links["poster_img"] = poster_links` cell reads; building it as `poster_link`
# left that cell with a NameError on a fresh kernel.
# The IMDb id is read by column name rather than by position.
# Expect this to be slow: it makes one HTTP request per title.
poster_links = [get_poster(imdb_id) for imdb_id in links["imdbId"]]

In [20]:
# Write the current links table back to disk without the row index.
# NOTE(review): this overwrites the links.csv that was read earlier. If it is
# run after tmdbId has been dropped, the file on disk no longer has that column
# and re-running the tmdbId drop from a fresh read would raise KeyError --
# consider a separate output filename.
links.to_csv("links.csv",index=False)

In [31]:
# Attach the scraped poster URLs as a new column; the list must hold exactly
# one entry per row of links, in row order.
links["poster_img"] = poster_links

In [32]:
# Check that the poster_img column was added.
links.head()

Unnamed: 0,movieId,imdbId,poster_img
0,1,114709,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,113497,https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2,3,113228,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,4,114885,https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,5,113041,https://m.media-amazon.com/images/M/MV5BOTEyNz...


In [35]:
# Join the IMDb id and poster URL onto the movie table via the shared movieId
# key (pandas' default inner join).
movies = pd.merge(movies, links, on="movieId")
movies.tail()

Unnamed: 0,movieId,title,genres,imdbId,poster_img
9995,203519,Fast & Furious Presents: Hobbs & Shaw (2019),Action,6806448,https://m.media-amazon.com/images/M/MV5BOTIzYm...
9996,204352,Ad Astra (2019),Adventure|Drama|Mystery|Sci-Fi|Thriller,2935510,https://m.media-amazon.com/images/M/MV5BZTllZT...
9997,204542,It: Chapter Two (2019),Horror,7349950,https://m.media-amazon.com/images/M/MV5BYTJlNj...
9998,204698,Joker (2019),Crime|Drama|Thriller,7286456,https://m.media-amazon.com/images/M/MV5BNGVjNW...
9999,205383,El Camino: A Breaking Bad Movie (2019),Crime|Drama|Thriller,9243946,https://m.media-amazon.com/images/M/MV5BNjk4Mz...


In [42]:
# Spot-check the first 20 merged rows: print the value in column position 1
# (title) followed by the value in column position 4 (poster URL).
for row in range(20):
    title, poster_url = movies.iloc[row, [1, 4]]
    print(title)
    print(poster_url)

Toy Story (1995)
https://m.media-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_UX182_CR0,0,182,268_AL__QL50.jpg
Jumanji (1995)
https://m.media-amazon.com/images/M/MV5BZTk2ZmUwYmEtNTcwZS00YmMyLWFkYjMtNTRmZDA3YWExMjc2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UY268_CR10,0,182,268_AL__QL50.jpg
Grumpier Old Men (1995)
https://m.media-amazon.com/images/M/MV5BMjQxM2YyNjMtZjUxYy00OGYyLTg0MmQtNGE2YzNjYmUyZTY1XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UX182_CR0,0,182,268_AL__QL50.jpg
Waiting to Exhale (1995)
https://m.media-amazon.com/images/M/MV5BYzcyMDY2YWQtYWJhYy00OGQ2LTk4NzktYWJkNDYwZWJmY2RjXkEyXkFqcGdeQXVyMTA0MjU0Ng@@._V1_UX182_CR0,0,182,268_AL__QL50.jpg
Father of the Bride Part II (1995)
https://m.media-amazon.com/images/M/MV5BOTEyNzg5NjYtNDU4OS00MWYxLWJhMTItYWU4NTkyNDBmM2Y0XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UX182_CR0,0,182,268_AL__QL50.jpg
Heat (1995)
https://m.media-amazon.com/images/M/MV5BMDJjNWE5MTEtMDk2Mi00ZjczLWIwYjAtNzM2ZTdhNzcwOGZjXkEyXkFqcGdeQ

In [43]:
# Final look at the merged table before it is written out.
movies.head()

Unnamed: 0,movieId,title,genres,imdbId,poster_img
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,5,Father of the Bride Part II (1995),Comedy,113041,https://m.media-amazon.com/images/M/MV5BOTEyNz...


In [44]:
# Persist the merged movie/poster table.
# index=False keeps the row index out of the file, matching every other
# to_csv call in this notebook. Without it the index is written as an extra
# unnamed first column, which comes back as "Unnamed: 0" on the next read_csv
# and shifts every positional column lookup (e.g. iloc[i, 1]) by one.
# NOTE(review): this overwrites the movies.csv that was read earlier in the
# notebook -- confirm that is intended.
movies.to_csv("movies.csv", index=False)