In [149]:
import pandas as pd
import numpy as np
import re

## Import data

In [150]:
# Read the movie catalogue from the ml-25m folder
# (columns as displayed below: movieId, title, genres).
movies = pd.read_csv("ml-25m/movies.csv")
# Preview the first five rows to sanity-check the load.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Data Wrangling

In [151]:
# The dataset marks movies without a genre with a placeholder string;
# turn that placeholder into a real missing value so isna/dropna see it.
movies["genres"] = movies["genres"].replace({"(no genres listed)": np.nan})
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),


In [152]:
# Number of missing values in each column
movies.isna().sum(axis=0)

movieId       0
title         0
genres     5062
dtype: int64

In [180]:
# Extract the release year from the title.
# The pattern is anchored to the END of the title (allowing trailing
# whitespace) because str.extract returns the first match: an unanchored
# pattern would pick up any earlier parenthesised 4-digit number in the title
# instead of the trailing "(YYYY)" release year.
# Titles without a trailing "(YYYY)" get NaN; the year stays a string here.
movies['year'] = movies['title'].str.extract(r"\((\d{4})\)\s*$", expand = False)
movies

Unnamed: 0,title,year,genres
0,Toy Story (1995),1995,Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji (1995),1995,Adventure|Children|Fantasy
2,Grumpier Old Men (1995),1995,Comedy|Romance
3,Waiting to Exhale (1995),1995,Comedy|Drama|Romance
4,Father of the Bride Part II (1995),1995,Comedy
...,...,...,...
62417,Santosh Subramaniam (2008),2008,Action|Comedy|Romance
62418,We (2018),2018,Drama
62419,Window of the Soul (2001),2001,Documentary
62420,Bad Poems (2018),2018,Comedy|Drama


In [181]:
# Drop every row that lacks a genre or a year.
# The explicit .copy() makes `movies` an independent frame, so the column
# assignments performed on it afterwards do not raise SettingWithCopyWarning.
movies = movies.dropna(subset=["genres", "year"]).copy()
movies

Unnamed: 0,title,year,genres
0,Toy Story (1995),1995,Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji (1995),1995,Adventure|Children|Fantasy
2,Grumpier Old Men (1995),1995,Comedy|Romance
3,Waiting to Exhale (1995),1995,Comedy|Drama|Romance
4,Father of the Bride Part II (1995),1995,Comedy
...,...,...,...
62417,Santosh Subramaniam (2008),2008,Action|Comedy|Romance
62418,We (2018),2018,Drama
62419,Window of the Soul (2001),2001,Documentary
62420,Bad Poems (2018),2018,Comedy|Drama


In [182]:
# Cast the extracted year strings to integers.
# `assign` returns a new frame instead of writing a column into `movies`
# in place; `movies` is the result of row filtering at this point, and an
# in-place column assignment on it can raise SettingWithCopyWarning.
movies = movies.assign(year=movies["year"].astype("int"))

In [183]:
# Confirm the year column is now an integer dtype
movies.dtypes["year"]

dtype('int32')

In [184]:
# Keep only the three analysis columns, in this order
# (any other column, e.g. movieId, is left out of the selection).
movies = movies.loc[:, ["title", "year", "genres"]]
movies

Unnamed: 0,title,year,genres
0,Toy Story (1995),1995,Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji (1995),1995,Adventure|Children|Fantasy
2,Grumpier Old Men (1995),1995,Comedy|Romance
3,Waiting to Exhale (1995),1995,Comedy|Drama|Romance
4,Father of the Bride Part II (1995),1995,Comedy
...,...,...,...
62417,Santosh Subramaniam (2008),2008,Action|Comedy|Romance
62418,We (2018),2018,Drama
62419,Window of the Soul (2001),2001,Documentary
62420,Bad Poems (2018),2018,Comedy|Drama


In [185]:
# List the distinct release years, in order of first appearance
print(pd.unique(movies["year"]))

[1995 1994 1996 1976 1992 1988 1967 1993 1964 1977 1965 1982 1990 1991
 1989 1937 1940 1969 1981 1973 1970 1960 1955 1959 1968 1980 1975 1986
 1948 1943 1950 1946 1987 1997 1974 1956 1958 1949 1972 1998 1933 1952
 1951 1957 1961 1954 1934 1944 1963 1942 1941 1953 1939 1947 1945 1938
 1935 1936 1926 1932 1985 1979 1971 1978 1966 1962 1983 1984 1931 1922
 1999 1927 1929 1930 1928 1925 1914 2000 1919 1923 1920 1918 1921 2001
 1924 2002 2003 1915 2004 1916 1917 2005 2006 1902 1903 2007 2008 2009
 1912 2010 1913 2011 1898 1899 1894 2012 1910 2013 1896 2014 2015 1895
 1909 1911 1900 2016 2017 2018 2019 1905 1904 1892 1908 1888 1890 1874
 1891 1901 1907 1906 1897 1880]


In [204]:
# Subset of movies released from 2010 through 2020 (both bounds inclusive)
movies_2010 = movies.loc[movies["year"].between(2010, 2020)]

In [205]:
# Display the 2010-2020 subset built in the previous cell
movies_2010

Unnamed: 0,title,year,genres
14156,Daybreakers (2010),2010,Action|Drama|Horror|Thriller
14161,Leap Year (2010),2010,Comedy|Romance
14162,"Book of Eli, The (2010)",2010,Action|Adventure|Drama
14222,If You Love (Jos rakastat) (2010),2010,Drama|Musical|Romance
14256,Legion (2010),2010,Action|Fantasy|Horror|Thriller
...,...,...,...
62406,Last Days of the Arctic (2011),2011,Documentary
62412,The Painting (2019),2019,Animation|Documentary
62413,Liberté (2019),2019,Drama
62418,We (2018),2018,Drama
