# Example for data science
The purpose of this exercise is to download the dataset, save it to a database, and answer a few questions.
<br />
Dataset source: <https://files.grouplens.org/datasets/movielens/ml-latest-small.zip>

### Data preparation 
1. Download dataset
2. Unzip dataset
3. Load the data
4. Save it to database

### Questions to be answered:
1. How many movies are in the dataset?
2. What is the most common movie genre?
3. What are the top 10 movies with the highest rating?
4. Who are the 5 users who rated most often?
5. When were the first and last ratings in the dataset made, and what were the titles of the rated movies?
6. Find all movies released in 1990.

### Downloading dataset

In [1]:
import os
import requests

# Create the target directory if it is missing; exist_ok makes the cell safe to re-run.
os.makedirs('data', exist_ok=True)

url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
# TLS certificate verification stays enabled (the requests default) so the archive
# cannot be silently replaced in transit; the timeout keeps the cell from hanging forever.
r = requests.get(url, allow_redirects=True, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as a .zip file.
r.raise_for_status()
# The context manager guarantees the file handle is flushed and closed.
with open('data/ml-latest-small.zip', 'wb') as zip_file:
    zip_file.write(r.content)



978202

### Unzip dataset

In [2]:
import zipfile

# Unpack every member of the downloaded archive into data/unzipped.
with zipfile.ZipFile('data/ml-latest-small.zip', mode='r') as archive:
    archive.extractall(path='data/unzipped')

### Load the data
While loading the data, we drop every row that contains a NaN value so that we work with a clean dataset.

In [3]:
import pandas as pd
import numpy as np

# Read the four CSV files in a fixed order and drop every row that contains a
# missing value; the results are unpacked into one DataFrame per file.
links, movies, ratings, tags = (
    pd.read_csv(csv_path).dropna()
    for csv_path in (
        "data/unzipped/ml-latest-small/links.csv",
        "data/unzipped/ml-latest-small/movies.csv",
        "data/unzipped/ml-latest-small/ratings.csv",
        "data/unzipped/ml-latest-small/tags.csv",
    )
)

In [4]:
# Preview the first five rows of the links table.
links.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
# Preview the first five rows of the movies table.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
# Preview the first five rows of the tags table.
tags.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


### Save it to database

In [8]:
from sqlalchemy import create_engine
import pymysql
from sqlalchemy.types import Integer, Text, String, DateTime, Float

import os
from urllib.parse import quote_plus

# Connection settings are read from the environment so that real credentials never
# have to be written into the notebook. The fallbacks reproduce the original
# local-development URL and should be overridden anywhere else.
db_user = os.environ.get('MYSQL_USER', 'root')
db_password = os.environ.get('MYSQL_PASSWORD', 'password')
db_host = os.environ.get('MYSQL_HOST', '127.0.0.1')
db_port = os.environ.get('MYSQL_PORT', '3306')
db_name = os.environ.get('MYSQL_DATABASE', 'exercise')

# quote_plus escapes characters such as '@' or '/' that would otherwise corrupt the URL.
engine = create_engine(
    f'mysql+pymysql://{quote_plus(db_user)}:{quote_plus(db_password)}'
    f'@{db_host}:{db_port}/{db_name}'
)


In [9]:
# SQL column types for the `movies` table.
movies_sql_types = {
    "movieId": Integer,
    "title": Text,
    "genres": Text,
}

# Replace any existing `movies` table and write the rows in chunks of 500,
# leaving the DataFrame index out of the table.
movies.to_sql(
    name='movies',
    con=engine,
    dtype=movies_sql_types,
    chunksize=500,
    index=False,
    if_exists='replace',
)

OperationalError: (pymysql.err.OperationalError) (1049, "Unknown database 'test'")
(Background on this error at: http://sqlalche.me/e/14/e3q8)

In [None]:
# SQL column types for the `links` table.
links_sql_types = {
    "movieId": Integer,
    "imdbId": Integer,
    "tmdbId": Float,
}

# Replace any existing `links` table and write the rows in chunks of 500,
# leaving the DataFrame index out of the table.
links.to_sql(
    name='links',
    con=engine,
    dtype=links_sql_types,
    chunksize=500,
    index=False,
    if_exists='replace',
)

In [None]:
# SQL column types for the `ratings` table.
ratings_sql_types = {
    "userId": Integer,
    "movieId": Integer,
    "rating": Float,
    "timestamp": Integer,
}

# Replace any existing `ratings` table and write the rows in chunks of 500,
# leaving the DataFrame index out of the table.
ratings.to_sql(
    name='ratings',
    con=engine,
    dtype=ratings_sql_types,
    chunksize=500,
    index=False,
    if_exists='replace',
)

In [None]:
# SQL column types for the `tags` table.
tags_sql_types = {
    "userId": Integer,
    "movieId": Integer,
    "tag": Text,
    "timestamp": Integer,
}

# Replace any existing `tags` table and write the rows in chunks of 500,
# leaving the DataFrame index out of the table.
tags.to_sql(
    name='tags',
    con=engine,
    dtype=tags_sql_types,
    chunksize=500,
    index=False,
    if_exists='replace',
)

### Load from database
For the sake of the exercise, we load the data back from the database.

In [None]:
# Read the whole `movies` table back into a DataFrame, then print its column summary.
movies = pd.read_sql_table(table_name='movies', con=engine)
movies.info()

In [None]:
# Read the whole `links` table back into a DataFrame, then print its column summary.
links = pd.read_sql_table(table_name='links', con=engine)
links.info()

In [None]:
# Read the whole `ratings` table back into a DataFrame, then print its column summary.
ratings = pd.read_sql_table(table_name='ratings', con=engine)
ratings.info()

In [None]:
# Read the whole `tags` table back into a DataFrame, then print its column summary.
tags = pd.read_sql_table(table_name='tags', con=engine)
tags.info()

## Questions and Answers

### Question 1) How many movies are in the dataset?

In [None]:
# The question asks for the number of movies. Distinct titles can undercount,
# because two different movies may share a title, so the movies are counted by
# identifier (assumes movieId uniquely identifies a movie -- TODO confirm).
numberOfMovies = movies['movieId'].nunique()
# Kept for comparison: number of distinct, non-missing titles.
numberOfDistinctMovieTitles = movies['title'].dropna().nunique()

print(f'Number of movies is {numberOfMovies}')
print(f'Number of distinct movie titles is {numberOfDistinctMovieTitles}')

### Question 2) What is the most common genre of movie?

In [None]:
# Each movie carries a "|"-separated genre string; split it into a list per movie.
genres = movies['genres'].str.split(pat="|")

# explode() yields one row per (movie, genre) pair and value_counts() tallies them,
# replacing the hand-written nested loop. Missing genre values are skipped instead
# of raising while being iterated.
counter = genres.explode().value_counts().to_dict()
counter

In [None]:
import operator

# Pick the genre whose count is largest; on a tie the first one in dict order wins.
mostCommonGenre = max(counter, key=counter.get)

print(f'Most common genre is {mostCommonGenre}')

### Question 3) What are the top 10 movies with the highest rating?

In [None]:
# Average only the `rating` column, per movie id. The previous `.mean('rating')`
# passed 'rating' as the `numeric_only` flag rather than selecting a column, and
# grouping by title merged different movies that share a title.
meanRatingByMovie = ratings.groupby('movieId')['rating'].mean()

# Attach titles, then keep the ten highest averages (ties keep their first occurrence).
# NOTE(review): a movie with a single 5.0 rating ranks at the top; consider a
# minimum number of ratings if that is not the intended reading of the question.
(
    movies.set_index('movieId')[['title']]
    .join(meanRatingByMovie, how='inner')
    .set_index('title')['rating']
    .nlargest(10)
)

### Question 4) Who are the 5 users who rated most often?
"Most often" is ambiguous, so we interpret it as the users with the largest total number of ratings.

In [None]:
# The question asks for five users (the cell previously showed ten).
# value_counts() sorts by number of ratings in descending order.
ratings['userId'].value_counts().head(5)

### Question 5) When were the first and last ratings in the dataset made, and what were the titles of the rated movies?

In [None]:
# Boolean masks marking the rows that carry the smallest / largest timestamp.
is_earliest = ratings['timestamp'].eq(ratings['timestamp'].min())
is_latest = ratings['timestamp'].eq(ratings['timestamp'].max())

# Several ratings may share the extreme timestamp; keep only the first such row.
first = ratings.loc[is_earliest].iloc[:1]
last = ratings.loc[is_latest].iloc[:1]

In [None]:
# Display the one-row frame holding the rating with the smallest timestamp.
first

In [None]:
# Display the one-row frame holding the rating with the largest timestamp.
last

In [None]:
# Find the movie row whose id matches the earliest rating.
first_rated_id = first['movieId'].iloc[0]
firstMovieRated = movies.loc[movies['movieId'] == first_rated_id]
firstMovieRated

In [None]:
# Find the movie row whose id matches the latest rating.
last_rated_id = last['movieId'].iloc[0]
lastMovieRated = movies.loc[movies['movieId'] == last_rated_id]
lastMovieRated

In [None]:
# The question also asks *when* the ratings were made, which was never reported.
# NOTE(review): assumes `timestamp` holds Unix epoch seconds in UTC -- confirm
# against the dataset README.
firstRatedAt = pd.to_datetime(first['timestamp'].iloc[0], unit='s', utc=True)
lastRatedAt = pd.to_datetime(last['timestamp'].iloc[0], unit='s', utc=True)

print(f'First movie rated {firstMovieRated["title"].values[0]}')
print(f'Last movie rated {lastMovieRated["title"].values[0]}')
print(f'First rating was made at {firstRatedAt}')
print(f'Last rating was made at {lastRatedAt}')

### Question 6) Find all movies released in 1990
The only place where the release year is available is the `title` column of the movies table, where it appears in parentheses at the end of the title, e.g. `Toy Story (1995)`.

In [None]:
# Match "(1990)" at the end of the title while tolerating trailing whitespace.
# The previous fixed slice title[-5:-1] missed such titles and raised on
# non-string values. na=False treats a missing title as "no match".
releasedIn1990 = movies["title"].str.contains(r"\(1990\)\s*$", regex=True, na=False)

# Plain list of matching titles, in table order.
moviesFrom1990 = movies.loc[releasedIn1990, "title"].tolist()
moviesFrom1990