## Basics data frame

In [1]:
import pandas as pd
# load data - change the path to match your drive!
basics = pd.read_csv('Data/title_basics.csv.gz')
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86707 entries, 0 to 86706
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86707 non-null  object 
 1   titleType       86707 non-null  object 
 2   primaryTitle    86707 non-null  object 
 3   originalTitle   86707 non-null  object 
 4   isAdult         86707 non-null  int64  
 5   startYear       86707 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  86707 non-null  int64  
 8   genres          86707 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 6.0+ MB


### Removing unwanted columns from Basics

In [2]:
basics = basics.drop(columns=['titleType', 'originalTitle','isAdult','endYear'])
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86707 entries, 0 to 86706
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86707 non-null  object 
 1   primaryTitle    86707 non-null  object 
 2   startYear       86707 non-null  float64
 3   runtimeMinutes  86707 non-null  int64  
 4   genres          86707 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 3.3+ MB


In [3]:
## get the unique ids
unique_genres = sorted(basics['genres'].unique())
unique_genres[:2]

['Action', 'Action,Adult,Drama']

In [4]:
## make integers for each id
genres_basics = range(len(unique_genres))
genres_basics

range(0, 863)

In [5]:
## Converting our range to a list and showing the first 10 values
example_range = list(genres_basics)
example_range[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [6]:
# Zip together the unique_ids as the keys and the int_ids as the values
genres_map = dict(zip(unique_genres,genres_basics))
genres_map

{'Action': 0,
 'Action,Adult,Drama': 1,
 'Action,Adventure': 2,
 'Action,Adventure,Animation': 3,
 'Action,Adventure,Biography': 4,
 'Action,Adventure,Comedy': 5,
 'Action,Adventure,Crime': 6,
 'Action,Adventure,Drama': 7,
 'Action,Adventure,Family': 8,
 'Action,Adventure,Fantasy': 9,
 'Action,Adventure,History': 10,
 'Action,Adventure,Horror': 11,
 'Action,Adventure,Music': 12,
 'Action,Adventure,Musical': 13,
 'Action,Adventure,Mystery': 14,
 'Action,Adventure,Romance': 15,
 'Action,Adventure,Sci-Fi': 16,
 'Action,Adventure,Sport': 17,
 'Action,Adventure,Thriller': 18,
 'Action,Adventure,War': 19,
 'Action,Adventure,Western': 20,
 'Action,Animation': 21,
 'Action,Animation,Comedy': 22,
 'Action,Animation,Crime': 23,
 'Action,Animation,Drama': 24,
 'Action,Animation,Family': 25,
 'Action,Animation,Fantasy': 26,
 'Action,Animation,Game-Show': 27,
 'Action,Animation,Horror': 28,
 'Action,Animation,Musical': 29,
 'Action,Animation,Mystery': 30,
 'Action,Animation,Romance': 31,
 'Action,A

In [7]:
## demonstrating using id_map to get iteger id 
basics_str_genres ="Drama"
genres_map[basics_str_genres]

587

In [8]:
basics['genres'].map(genres_map)

0        475
1        587
2        587
3        492
4        587
        ... 
86702    587
86703    444
86704    587
86705     18
86706    614
Name: genres, Length: 86707, dtype: int64

In [9]:
## overwriting the original id column 
basics['genres'] = basics['genres'].replace(genres_map)

basics.head(3)

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres
0,tt0035423,Kate & Leopold,2001.0,118,475
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70,587
2,tt0069049,The Other Side of the Wind,2018.0,122,587


In [14]:
# Using pd.DataFrame and a dictionary
genres = pd.DataFrame({'genre_id':genres_map.values(),
                       'genre_name': genres_map.keys()})
genres.head(3)
genres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 863 entries, 0 to 862
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   genre_id    863 non-null    int64 
 1   genre_name  863 non-null    object
dtypes: int64(1), object(1)
memory usage: 13.6+ KB


### Genres separation and creating columns

In [20]:
## examining a single value from the coordinates col
genre = genre.loc[0,"genre_name"]
print(type(genre))
genre

NameError: name 'genre' is not defined

In [19]:
# Make a list of all characters to replace
to_replace = ['(',')']
# run a loop to replace all of the characters in the list at once
for char in to_replace:
    genre['genre_name'] = genre['genre_name'].str.replace(char,'',regex=False)
    
genre['genre_name'].head()

NameError: name 'genre' is not defined

In [16]:
## adding expand=True
basics['genre_name'] = basics['genre_name'].str.split(',')

KeyError: 'genre_name'

In [None]:
## get the unique ids
unique_genres = sorted(basics['genres'].unique())
unique_genres[:2]

In [None]:
## exploding the column of lists
exploded = basics.explode('genres_split')
exploded.head()

In [None]:
## saving the unique values from the exploded column
cols_to_make = exploded['genres_split'].dropna().unique()
cols_to_make

In [None]:
for col in cols_to_make:
    basics[col] = basics['genres'].str.contains(col)
basics.head()

In [None]:
# drop transactions clumns
basics = basics.drop(columns=['genres','genres_split'])
basics.head()
basics.info()

In [None]:
## save data for next lesson
basics.to_csv('basics.csv', index=False)

## Ratings Dataframe

In [None]:
ratings = pd.read_csv('Data/title_ratings.csv.gz')
ratings.head()
ratings.info()

In [None]:
## save data for next lesson
ratings.to_csv('ratings.csv', index=False)

### TMDB_API Dataframe

### Removing unwanted columns from Ratings

In [None]:
tmdb_api = tmdb_api.drop(columns=['adult', 'backdrop_path', 'belongs_to_collection', 'genres','homepage', 'id',
                                  'original_language', 'original_title', 'overview','popularity','poster_path',
                                  'production_companies', 'production_countries','release_date','runtime','spoken_languages',
                                  'status','tagline','title','video','vote_average','vote_count'])
tmdb_api.info()

In [None]:
tmdb_api.head(2)

In [None]:
tmdb_api = tmdb_api.rename(columns ={'imdb_id':'tconst'})
tmdb_api.head()
tmdb_api.info()

In [None]:
tmdb_api = pd.read_csv('Data/tmdb_results_combined.csv.gz')
tmdb_api.info()

In [None]:
## save data for next lesson
tmdb_api.to_csv('title_tmdb_api.csv', index=False)

In [None]:
import glob
q = "Data/title*.csv.gz"
chunked_files = sorted(glob.glob(q))
# Showing the first 5 
chunked_files[:5]

## SQL

In [None]:
# imports
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
from urllib.parse import quote_plus as urlquote
# Create connection string using credentials following this format

In [None]:
import json
with open('/Users/miran/.secret/mysql.json') as f:
    login = json.load(f)
login.keys()

In [None]:
connection = f"mysql+pymysql://{login['username']}:{urlquote(login['password'])}@localhost/movie"

In [None]:
# create sqlite engine for a database called schools
engine = create_engine(connection)
# create connection to engine
conn = engine.connect()

In [None]:
database_exists(connection)

In [None]:
movie = pd.read_csv('Data/title_basics.csv.gz')
movie.info()

In [None]:
## Check if database exists, if not, create it
if database_exists(connection):
    print('It exists!')
else:
    create_database(connection)
    print('Database created!')

In [None]:
movie.to_sql('movie', engine, if_exists = 'replace')

In [None]:
q = """SELECT * FROM movie;"""
pd.read_sql(q, engine)

In [None]:
q = '''SHOW TABLES'''
pd.read_sql(q,engine)
