## Basics data frame

In [1]:
import warnings
warnings.filterwarnings('ignore')
import json

In [2]:
import pandas as pd
# load data - change the path to match your drive!
basics = pd.read_csv('Data/title_basics.csv.gz')
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86707 entries, 0 to 86706
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86707 non-null  object 
 1   titleType       86707 non-null  object 
 2   primaryTitle    86707 non-null  object 
 3   originalTitle   86707 non-null  object 
 4   isAdult         86707 non-null  int64  
 5   startYear       86707 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  86707 non-null  int64  
 8   genres          86707 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 6.0+ MB


### Removing unwanted columns from Basics

In [3]:
basics = basics.drop(columns=['titleType', 'originalTitle','isAdult','endYear'])
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86707 entries, 0 to 86706
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86707 non-null  object 
 1   primaryTitle    86707 non-null  object 
 2   startYear       86707 non-null  float64
 3   runtimeMinutes  86707 non-null  int64  
 4   genres          86707 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 3.3+ MB


## Ratings Dataframe

In [4]:
ratings = pd.read_csv('Data/title_ratings.csv.gz')
ratings.head()
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499155 entries, 0 to 499154
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         499155 non-null  object 
 1   averageRating  499155 non-null  float64
 2   numVotes       499155 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.4+ MB


### TMDB_API Dataframe

In [5]:
tmdb_api = pd.read_csv('Data/tmdb_results_combined.csv.gz')
tmdb_api.head()
tmdb_api.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2569 entries, 0 to 2568
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                2569 non-null   object 
 1   adult                  2568 non-null   float64
 2   backdrop_path          1400 non-null   object 
 3   belongs_to_collection  205 non-null    object 
 4   budget                 2568 non-null   float64
 5   genres                 2568 non-null   object 
 6   homepage               172 non-null    object 
 7   id                     2568 non-null   float64
 8   original_language      2568 non-null   object 
 9   original_title         2568 non-null   object 
 10  overview               2518 non-null   object 
 11  popularity             2568 non-null   float64
 12  poster_path            2310 non-null   object 
 13  production_companies   2568 non-null   object 
 14  production_countries   2568 non-null   object 
 15  rele

### Removing unwanted columns from tmdb_api dataframe

In [6]:
tmdb_api = tmdb_api.drop(columns=['adult', 'backdrop_path', 'belongs_to_collection', 'genres','homepage', 'id',
                                  'original_language', 'original_title', 'overview','popularity','poster_path',
                                  'production_companies', 'production_countries','release_date','runtime','spoken_languages',
                                  'status','tagline','title','video','vote_average','vote_count'])
tmdb_api.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2569 entries, 0 to 2568
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   imdb_id        2569 non-null   object 
 1   budget         2568 non-null   float64
 2   revenue        2568 non-null   float64
 3   certification  816 non-null    object 
dtypes: float64(2), object(2)
memory usage: 80.4+ KB


### Create a genres table

In [7]:
## adding expand=True
basics['genres_split'] = basics['genres'].str.split(',')

In [8]:
## get the unique ids
unique_genres = sorted(basics['genres'].unique())
unique_genres[:2]

['Action', 'Action,Adult,Drama']

In [9]:
## exploding the column of lists
exploded = basics.explode('genres_split')
exploded.head()
exploded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161970 entries, 0 to 86706
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          161970 non-null  object 
 1   primaryTitle    161970 non-null  object 
 2   startYear       161970 non-null  float64
 3   runtimeMinutes  161970 non-null  int64  
 4   genres          161970 non-null  object 
 5   genres_split    161970 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 8.7+ MB


In [10]:
## saving the unique values from the exploded column
cols_to_make = exploded['genres_split'].dropna().unique()
cols_to_make

array(['Comedy', 'Fantasy', 'Romance', 'Drama', 'Horror', 'Sci-Fi',
       'Biography', 'Mystery', 'Musical', 'Action', 'Adventure', 'Crime',
       'Thriller', 'Music', 'Animation', 'Family', 'History', 'War',
       'Sport', 'Western', 'Adult', 'Short', 'Reality-TV', 'News',
       'Talk-Show', 'Game-Show'], dtype=object)

In [14]:
for col in cols_to_make:
    basics[col] = basics['genres'].str.contains(col)
basics.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,genres_split,Comedy,Fantasy,Romance,Drama,...,History,War,Sport,Western,Adult,Short,Reality-TV,News,Talk-Show,Game-Show
0,tt0035423,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance",,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70,Drama,,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,tt0069049,The Other Side of the Wind,2018.0,122,Drama,,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,tt0088751,The Naked Monster,2005.0,100,"Comedy,Horror,Sci-Fi",,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,tt0096056,Crime and Punishment,2002.0,126,Drama,,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [11]:
## showing the lists are really strings
basics.loc[0,'genres_split']

['Comedy', 'Fantasy', 'Romance']

In [12]:
# Create a new column where the single quotes are replaced by double quotes
basics['genres_split'] = basics['genres_split'].str.replace("'",'"')

In [13]:
# Apply json.loads to entire column
basics['genres_split'] = basics['genres_split'].apply(json.loads)
# check results 
basics['genres_split'].head()

TypeError: the JSON object must be str, bytes or bytearray, not float

In [None]:
## get the unique ids
unique_genres = sorted(basics['genres_split'].unique())
unique_genres[:3]

In [None]:
for col in cols_to_make:
    basics[col] = basics['genres'].str.contains(col)
basics.head()

In [None]:
# drop transactions clumns
basics = basics.drop(columns=['genres','genres_split'])
basics.head()

#### Separate genres into a column

### Genres separation and creating columns

In [None]:
## get the unique ids
unique_basics = sorted(basics['tconst'].unique())
unique_basics[:2]

In [None]:
## make integers for each id
int_basics = range(len(unique_basics))
int_basics

In [None]:
## Converting our range to a list and showing the first 10 values
example_range = list(int_basics)
example_range[:10]

In [None]:
# Zip together the unique_ids as the keys and the int_ids as the values
id_map = dict(zip(unique_basics,int_basics))
id_map

In [None]:
## demonstrating using id_map to get iteger id 
basics_str_id ="tt0118141"
id_map[basics_str_id]

In [None]:
basics['tconst'].map(id_map)

In [None]:
## overwriting the original id column 
basics['tconst'] = basics['tconst'].replace(id_map)

basics.head(3)
basics.info()

In [None]:
# Using pd.DataFrame and a dictionary
id_lookup = pd.DataFrame({'str_id': id_map.keys(),
                         'int_id':id_map.values()})
id_lookup.head(3)
id_lookup.info()

In [None]:
genres['genres_id']=genres['genres_id'].astype(str)
## Merging with the same column name
title_genres = pd.merge(id_lookup, genres, left_on= 'int_id', right_on='genres_id')

In [None]:
## Set the dataframe index and use index=True 
df.set_index('int_index').to_sql('table_name',engine,index=True)



In [None]:
title_genres.drop(columns=['genres_name','primaryTitle','startYear',
                           'runtimeMinutes','genres'],inplace=True)
title_genres.head()

In [None]:
gdgh

In [None]:
## exploding the column of lists
exploded = basics.explode('genre_split')
exploded.head()

In [None]:
# Using pd.DataFrame and a dictionary
genres = pd.DataFrame({'genres_id':genres_map.values(),
                         'genres_name': genres_map.keys()})
genres.head(3)
genres.info()

In [None]:
## saving the unique values from the exploded column
cols_to_make = exploded['genre_split'].dropna().unique()
cols_to_make

In [None]:
for col in cols_to_make:
    basics[col] = basics['genres'].str.contains(col)
basics.head()

In [None]:
# drop transactions clumns
basics = basics.drop(columns=['genres','genre_split'])
basics.head()
basics.info()

In [None]:
## save data for next lesson
basics.to_csv('basics.csv', index=False)

In [None]:
tmdb_api.head(2)

In [None]:
tmdb_api = tmdb_api.rename(columns ={'imdb_id':'tconst'})
tmdb_api.head()
tmdb_api.info()

In [None]:
tmdb_api = pd.read_csv('Data/tmdb_results_combined.csv.gz')
tmdb_api.info()

In [None]:
## save data for next lesson
tmdb_api.to_csv('title_tmdb_api.csv', index=False)

In [None]:
import glob
q = "Data/title*.csv.gz"
chunked_files = sorted(glob.glob(q))
# Showing the first 5 
chunked_files[:5]

## SQL

In [None]:
# imports
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
from urllib.parse import quote_plus as urlquote
# Create connection string using credentials following this format

In [None]:
import json
with open('/Users/miran/.secret/mysql.json') as f:
    login = json.load(f)
login.keys()

In [None]:
connection = f"mysql+pymysql://{login['username']}:{urlquote(login['password'])}@localhost/movie"

In [None]:
# create sqlite engine for a database called schools
engine = create_engine(connection)
# create connection to engine
conn = engine.connect()

In [None]:
database_exists(connection)

In [None]:
movie = pd.read_csv('Data/title_basics.csv.gz')
movie.info()

In [None]:
## Check if database exists, if not, create it
if database_exists(connection):
    print('It exists!')
else:
    create_database(connection)
    print('Database created!')

In [None]:
movie.to_sql('movie', engine, if_exists = 'replace')

In [None]:
q = """SELECT * FROM movie;"""
pd.read_sql(q, engine)

In [None]:
q = '''SHOW TABLES'''
pd.read_sql(q,engine)
