# Extraction Notebook

This notebook steps will help to access the gz raw datasets from IMDB, store them in the 'extracted' folder, filter 
them and store them in the folder "clean" in order to access them later.

## Extraction Steps

In [1]:
import os
import gzip
import pandas as pd

In [2]:
# Specify directories
base_folder = os.getcwd()[:-10]
raw_dir = os.path.join(base_folder, 'raw')
extracted_dir = os.path.join(base_folder, 'extracted')
clean_dir = os.path.join(base_folder, 'clean')

In [3]:
files_zipped =['title.ratings.tsv.gz', 'title.basics.tsv.gz', 'title.crew.tsv.gz', 'name.basics.tsv.gz']

In [4]:
# First check if there is already data downloaded
files_in_folder = os.listdir(extracted_dir)

if 'title.ratings.tsv' in files_in_folder and 'title.basics.tsv' in files_in_folder \
    and 'title.crew.tsv' in files_in_folder and 'name.basics.tsv' in files_in_folder:
    df_ratings = pd.read_csv(os.path.join(extracted_dir, 'title.ratings.tsv'), sep= '\t')
    df_basics = pd.read_csv(os.path.join(extracted_dir, 'title.basics.tsv'), sep= '\t')
    df_crew = pd.read_csv(os.path.join(extracted_dir, 'title.crew.tsv'), sep= '\t')
    df_names = pd.read_csv(os.path.join(extracted_dir, 'name.basics.tsv'), sep= '\t')

 # If the data has not been downloaded, perform extraction
else:
    for file in files_zipped:
        # Extract zipped content
        gzipped_file_path = os.path.join(base_folder, raw_dir, file)

        with gzip.open(gzipped_file_path, 'rt') as gzipped_file:
        # Specify the path to the extracted TSV file (remove '.gz' extension)
            extracted_file_path = os.path.join(extracted_dir, file[:-3])

        # Write the extracted content to the TSV file
            with open(extracted_file_path, 'w') as extracted_file:
                extracted_file.write(gzipped_file.read())

            if file == 'title.ratings.tsv.gz':
                df_ratings = pd.read_csv(extracted_file_path, sep='\t')
            elif file == 'title.crew.tsv.gz':
                df_crew = pd.read_csv(extracted_file_path, sep='\t')
            elif file == 'name.basics.tsv.gz':
                df_names = pd.read_csv(extracted_file_path, sep='\t')
            else:
                df_basics = pd.read_csv(extracted_file_path, sep='\t')


  df_basics = pd.read_csv(os.path.join(extracted_dir, 'title.basics.tsv'), sep= '\t')


## Cleaning Steps

### Basics File

In [5]:
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [6]:
# Consider just movies
df_basics = df_basics[df_basics['titleType']=='movie']

In [7]:
# Drop Original Title and end year
df_basics.drop(columns=['originalTitle', 'endYear'], inplace= True)

In [8]:
# Drop rows with Start year, run time minutes and genres null
df_basics = df_basics[df_basics['startYear']!="\\N"]
df_basics = df_basics[df_basics['runtimeMinutes']!="\\N"]
df_basics = df_basics[df_basics['genres']!="\\N"]

In [9]:
# Convert startYear, runtimeMinutes  columns to integer
df_basics['startYear']=df_basics['startYear'].astype('int32')
df_basics['runtimeMinutes']=df_basics['runtimeMinutes'].astype('int32')

In [10]:
# Consider just movies filmed between 1970 and 2022
df_basics= df_basics[(df_basics['startYear']>1970) & (df_basics['startYear']<2023)]

### Crew File

In [11]:
df_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


In [12]:
# Drop Writers
df_crew.drop(columns=['writers'], inplace= True)

### Names File

In [13]:
df_names.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0027125,tt0050419,tt0053137,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0117057,tt0075213,tt0038355,tt0037382"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0049189,tt0054452,tt0056404,tt0057345"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0080455,tt0072562,tt0077975,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0083922,tt0050976"


In [14]:
# Rename column of directors for later merge
df_names = df_names.rename(columns={'nconst':'directors'})

In [15]:
df_names.drop(columns=['deathYear', 'primaryProfession', 'knownForTitles'], inplace= True)

In [16]:
df_names.head()

Unnamed: 0,directors,primaryName,birthYear
0,nm0000001,Fred Astaire,1899
1,nm0000002,Lauren Bacall,1924
2,nm0000003,Brigitte Bardot,1934
3,nm0000004,John Belushi,1949
4,nm0000005,Ingmar Bergman,1918


### Merging crew and names file

In [17]:
df_crew= df_crew.merge(df_names, how='left', on='directors')

In [18]:
df_crew.head()

Unnamed: 0,tconst,directors,primaryName,birthYear
0,tt0000001,nm0005690,William K.L. Dickson,1860
1,tt0000002,nm0721526,Émile Reynaud,1844
2,tt0000003,nm0721526,Émile Reynaud,1844
3,tt0000004,nm0721526,Émile Reynaud,1844
4,tt0000005,nm0005690,William K.L. Dickson,1860


In [19]:
df_crew.iloc[-1]

tconst                   tt9916880
directors      nm0584014,nm0996406
primaryName                    NaN
birthYear                      NaN
Name: 10547109, dtype: object

In [20]:
# For cases with more than one director, keep the first
df_crew['directors'] = df_crew['directors'].apply(lambda x: x.split(',')[0] if ',' in str(x) else x)

### Merging Basics, Ratings and Crew Files

In [21]:
len(df_basics)

306532

In [22]:
df_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2014
1,tt0000002,5.7,270
2,tt0000003,6.5,1937
3,tt0000004,5.5,178
4,tt0000005,6.2,2712


In [23]:
df_ratings.dtypes

tconst            object
averageRating    float64
numVotes           int64
dtype: object

In [24]:
df_merge = df_basics.merge(df_ratings, how='left', on='tconst')

In [25]:
len(df_merge)

306532

In [26]:
df_merge = df_merge.merge(df_crew, how='left', on='tconst')

In [27]:
df_merge.head()

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes,directors,primaryName,birthYear
0,tt0013274,movie,Istoriya grazhdanskoy voyny,0,2021,94,Documentary,6.8,63.0,nm0412842,,
1,tt0015724,movie,Dama de noche,0,1993,102,"Drama,Mystery,Romance",6.1,28.0,nm0529960,Eva López Sánchez,1954.0
2,tt0028248,movie,Shipmates o' Mine,0,2022,87,Musical,4.2,42.0,nm0593632,Oswald Mitchell,1897.0
3,tt0035423,movie,Kate & Leopold,0,2001,118,"Comedy,Fantasy,Romance",6.4,88385.0,nm0003506,James Mangold,1963.0
4,tt0036606,movie,"Another Time, Another Place",0,1983,118,"Drama,War",6.4,350.0,nm0705535,Michael Radford,1946.0


In [29]:
# Save to csv
df_merge.to_csv(path_or_buf=os.path.join(clean_dir,'clean_df.csv'), index=False)