In [1]:
import pandas as pd



In [2]:
# Load the data into Pandas DataFrames
basics_df = pd.read_csv('/Users/lidiv/project03/title.basics.tsv.gz', sep='\t', low_memory=False)
ratings_df = pd.read_csv('/Users/lidiv/project03/title.ratings.tsv.gz', sep='\t', low_memory=False)
akas_df = pd.read_csv('/Users/lidiv/project03/title.akas.tsv.gz', sep='\t', low_memory=False)



In [3]:
print(akas_df.columns)
print(basics_df.columns)
print(ratings_df)

Index(['titleId', 'ordering', 'title', 'region', 'language', 'types',
       'attributes', 'isOriginalTitle'],
      dtype='object')
Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')
            tconst  averageRating  numVotes
0        tt0000001            5.7      1993
1        tt0000002            5.8       268
2        tt0000003            6.5      1879
3        tt0000004            5.5       177
4        tt0000005            6.2      2663
...            ...            ...       ...
1351315  tt9916730            8.3        10
1351316  tt9916766            7.0        22
1351317  tt9916778            7.2        36
1351318  tt9916840            8.8         6
1351319  tt9916880            8.2         6

[1351320 rows x 3 columns]


In [4]:
# Include only US movies
us_movies_df = akas_df[akas_df['region'] == 'US']


In [9]:
# Convert 'startYear' column in 'basics_df' to integers
basics_df['startYear'] = pd.to_numeric(basics_df['startYear'], errors='coerce')

In [10]:
# Filter 'basics_df' for movies released between 2000 and 2021 (inclusive)
us_movies_df = us_movies_df[us_movies_df['titleId'].isin(basics_df[(basics_df['startYear'] >= 2000) & (basics_df['startYear'] <= 2021)]['tconst'])]


In [13]:
# Filter for full-length movies (titleType = "movie")
us_movies_df = us_movies_df[us_movies_df['titleId'].isin(basics_df[basics_df['titleType'] == 'movie']['tconst'])]



In [16]:
# Print the column names in your DataFrame
print(us_movies_df.columns)

Index(['titleId', 'ordering', 'title', 'region', 'language', 'types',
       'attributes', 'isOriginalTitle'],
      dtype='object')


In [17]:
# Drop rows with missing values for 'genres' or 'runtimeMinutes' in 'basics_df'
basics_df = basics_df.dropna(subset=['genres', 'runtimeMinutes'])

In [18]:
# Filter 'akas_df' for movies released in the United States
us_movies_df = akas_df[akas_df['region'] == 'US']


In [19]:
# Filter out movies with missing genre or runtime
us_movies_df = us_movies_df[us_movies_df['titleId'].isin(basics_df['tconst'])]

In [20]:
# Exclude movies from the Documentary genre
us_movies_df = us_movies_df[~us_movies_df['titleId'].isin(basics_df[basics_df['genres'].str.contains('Documentary')]['tconst'])]

In [21]:
us_movies_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
36,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
41,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0
42,tt0000005,7,Blacksmithing,US,\N,\N,informal alternative title,0


In [22]:
us_movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1278300 entries, 14 to 37255016
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1278300 non-null  object
 1   ordering         1278300 non-null  int64 
 2   title            1278300 non-null  object
 3   region           1278300 non-null  object
 4   language         1278300 non-null  object
 5   types            1278300 non-null  object
 6   attributes       1278300 non-null  object
 7   isOriginalTitle  1278300 non-null  object
dtypes: int64(1), object(7)
memory usage: 87.8+ MB


In [14]:

# Save the filtered DataFrame as a gzip-compressed CSV file
us_movies_df.to_csv('us_movies_filtered.csv.gz', compression='gzip', index=False)
