## Data Loading

In [5]:
import pandas as pd
# Show every column when a DataFrame is rendered; None removes the column limit.
pd.options.display.max_columns = None

In [6]:
# Load the two raw datasets from the sibling data directory.
# Forward slashes are accepted as path separators on Windows, macOS and Linux,
# whereas backslashes are only treated as separators on Windows. This also
# matches the "../data/..." style used when the joined frame is saved.
df_1 = pd.read_csv('../data/19-07-2025_1000movies_df')
df_2 = pd.read_csv('../data/19-07-2025_imdb_movies_series_df')

## Summarized Wrangling

In [7]:
# Wrapping each wrangling step in its own function

def clean_df_1(data: pd.DataFrame) -> pd.DataFrame:
    """
    Standardise the column names of the 1000-movies frame and tag its rows.

    Args:
        data: Raw frame. Columns listed in the mapping below are renamed;
            any mapping key that is not present in ``data`` is ignored and
            all other columns are carried over unchanged.

    Returns:
        pd.DataFrame: A new frame with the renamed columns plus a ``type``
        column set to ``'movie'`` on every row. ``data`` itself is not
        modified.
    """
    # Source column name -> shared column name used after the join.
    column_names = {
        'Series_Title': 'title',
        'Overview': 'description',
        'IMDB_Rating': 'rating',
        'No_of_Votes': 'votes_count',
        'Released_Year': 'release_year',
        'Gross': 'total_sales',
        'Runtime': 'runtime',
        'Genre': 'genres',
        'Director': 'director',
    }
    movies = data.rename(columns=column_names)

    # Every record coming from this dataset is labelled as a movie.
    movies.loc[:, 'type'] = 'movie'

    return movies


def clean_df_2(data: pd.DataFrame) -> pd.DataFrame:
    """
    Standardise the IMDb movies/series frame so it can be stacked with the
    1000-movies frame.

    Steps: rename the chosen columns to the shared names, drop rows that are
    exact duplicates across all columns, strip the list punctuation
    (``[``, ``]`` and ``'``) from the ``genres`` strings, and add an empty
    ``director`` column.

    Args:
        data: Raw frame. It must contain a string-valued ``genres`` column;
            rename keys that are absent from ``data`` are ignored.

    Returns:
        pd.DataFrame: A new, de-duplicated frame with a fresh 0..n-1 index.
        ``data`` itself is not modified.

    Raises:
        KeyError: If ``data`` has no ``genres`` column.
        AttributeError: If ``genres`` does not hold string values.
    """
    # Renaming chosen columns.
    renamed = data.rename(columns={
        'originalTitle': 'title',
        'averageRating': 'rating',
        'numVotes': 'votes_count',
        'grossWorldwide': 'total_sales',
        'runtimeMinutes': 'runtime',
        'startYear': 'release_year',
    })

    # Dropping duplicates. reset_index(drop=True) discards the old index
    # outright; the earlier reset_index().drop('index', axis=1) removed a real
    # data column whenever the input already had one named 'index' (leaving a
    # stray 'level_0' behind) and raised KeyError when the index was named.
    # NOTE(review): drop_duplicates() compares every column, so a unique
    # leftover index column (e.g. 'Unnamed: 0') would keep any row from
    # matching another - confirm against the raw file.
    deduped = renamed.drop_duplicates().reset_index(drop=True)

    # Converting list-like genre strings, e.g. "['Drama', 'Crime']", to the
    # plain form "Drama, Crime".
    deduped['genres'] = deduped['genres'].str.replace(r'\[|\]|\'', '', regex=True)

    # Creating an empty director column for the later join.
    deduped.loc[:, 'director'] = None

    return deduped


def join_dfs(df_1000: pd.DataFrame, df_imdb: pd.DataFrame) -> pd.DataFrame:
    """
    Clean both raw frames and stack them into one title-deduplicated frame.

    Each input is passed through its cleaner (``clean_df_1`` for ``df_1000``,
    ``clean_df_2`` for ``df_imdb``). The cleaned frames are concatenated
    row-wise keeping only the columns present in both (``join='inner'``),
    then rows whose ``title`` repeats an earlier row are dropped. Because the
    1000-movies frame comes first, its row is the one kept when a title
    appears in both sources.

    Args:
        df_1000: Raw 1000-movies frame, in the form ``clean_df_1`` accepts.
        df_imdb: Raw IMDb movies/series frame, in the form ``clean_df_2``
            accepts.

    Returns:
        pd.DataFrame: Joined dataframe with a fresh 0..n-1 index.

    Raises:
        Exception: Any error raised while cleaning or concatenating is
            printed and then re-raised unchanged.
    """
    try:
        data_1 = clean_df_1(data=df_1000)
        data_2 = clean_df_2(data=df_imdb)

        new_df = (
            pd.concat([data_1, data_2], join='inner', ignore_index=True)
            .drop_duplicates(subset='title')
            .reset_index(drop=True)
        )
        return new_df
    except Exception as e:
        print("error raised: ", e)
        # Re-raise instead of silently returning None: the caller expects a
        # DataFrame and would otherwise fail later with an unrelated
        # AttributeError on the None result.
        raise


## Data Preview

In [9]:
# Build the joined frame from the two raw datasets.
joined_df = join_dfs(df_1, df_2)
# index=False keeps the positional RangeIndex out of the file; written by
# default, it comes back as a stray "Unnamed: 0" column on the next read_csv.
joined_df.to_csv("../data/joined_df.csv", index=False)

In [14]:
# Column names, non-null counts and dtypes of the joined frame.
joined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1692 entries, 0 to 1691
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    1692 non-null   int64  
 1   title         1692 non-null   object 
 2   release_year  1692 non-null   object 
 3   runtime       1364 non-null   object 
 4   genres        1692 non-null   object 
 5   rating        1673 non-null   float64
 6   description   1689 non-null   object 
 7   director      999 non-null    object 
 8   votes_count   1692 non-null   int64  
 9   total_sales   1050 non-null   object 
 10  type          1692 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 145.5+ KB


In [15]:
# Preview the first two rows of the joined frame.
joined_df.head(2)

Unnamed: 0.1,Unnamed: 0,title,release_year,runtime,genres,rating,description,director,votes_count,total_sales,type
0,0,The Shawshank Redemption,1994,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,Frank Darabont,2343110,28341469,movie
1,1,The Godfather,1972,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,Francis Ford Coppola,1620367,134966411,movie
