In [1]:
import pandas as pd

In [52]:
# Read the two CSV inputs that the rest of the notebook combines.
votes_path, rating_path = "tmdb_top_movies.csv", "movieDataset.csv"
votesdf = pd.read_csv(votes_path)
ratingdf = pd.read_csv(rating_path)

In [53]:
# First step of the union: keep only the votesdf rows whose tconst does not
# already occur in ratingdf, so titles present in both tables are taken once.
already_in_rating = votesdf['tconst'].isin(ratingdf['tconst'])
temp = votesdf.loc[~already_in_rating]


In [54]:
# Stack ratingdf and the remaining votes rows vertically; ignore_index=True
# discards both original indexes and renumbers the rows from 0.
mergedf = pd.concat(objs=[ratingdf, temp], axis=0, ignore_index=True)


In [55]:
# One-hot encode primaryGenre; drop_first=True leaves out the first genre level.
# Original author's marker: #MOVE (presumably this step is meant to be relocated — confirm).
genre_columns = ['primaryGenre']
df = pd.get_dummies(data=mergedf, columns=genre_columns, drop_first=True)

In [56]:
# Columns that must end up holding integers.
int_columns = ['startYear']

# Values that cannot be parsed as numbers become NaN, NaN becomes 0, and the
# column is then cast to int.
for col in int_columns:
    parsed = pd.to_numeric(df[col], errors='coerce')
    df[col] = parsed.fillna(0).astype(int)


In [57]:
# Text clean-up on the whole frame:
#   * remove newline characters found inside string cells,
#   * turn every missing value (NaN) into an empty string,
#   * rewrite the "\N" marker in runtimeMinutes with the string '0'.
#
# A frame-wide `df.replace(r'\\N', '0', regex=True)` call used to sit between the
# first two steps. Its result was never assigned and inplace was not set, so it
# never changed df; it has been removed instead of being silently activated.
# NOTE(review): if "\N" is meant to become '0' in every column, add
# `df = df.replace(r'\\N', '0', regex=True)` deliberately — it would then apply to
# all columns, not just runtimeMinutes.
df = df.replace({'\n': ""}, regex=True)
df = df.fillna("")
df['runtimeMinutes'] = df['runtimeMinutes'].replace(r'\\N', '0', regex=True)  # replacement is the string '0'

In [58]:

def get_top_n_values(df, column, n=10):
    """
    List the n values that occur most often in one column of a DataFrame.

    Args:
        df (pd.DataFrame): Frame holding the column to inspect.
        column (str): Name of the column whose values are counted.
        n (int): How many of the most frequent values to keep.

    Returns:
        list: The kept values, ordered as value_counts() orders them
        (most frequent first).
    """
    frequency = df[column].value_counts()
    most_common = frequency.index[:n]
    return most_common.tolist()


In [70]:
# How many of the most frequent ids to keep per column.
# NOTE: the variable names below say "top100", but the lists hold 20 actor ids
# and 10 director ids; the names are kept because later cells refer to them.
actor_top_n = 20
director_top_n = 10

actor1_top100 = get_top_n_values(df, column='actor1', n=actor_top_n)
actor2_top100 = get_top_n_values(df, column='actor2', n=director_top_n * 2)
directors_top100 = get_top_n_values(df, column='directors', n=director_top_n)

# NOTE(review): in the recorded output the most frequent actor1 value is the
# empty string, so one of the kept slots is taken by that placeholder.
print(actor1_top100)

['', 'nm0000115', 'nm0000158', 'nm0000134', 'nm0001191', 'nm0000329', 'nm0000136', 'nm0000246', 'nm0000142', 'nm0000230', 'nm0000154', 'nm0000243', 'nm0000206', 'nm0000129', 'nm0000242', 'nm0000553', 'nm0000552', 'nm0425005', 'nm0000216', 'nm0000354']


In [61]:
def one_hot_top_values(df, column, top_values, prefix=None):
    """
    One-hot encode a single column using only the provided top values.

    Each row gets exactly one 1: in the column of the top value it equals, or in
    the '<prefix>_other' column when it equals none of them.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column (str): The column to encode.
        top_values (list): Values to keep as individual one-hot columns.
            Repeated entries are collapsed; first-seen order is preserved.
        prefix (str): Optional prefix for the new columns (defaults to `column`).

    Returns:
        pd.DataFrame: int64 0/1 frame sharing df's index, with one column per
        distinct top value followed by '<prefix>_other'.
    """
    prefix = prefix or column
    # Collapse repeated entries so no indicator column is emitted twice.
    kept_values = list(dict.fromkeys(top_values))
    source = df[column]

    # One vectorised comparison per kept value instead of a Python loop over rows.
    # Plain arrays keyed by position are used so that nothing is aligned on the
    # index; this keeps rows independent even when df's index repeats labels.
    one_hot_df = pd.DataFrame(
        {pos: source.eq(val).to_numpy() for pos, val in enumerate(kept_values)},
        index=df.index,
    )
    # A row belongs to "other" exactly when none of the comparisons matched.
    one_hot_df[len(kept_values)] = ~one_hot_df.any(axis=1).to_numpy()

    one_hot_df = one_hot_df.astype("int64")
    one_hot_df.columns = [f"{prefix}_{val}" for val in kept_values] + [f"{prefix}_other"]
    return one_hot_df


In [71]:
# Encode actor1 against its most frequent ids, append the indicator columns to
# df, and leave the raw actor1 column out of the result.
one_hot = one_hot_top_values(df, column='actor1', top_values=actor1_top100, prefix='actor1')
df_encoded = pd.concat([df, one_hot], axis='columns').drop(columns='actor1')


In [72]:
# Same treatment for actor2: the indicators are computed from df (which still
# has the raw column) and appended to the frame built so far.
one_hot = one_hot_top_values(df, column='actor2', top_values=actor2_top100, prefix='actor2')
df_encoded = pd.concat([df_encoded, one_hot], axis='columns').drop(columns='actor2')


In [73]:
# Same treatment for directors: compute the indicators from df, append them to
# the frame built so far, and remove the raw directors column.
one_hot = one_hot_top_values(df, column='directors', top_values=directors_top100, prefix='directors')
df_encoded = pd.concat([df_encoded, one_hot], axis='columns').drop(columns='directors')


In [74]:
# Preview the first five rows of the encoded frame (head() defaults to n=5).
df_encoded.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,rating,primaryGenre_Adult,primaryGenre_Adventure,primaryGenre_Animation,primaryGenre_Biography,primaryGenre_Comedy,...,directors_nm0000095,directors_nm0000142,directors_nm0000217,directors_nm0000229,directors_nm0000033,directors_nm0000631,directors_nm0000165,directors_nm0001752,directors_nm0005062,directors_other
0,tt0111161,The Shawshank Redemption,1994,142,9.295191,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,1
1,tt0068646,The Godfather,1972,175,9.193354,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,1
2,tt0468569,The Dark Knight,2008,152,8.995661,False,False,False,False,False,...,0,0,0,0,0,0,0,0,0,1
3,tt0167260,The Lord of the Rings: The Return of the King,2003,201,8.993709,False,True,False,False,False,...,0,0,0,0,0,0,0,0,0,1
4,tt0108052,Schindler's List,1993,195,8.991416,False,False,False,True,False,...,0,0,0,1,0,0,0,0,0,0


In [75]:
# Write the encoded frame to CSV; index=False keeps the row index out of the file.
df_encoded.to_csv(path_or_buf='fullDataset.csv', index=False)