In [28]:
import pandas as pd

# Load the test set (the movies to label) and the IMDb metadata (the source of
# the raw scores).
df1 = pd.read_csv('./project_data/test_dataset.csv')
df2 = pd.read_csv('./project_data/movie_metadata.csv')

# Separator placed between title and director when building the lookup key.
# Without one, different (title, director) pairs can collapse into the same
# string (e.g. "AB" + "C" and "A" + "BC") and pick up the wrong score.
KEY_SEPARATOR = ' | '

# Map "<movie_title> | <director_name>" -> IMDb score, built from the metadata.
# astype(str) turns a missing director into the literal 'nan', so those rows
# still get a string key. If the metadata repeats a key, the last row wins.
movie_director_name = dict(zip(
    df2['movie_title'] + KEY_SEPARATOR + df2['director_name'].astype(str),
    df2['imdb_score'],
))

# Build the same key for every test row and look its score up.
df1['movie_director_name'] = (
    df1['movie_title'] + KEY_SEPARATOR + df1['director_name'].astype(str)
)
imdb_scores_df1 = df1['movie_director_name'].map(movie_director_name)

# Rows with no usable score come back as NaN and end up with an empty bin in
# result.csv. Report them instead of letting them pass unnoticed; the file is
# still written so the cell stays best-effort.
unmatched_count = int(imdb_scores_df1.isna().sum())
if unmatched_count:
    print(
        f"Warning: {unmatched_count} of {len(df1)} rows have no IMDb score "
        "in the metadata; their bin is left empty."
    )

# Bin edges are right-closed (pd.cut default):
# (-1, 3] -> 0, (3, 5] -> 1, (5, 7] -> 2, (7, 8] -> 3, (8, 10] -> 4.
bins = [-1, 3, 5, 7, 8, 10]
labels = [0, 1, 2, 3, 4]

# Attach the looked-up raw score and its bin label to the test set.
df1['imdb_score'] = imdb_scores_df1
df1['imdb_score_binned'] = pd.cut(df1['imdb_score'], bins=bins, labels=labels)

# Show the labelled rows for a visual sanity check.
print("DataFrame with IMDb Score Binned:")
display(df1[['id', 'director_name', 'movie_title', 'title_year', 'imdb_score', 'imdb_score_binned']])

# Keep only the identifier and the predicted bin.
result_df = df1[['id', 'imdb_score_binned']]

# Write the result without the pandas index column.
result_df.to_csv('result.csv', index=False)

print("CSV file 'result.csv' created successfully.")


DataFrame with IMDb Score Binned:


Unnamed: 0,id,director_name,movie_title,title_year,imdb_score,imdb_score_binned
0,1,Angelo Pizzo,Courage,2015,7.0,2
1,2,Oliver Stone,Savages,2012,6.5,2
2,3,Peter Stebbings,Defendor,2009,6.8,2
3,4,Sam Mendes,Road to Perdition,2002,7.7,3
4,5,Maggie Carey,The To Do List,2013,5.8,2
...,...,...,...,...,...,...
747,748,Wes Anderson,Rushmore,1998,7.7,3
748,749,Timur Bekmambetov,Abraham Lincoln: Vampire Hunter,2012,5.9,2
749,750,James Ivory,A Room with a View,1985,7.4,3
750,751,Steve Trenbirth,The Jungle Book 2,2003,5.4,2


CSV file 'result.csv' created successfully.


In [27]:
import pandas as pd

# Load the training set and the IMDb metadata.
# NOTE: the variable names are historical and read backwards:
#   regular_scores_df -> training rows, which carry the *binned* label
#   binned_scores_df  -> metadata rows, which carry the *raw* imdb_score
regular_scores_df = pd.read_csv('./project_data/train_dataset.csv')
binned_scores_df = pd.read_csv('./project_data/movie_metadata.csv')

# Join key "<movie_title> <director_name>" in both frames. A missing title or
# director makes the key NaN, and such rows are never matched.
regular_scores_df['title_director'] = regular_scores_df['movie_title'] + ' ' + regular_scores_df['director_name']
binned_scores_df['title_director'] = binned_scores_df['movie_title'] + ' ' + binned_scores_df['director_name']

# One raw score per key: the first metadata row carrying that key. Dropping NaN
# keys and duplicates also gives the unique index that Series.map requires.
first_score_by_key = (
    binned_scores_df
    .dropna(subset=['title_director'])
    .drop_duplicates(subset='title_director', keep='first')
    .set_index('title_director')['imdb_score']
)

# Vectorised lookup: replaces scanning the whole metadata frame once per
# training row. Training rows without a match get NaN.
matched_scores = regular_scores_df['title_director'].map(first_score_by_key)

# Smallest and largest raw score observed for each bin label. sort=False keeps
# the labels in order of first appearance in the training data. Rows with no
# matched score are excluded so they cannot contaminate the bounds.
bounds_by_bin = (
    regular_scores_df
    .assign(matched_imdb_score=matched_scores)
    .dropna(subset=['matched_imdb_score'])
    .groupby('imdb_score_binned', sort=False)['matched_imdb_score']
    .agg(lower_bound='min', upper_bound='max')
)

# {bin label: {'lower_bound': ..., 'upper_bound': ...}}
score_mappings = bounds_by_bin.to_dict(orient='index')

# Display the mappings with lower and upper bounds
print("Mappings between IMDb Score Binned and Lower/Upper Bounds:")
for imdb_score_binned, bounds in score_mappings.items():
    lower_bound = bounds['lower_bound']
    upper_bound = bounds['upper_bound']
    print(f"IMDb Score Binned: {imdb_score_binned} -> Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")


Mappings between IMDb Score Binned and Lower/Upper Bounds:
IMDb Score Binned: 4 -> Lower Bound: 8.1, Upper Bound: 9.3
IMDb Score Binned: 2 -> Lower Bound: 5.1, Upper Bound: 7.0
IMDb Score Binned: 3 -> Lower Bound: 7.1, Upper Bound: 8.0
IMDb Score Binned: 0 -> Lower Bound: 1.6, Upper Bound: 3.0
IMDb Score Binned: 1 -> Lower Bound: 3.1, Upper Bound: 5.0
