# Checkpoint Three: Cleaning Data

Now you are ready to clean your data. Before starting coding, provide the link to your dataset below.

My dataset:

Import the necessary libraries and create your dataframe(s).

In [14]:
# Data handling
import numpy as np
import pandas as pd

# Plotting. `plt` must be the pyplot submodule: the bare `matplotlib` package
# does not provide plt.plot / plt.subplots / plt.show.
import matplotlib.pyplot as plt
import seaborn as sns



: 

## Missing Data

Test your dataset for missing data and handle it as needed. Make notes in the form of code comments as to your thought process.

In [None]:
# Load the three source tables (paths are relative to the notebook's working directory).
movies_total_gross = pd.read_csv('disney_movies_total_gross.csv')
disney_revenue = pd.read_csv('disney_revenue_1991-2016.csv')
characters = pd.read_csv('Disney_Characters_cleaned.csv')

# Stack the tables on top of each other. pd.concat aligns on column names, so a
# column that a table does not have is filled with NaN for that table's rows.
disney_main_df = pd.concat(
    [movies_total_gross, disney_revenue, characters],
    ignore_index=True,
)

# head is a method and has to be called; printing the bare attribute only
# shows the bound method's repr instead of the first five rows.
print(disney_main_df.head())
print(disney_main_df.shape)


In [None]:
# keep=False marks every member of a repeated group (not only the later
# copies), so all rows involved in a duplication are listed together.
is_duplicated = disney_main_df.duplicated(keep=False)
duplicates = disney_main_df.loc[is_duplicated]
print(duplicates)

In [None]:
# Number of missing entries in each column of the combined frame.
missing_data = disney_main_df.isna().sum()

# Print only the columns that actually have missing values.
# These counts were already expected to be very high from EDA: most of the data
# is string/object typed and still needs formatting as part of cleaning.
# NOTE(review): the row-wise concat of tables with different columns presumably
# accounts for much of the NaN volume as well - confirm.
has_missing = missing_data > 0
print(missing_data.loc[has_missing])

In [None]:
# Row count (shape[0]) of each source DataFrame, printed one per line.
for label, frame in (
    ("movies_total_gross", movies_total_gross),
    ("disney_revenue", disney_revenue),
    ("characters", characters),
):
    print(f"{label} shape:", frame.shape[0])



In [None]:
# Convert 'total_gross' from currency-formatted text to float by removing the
# '$' and ',' characters.
# Casting to str first (as the 'inflation_adjusted_gross' cell does) keeps the
# .str accessor valid even when the column is already float, so this cell can be
# re-run without raising. Missing values become the text 'nan', which
# astype(float) parses back to NaN.
disney_main_df['total_gross'] = (
    disney_main_df['total_gross']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

print(disney_main_df['total_gross'])

In [None]:
# Convert 'inflation_adjusted_gross' from currency-formatted text to float.
#
# The column is cast to str first so the .str accessor works whatever the
# current dtype is (this also makes the cell safe to re-run), then the dollar
# signs, commas and surrounding whitespace are stripped.
#
# pd.to_numeric(errors='coerce') is the single conversion step: anything that
# still is not a number becomes NaN. Previously an astype(float) ran first and
# would have raised on such a value, so the coerce step could never take effect.
# The trailing astype(float) guarantees a float column even if every value
# happens to parse as an integer.
disney_main_df['inflation_adjusted_gross'] = pd.to_numeric(
    disney_main_df['inflation_adjusted_gross']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip(),
    errors='coerce',
).astype(float)

# Set display options for pandas: thousands separators and two decimals.
# This is a global option and affects every float printed after this cell.
pd.set_option('display.float_format', '{:,.2f}'.format)
print(disney_main_df['inflation_adjusted_gross'])




In [None]:

# Select positions 579 through 660 (the end bound of iloc, 661, is exclusive).
# NOTE(review): presumably these are the rows that did not come from the movie
# gross table - confirm against the source tables' row counts.
rows_to_clean = disney_main_df.iloc[579:661]

# Remove the NaN padding from this slice: drop every column that holds no data
# at all for these rows, then any row that is left completely empty.
# The earlier row-wise dropna required each row to be non-null in a subset that
# mixes movie columns ('genre', 'total_gross', ...) with revenue columns
# ('Year', 'Studio Entertainment[NI 1]'). Assuming those columns originate from
# different source tables, no row can satisfy that and the result was always an
# empty frame - TODO confirm this column-wise clean-up matches the intent.
cleaned_rows = (
    rows_to_clean
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='all')
)

# Display the cleaned DataFrame
print(cleaned_rows.to_string(index=False))


In [None]:
# Remove the NaN padding from rows 579-660: the source CSVs have different
# lengths and contain different data, so columns belonging to another table are
# empty for these rows.
# Select positions 579 through 660 (the end bound of iloc, 661, is exclusive).
rows_to_clean = disney_main_df.iloc[579:661]

# Check the original number of rows
original_row_count = rows_to_clean.shape[0]
print(f"Original row count: {original_row_count}")

# Drop every column that is entirely NaN within the slice, then any row that is
# left completely empty.
# The earlier row-wise dropna required each row to be non-null in a subset that
# mixes movie columns ('genre', 'total_gross', ...) with revenue columns
# ('Year', 'Studio Entertainment[NI 1]'). Assuming those columns originate from
# different source tables, no row can satisfy that and the result was always an
# empty frame - TODO confirm this column-wise clean-up matches the intent.
cleaned_rows = (
    rows_to_clean
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='all')
)

# Report which columns were removed so the clean-up is auditable.
dropped_columns = rows_to_clean.columns.difference(cleaned_rows.columns).tolist()
print(f"Columns dropped because they were entirely NaN: {dropped_columns}")

# Check the new number of rows
new_row_count = cleaned_rows.shape[0]
print(f"New row count after dropping NaNs: {new_row_count}")

# Check for remaining NaN values in the cleaned DataFrame. Some are expected:
# a column is kept as soon as at least one row in the slice has a value in it.
nan_counts = cleaned_rows.isna().sum()
print("\nRemaining NaN values in cleaned DataFrame:")
print(nan_counts)

# Display the cleaned DataFrame
print("\nCleaned DataFrame:")
print(cleaned_rows.to_string(index=False))


In [None]:
# Inspect the 'hero' column, which was observed to be NaN up to row 579 and to
# contain data after that.
hero_column = disney_main_df.loc[:, 'hero']
print(hero_column)

## Irregular Data

Detect outliers in your dataset and handle them as needed. Use code comments to make notes about your thought process.

In [None]:
# Analyze the rows that contain the bulk of the numerical data.
# The slice stops at position 579 (exclusive) so it lines up with the
# iloc[579:661] slice used for the remaining rows; the previous 0:580 bound
# included row 579 in both slices.
# NOTE(review): assumes the first 579 rows are the movie gross table - confirm
# against its row count.
# (numpy is already imported in the notebook's import cell and is not needed here.)
rows_to_analyze = disney_main_df.iloc[0:579]

# describe() gives the basic statistics (count, mean, std, min, quartiles, max)
# of the numeric columns.
print(rows_to_analyze.describe())

In [None]:

# Detect outliers with the 1.5 x IQR rule on the actual data.
#
# Earlier this cell rebuilt `disney_main_df` from five hand-copied summary
# statistics (min, quartiles, max). That discarded the real combined frame and
# meant only those five numbers were tested, so at most the maximum could ever
# be reported. The bounds are now computed from, and applied to, the real rows.
#
# Requires 'total_gross' and 'inflation_adjusted_gross' to be numeric already
# (converted in the earlier cleaning cells).


def _iqr_bounds(values, k=1.5):
    """Return (q1, q3, iqr, lower_bound, upper_bound) for a numeric Series.

    The bounds are q1 - k * iqr and q3 + k * iqr. Series.quantile skips NaN
    entries, so rows without a value do not influence the result.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return q1, q3, iqr, q1 - k * iqr, q3 + k * iqr


# IQR and outlier bounds for 'total_gross'
Q1_gross, Q3_gross, IQR_gross, lower_bound_gross, upper_bound_gross = _iqr_bounds(
    disney_main_df['total_gross']
)

# Outliers in 'total_gross'. Comparisons against NaN are False, so rows that
# have no gross value are never flagged.
total_gross = disney_main_df['total_gross']
outliers_gross = disney_main_df[
    (total_gross < lower_bound_gross) | (total_gross > upper_bound_gross)
]
print("Outliers in 'total_gross':")
# Hide columns that are empty for every flagged row to keep the output readable.
print(outliers_gross.dropna(axis='columns', how='all'))

# IQR and outlier bounds for 'inflation_adjusted_gross'
Q1_adjusted, Q3_adjusted, IQR_adjusted, lower_bound_adjusted, upper_bound_adjusted = _iqr_bounds(
    disney_main_df['inflation_adjusted_gross']
)

# Outliers in 'inflation_adjusted_gross'
adjusted_gross = disney_main_df['inflation_adjusted_gross']
outliers_adjusted = disney_main_df[
    (adjusted_gross < lower_bound_adjusted) | (adjusted_gross > upper_bound_adjusted)
]
print("Outliers in 'inflation_adjusted_gross':")
print(outliers_adjusted.dropna(axis='columns', how='all'))


## Unnecessary Data

Look for the different types of unnecessary data in your dataset and address it as needed. Make sure to use code comments to illustrate your thought process.

In [None]:
# Not using the disney-director or disney-voice-actor CSVs, as I have determined that their information is not relevant to this specific business question.

## Inconsistent Data

Check for inconsistent data and address any that arises. As always, use code comments to illustrate your thought process.

In [None]:

# Rebuild the combined frame from the source CSVs so this section starts from
# the raw, unconverted values.
movies_total_gross = pd.read_csv('disney_movies_total_gross.csv')
disney_revenue = pd.read_csv('disney_revenue_1991-2016.csv')
characters = pd.read_csv('Disney_Characters_cleaned.csv')

disney_main_df = pd.concat([movies_total_gross, disney_revenue, characters], ignore_index=True)

# object columns to convert to strings
object_columns = [
    'movie_title', 'release_date', 'genre', 'MPAA_rating',
    'total_gross', 'inflation_adjusted_gross', 'Disney Media Networks',
    'hero', 'villian', 'song',
]

# Convert object columns to trimmed strings. astype(str) renders a missing
# value as the text 'nan', which is then blanked to an empty string.
for col in object_columns:
    disney_main_df[col] = disney_main_df[col].astype(str).replace('nan', '').str.strip()

# Label missing genre / MPAA_rating values.
# After the loop above the missing entries are empty strings rather than NaN,
# so fillna on its own finds nothing to fill; the blanks are replaced
# explicitly. fillna is kept for any value that is still a real NaN.
# NOTE(review): this labels every row without a genre/rating, including rows
# that came from the non-movie tables - confirm that is intended.
disney_main_df['genre'] = disney_main_df['genre'].replace('', 'Unknown').fillna('Unknown')
disney_main_df['MPAA_rating'] = disney_main_df['MPAA_rating'].replace('', 'Not Rated').fillna('Not Rated')

# Count fully identical rows.
duplicates = disney_main_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Check for unique values in categorical columns
for col in ['genre', 'MPAA_rating']:
    print(f"Unique values in {col}: {disney_main_df[col].unique()}")

# info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in print (which added a stray "None" line).
disney_main_df.info()

In [None]:
# Show the distinct values of each gross column, one array per column.
for gross_col in ('total_gross', 'inflation_adjusted_gross'):
    print(disney_main_df[gross_col].unique())


In [18]:
# Turn empty strings in both gross columns back into NaN so they are treated
# as missing values.
for gross_col in ('total_gross', 'inflation_adjusted_gross'):
    disney_main_df[gross_col] = disney_main_df[gross_col].replace('', np.nan)



In [20]:
# Convert both gross columns to float after cleaning up the strings.
# With regex=True the pattern '$' is the end-of-string anchor, not a literal
# dollar sign, so the previous mapping never removed the '$' and astype(float)
# raised on values that still contained it. Inside a character class both '$'
# and ',' are literal, so r'[$,]' removes exactly those two characters.
# NaN entries are left untouched by replace and stay NaN after the cast.
for gross_col in ('total_gross', 'inflation_adjusted_gross'):
    disney_main_df[gross_col] = (
        disney_main_df[gross_col]
        .replace(r'[$,]', '', regex=True)
        .astype(float)
    )


In [21]:
# Replace the remaining NaN values in both gross columns with 0.
# NOTE(review): zero-filled rows will count as real 0 values in any later
# statistics on these columns - confirm that is intended.
gross_columns = ['total_gross', 'inflation_adjusted_gross']
disney_main_df[gross_columns] = disney_main_df[gross_columns].fillna(0)


## Summarize Your Results

Make note of your answers to the following questions.

1. Did you find all four types of dirty data in your dataset?
    No, I found three of the four; there was no duplicate data that I could find.
2. Did the process of cleaning your data give you new insights into your dataset?
    Yes, it helped me eliminate two CSVs that were not useful, which brought down the total row count. It also showed me the structure in a more understandable way, allowing me to see why there were so many rows in the first place.
3. Is there anything you would like to make note of when it comes to manipulating the data and making visualizations?
    I will definitely be considering total gross and adjusted gross in comparison to year of release and the characters in each film to help make recommendations for merchandise and characters.