<a href="https://colab.research.google.com/github/MusicBee/BAN6420_Milestone_Assignment_1/blob/main/Netflix_Data_Visualisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
### IMPORT RELATED LIBRARIES TO PERFORM TASKS
import zipfile
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os


In [29]:
### EXTRACT ZIP FILE AND RENAME FILE
def extract_and_rename(zip_path, extract_to, new_filename):
    """
    Extracts the first CSV file from a zip archive and renames it.

    The ".csv" extension is matched case-insensitively, and macOS metadata
    entries ("__MACOSX/..." or "._name.csv") are ignored so that a resource
    fork is never mistaken for the dataset. An existing file at the
    destination is overwritten.

    Parameters:
    zip_path (str): Path to the zip file.
    extract_to (str): Directory where files will be extracted.
    new_filename (str): New name for the extracted CSV file (with .csv extension).

    Raises:
    FileNotFoundError: If the archive contains no CSV file.
    """
    # Ensure the extraction directory exists
    os.makedirs(extract_to, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Keep only real CSV members of the archive
        csv_files = [
            name for name in zip_ref.namelist()
            if name.lower().endswith('.csv')
            and not name.startswith('__MACOSX/')
            and not os.path.basename(name).startswith('._')
        ]

        if not csv_files:
            raise FileNotFoundError("No CSV file found in the zip archive.")

        # Extract the first CSV. extract() returns the path it actually wrote,
        # which can differ from a naive join when zipfile sanitises the member
        # name (absolute paths, ".." components, illegal characters).
        original_csv = csv_files[0]
        old_path = zip_ref.extract(original_csv, extract_to)

    new_path = os.path.join(extract_to, new_filename)

    # os.replace overwrites an existing destination on every platform, so
    # re-running the notebook does not fail on Windows.
    os.replace(old_path, new_path)
    print(f"Extracted and renamed {original_csv} → {new_filename}")



In [44]:
### PREPARE AND ANALYZE THE EXTRACTED FILE
def prepare_and_analyze(zip_file, output_dir="output", renamed="netflix_shows_movies.csv"):
    """
    Extracts a zipped dataset, cleans it, and writes a summary and plots.

    Steps performed:
    1. Create `output_dir` and a "plots" sub-directory.
    2. Extract the whole archive, then extract the first CSV again under
       the name `renamed` (via `extract_and_rename`).
    3. Load the renamed CSV, drop duplicate rows, fill missing text values
       with "Unknown" and missing numeric values with the column mean.
    4. Save the cleaned data as "cleaned_netflix_data.csv" and a text
       summary as "summary.txt" in `output_dir`.
    5. Save bar charts for the top 10 genres (column "genre" or
       "listed_in") and for the "rating" distribution, when present.

    Parameters:
    zip_file (str): Path to the zip archive containing the dataset.
    output_dir (str): Directory that receives all extracted and generated files.
    renamed (str): File name given to the extracted CSV.

    Raises:
    FileNotFoundError: If the renamed CSV is not present after extraction.
    """
    ### Create an output directory
    output_dir = Path(output_dir)
    plots_dir = output_dir / "plots"
    output_dir.mkdir(parents=True, exist_ok=True)
    plots_dir.mkdir(parents=True, exist_ok=True)
    print(f"Created output directory: {output_dir}")
    print(f"Created plots directory: {plots_dir}")

    ### Unzip and extract the archive
    with zipfile.ZipFile(zip_file, 'r') as zf:
        zf.extractall(output_dir)
    print(f"Extracted {zip_file} to {output_dir}")

    ### Rename the extracted CSV to `renamed`
    extract_and_rename(zip_file, output_dir, renamed)

    # Read exactly the file that was just renamed. Globbing for "*.csv" is
    # ambiguous here: the directory also holds the originally named copy from
    # extractall and, on a re-run, the cleaned output of a previous run.
    data_file = output_dir / renamed
    if not data_file.is_file():
        raise FileNotFoundError(f"No CSV file found in extracted data within {output_dir}.")

    ### READ FILE INTO DATA FRAME FOR CLEANSING
    df = pd.read_csv(data_file)

    # Remove duplicates and fill in the missing values.
    # Columns are assigned back explicitly: `df[col].fillna(..., inplace=True)`
    # acts on a temporary object and is a no-op under pandas Copy-on-Write.
    df.drop_duplicates(inplace=True)
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].fillna("Unknown")
    for col in df.select_dtypes(include=["number"]).columns:
        df[col] = df[col].fillna(df[col].mean())

    cleaned_file = output_dir / "cleaned_netflix_data.csv"
    df.to_csv(cleaned_file, index=False)
    print(f"Saved cleaned dataset at {cleaned_file}")

    ### DATA EXPLORATION
    summary_file = output_dir / "summary.txt"
    # Explicit UTF-8 so non-ASCII values in the description never fail to
    # encode on platforms whose default encoding is not UTF-8.
    with open(summary_file, "w", encoding="utf-8") as f:
        f.write("### Data Info ###\n\n")
        df.info(buf=f)
        f.write("\n\n### Description ###\n\n")
        f.write(str(df.describe(include="all")))
    print(f"Summary saved at {summary_file}")

    ### DATA VISUALISATION
    # Use 'genre' if present, otherwise fall back to 'listed_in'
    genre_column = None
    if "genre" in df.columns:
        genre_column = "genre"
    elif "listed_in" in df.columns:
        genre_column = "listed_in"

    if genre_column:
        plt.figure(figsize=(10,6))
        genres = df[genre_column]
        # 'listed_in' may hold several comma-separated genres per row;
        # count each genre individually.
        if genre_column == "listed_in":
            genres = genres.str.split(', ').explode()
        genres.value_counts().head(10).plot(kind="bar", color="skyblue")

        label = genre_column.replace('_',' ').title()
        plt.title(f"Top 10 Most Watched {label}")
        plt.xlabel(label)
        plt.ylabel("Count")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(plots_dir / f"most_watched_{genre_column}.png")
        plt.close()

    if "rating" in df.columns:
        plt.figure(figsize=(10,6))
        # Bars are ordered from the most to the least frequent rating
        sns.countplot(x="rating", data=df, palette="Set2",
                      order=df["rating"].value_counts().index)
        plt.title("Ratings Distribution")
        plt.xlabel("Rating")
        plt.ylabel("Count")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(plots_dir / "ratings_distribution.png")
        plt.close()



In [None]:
### USAGE
# Location of the zipped dataset; results are written under "myoutput".
dataset_path = "/content/netflix/netflix_data.zip"
prepare_and_analyze(
    zip_file=dataset_path,
    output_dir="/content/netflix/myoutput",
)