# Netflix Data: Cleaning, Analysis, and Visualization (Beginner ML Project)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the Netflix titles CSV into the working DataFrame `df`,
# which every later cell operates on.
file_path = '/mnt/data/netflix1.csv'
df = pd.read_csv(filepath_or_buffer=file_path)


In [None]:
# Quick look at the data: first rows, column schema, and summary statistics.
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" afterwards.
df.info()
display(df.describe())


In [None]:
# Count nulls per column, then show only the columns that actually have some.
missing_values = df.isna().sum()
display(missing_values.loc[missing_values.gt(0)])


In [None]:
# Impute missing values: median for numeric columns, most frequent value
# (mode) for text columns.
#
# The filled Series is assigned back to the DataFrame instead of calling
# fillna(inplace=True) on df[column]: the in-place form operates on an
# intermediate object, which triggers a chained-assignment FutureWarning on
# pandas 2.x and leaves df unchanged once Copy-on-Write is enabled.
for column in df.select_dtypes(include=['number']).columns:
    df[column] = df[column].fillna(df[column].median())

for column in df.select_dtypes(include=['object']).columns:
    modes = df[column].mode()
    # mode() returns an empty Series when every value in the column is
    # missing; there is nothing to impute with, so leave the column as is
    # rather than failing on the lookup of the first mode.
    if not modes.empty:
        df[column] = df[column].fillna(modes.iloc[0])


In [None]:
# Sanity check after imputation: total number of null cells in the frame.
null_counts = df.isna().sum()
display(null_counts.sum())


In [None]:
# Descriptive statistics restricted to the numeric columns.
# `numeric_columns` is kept at module level because later cells reuse it.
numeric_columns = df.select_dtypes(include='number').columns
display(df.loc[:, numeric_columns].describe())


In [None]:
# Correlation heatmap of the numeric features.
# Correlation is computed on the numeric columns only: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# a ValueError when the frame contains text columns.
numeric_corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 5))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()


In [None]:
# One histogram per numeric column (30 bins each) on a shared 12x10 canvas.
df.hist(bins=30, figsize=(12, 10))
plt.show()


In [None]:
# Side-by-side boxplots of the numeric features to spot outliers.
numeric_frame = df[numeric_columns]
plt.figure(figsize=(12, 6))
sns.boxplot(data=numeric_frame)
plt.title('Boxplot of Numerical Features')
# Column names can be long; rotate the tick labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()


In [None]:
# Count plots for categorical variables.
# Each plot is limited to the most frequent levels: text columns that are
# unique or near-unique per row (ids, titles) would otherwise yield thousands
# of bars, which is unreadable and very slow to render.
categorical_columns = df.select_dtypes(include=['object']).columns
max_levels = 20  # maximum number of categories drawn per plot
for column in categorical_columns:
    level_counts = df[column].value_counts()  # sorted most frequent first
    shown_levels = level_counts.index[:max_levels]
    plt.figure(figsize=(10, 4))
    # Passing a subset as `order` restricts the plot to those levels.
    sns.countplot(y=df[column], order=shown_levels)
    if len(level_counts) > max_levels:
        # Make the truncation visible so the plot is not misread as complete.
        plt.title(f'Count Plot for {column} (top {max_levels} of {len(level_counts)})')
    else:
        plt.title(f'Count Plot for {column}')
    plt.show()


In [None]:
# Insights & Summary
print("\nNetflix Data Analysis Summary:")
print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")
# Report the actual state instead of asserting it: imputation can leave nulls
# behind (e.g. a column whose values are all missing has no median or mode).
remaining_missing = int(df.isnull().sum().sum())
if remaining_missing == 0:
    print("No missing values remaining after preprocessing.")
else:
    print(f"{remaining_missing} missing values remaining after preprocessing.")
print("Data visualizations provide insights on distributions, correlations, and outliers.")
