In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
import pandas as pd

# Location of the dataset CSV as seen through the mounted Drive.
file_path = '/content/drive/MyDrive/Indian_Kids_Screen_Time.csv'

# Attach Google Drive to the Colab filesystem, then load the CSV into `df`.
drive.mount('/content/drive')
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Quick look at the data: the first five rows, the last five rows, and five
# rows drawn at random.
print("First 5 rows:\n", df.iloc[:5])
print("\nLast 5 rows:\n", df.iloc[-5:])
print("\nRandom 5 rows:\n", df.sample(n=5))


In [None]:
# Structural overview: dimensions, column labels and per-column dtypes.
print("\nShape (rows, cols):", df.shape)
print("\nColumn Names:\n", df.columns)
print("\nData Types:\n", df.dtypes)
print("\nFull Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

In [None]:
# Descriptive statistics over every column (include="all" covers both the
# numeric and the non-numeric ones).
summary_stats = df.describe(include="all")
print("\nSummary Stats:\n", summary_stats)

In [None]:
# Central tendency: mean and median over the numeric columns, plus the first
# row of the per-column mode table.
numeric_mean = df.mean(numeric_only=True)
numeric_median = df.median(numeric_only=True)
first_mode_row = df.mode().iloc[0]

print("\nMean:\n", numeric_mean)
print("\nMedian:\n", numeric_median)
print("\nMode:\n", first_mode_row)

In [None]:
# NOTE(review): this cell runs the same three statements as the cell directly
# before it (mean, median, first mode row) and prints the same report again.
for label, stat in (
    ("\nMean:\n", df.mean(numeric_only=True)),
    ("\nMedian:\n", df.median(numeric_only=True)),
    ("\nMode:\n", df.mode().iloc[0]),  # first mode row
):
    print(label, stat)

In [None]:
# Per-column count of missing values.
print("Null values in each column:")
print(df.isna().sum())

# Single flag: does the frame contain any missing value at all?
print("\nAny null values in dataset?")
print(df.isna().any(axis=None))

In [None]:
# Show every record that has a missing value in at least one column.
print("Rows with missing values:")
incomplete_rows = df.isna().any(axis="columns")
print(df.loc[incomplete_rows])

In [None]:
# Positional access with iloc
print(df.iloc[2, :])     # row at position 2 (the third row)
print(df.iloc[1:4, :2])  # positions 1-3, first two columns

# Label-based access with loc. Spaces in the column names are replaced with
# underscores first; `df` keeps the renamed columns after this cell.
df.columns = df.columns.str.replace(" ", "_")
print(df[["Age", "Avg_Daily_Screen_Time_hr"]].loc[:3])  # loc slices include the end label

# First three entries of a single column
print(df["Age"].head(3))

In [None]:
# First three values of the Age column
print(df["Age"].iloc[:3])

# Age and Gender for index labels up to and including 3
print(df.loc[:3][["Age", "Gender"]])

In [None]:
# Keep only the records whose average daily screen time exceeds 5 hours.
high_screen_time = df.loc[df["Avg_Daily_Screen_Time_hr"].gt(5)]
print(high_screen_time)

In [None]:
# Convert the two label columns to pandas' categorical dtype, then show the
# resulting dtype of every column.
for cat_col in ('Gender', 'Urban_or_Rural'):
    df[cat_col] = df[cat_col].astype('category')
print(df.dtypes)

In [None]:
# List the category levels recorded on each categorical column.
print(df['Gender'].dtype.categories)
print(df['Urban_or_Rural'].dtype.categories)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of average daily screen time with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(6,4))
sns.histplot(df['Avg_Daily_Screen_Time_hr'], kde=True, color="brown", ax=ax)
ax.set_title("distribution of screen time in hours")
plt.show()

In [None]:
!pip install seaborn matplotlib

In [None]:
# Bar chart of how many records fall into each Gender category.
plt.figure(figsize=(6,4))
ax = sns.countplot(x='Gender', data=df)
ax.set(title='Count of Kids by Gender', xlabel='Gender', ylabel='Count')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Gender count plot.
# seaborn 0.13 deprecates passing `palette` without `hue` (scheduled for
# removal in v0.14). Mapping the x variable to `hue` and disabling the legend
# is the replacement seaborn recommends; it keeps one colour per category and
# silences the FutureWarning. Requires seaborn >= 0.13 for `legend=`.
plt.figure(figsize=(6,4))
sns.countplot(x='Gender', hue='Gender', data=df, palette='Set2', legend=False)
plt.title("Gender Count")
plt.show()

# Location count plot (same hue/legend treatment as the Gender plot).
plt.figure(figsize=(8,5))
sns.countplot(x='Urban_or_Rural', hue='Urban_or_Rural', data=df, palette='Set3', legend=False)
plt.title("Location Count")
plt.xticks(rotation=45)  # rotate labels if they are long
plt.show()

In [None]:
# Bar chart of how many records use each primary device.
fig, ax = plt.subplots(figsize=(8,4))
sns.countplot(x='Primary_Device', data=df, ax=ax)
ax.set_title('Count of Kids by Device Type')
ax.set_xlabel('Device Type')
ax.set_ylabel('Count')
plt.xticks(rotation=45)  # tilt the tick labels so they do not overlap
plt.show()

In [None]:
# Violin plot of daily screen time per gender, with custom colours.
# seaborn 0.13 deprecates `palette` without `hue` (removal planned for v0.14);
# assigning the x variable to `hue` and turning the legend off gives the same
# per-category colouring without the FutureWarning. Requires seaborn >= 0.13.
# NOTE(review): the palette lists two colours — assumes Gender has exactly two
# levels; confirm against the data.
plt.figure(figsize=(6,4))
sns.violinplot(
    data=df,
    x='Gender',
    y='Avg_Daily_Screen_Time_hr',
    hue='Gender',
    palette=["#1f77b4", "#ff7f0e"],  # blue, orange
    legend=False
)
plt.title('Daily Screen Time Distribution by Gender')
plt.xlabel('Gender')
plt.ylabel('Avg Daily Screen Time (hrs)')
plt.show()

In [None]:
# Violin plot of daily screen time for each age value.
# seaborn 0.13 deprecates `palette` without `hue` (removal planned for v0.14);
# the x variable is therefore also assigned to `hue`, with the legend off.
# Requires seaborn >= 0.13.
# NOTE(review): Age is presumably numeric, in which case seaborn may apply
# "coolwarm" as a continuous ramp and the shades can differ slightly from the
# deprecated call — confirm the rendered colours are acceptable.
plt.figure(figsize=(12,6))
sns.violinplot(
    data=df,
    x='Age',
    y='Avg_Daily_Screen_Time_hr',
    hue='Age',
    palette="coolwarm",   # change palette here
    inner="quartile",     # show quartile lines
    legend=False
)
plt.title('Daily Screen Time Distribution by Age', fontsize=14)
plt.xlabel('Age')
plt.ylabel('Avg Daily Screen Time (hrs)')
plt.xticks(rotation=45)  # rotate if age values overlap
plt.show()

In [None]:
# Notched box plot of daily screen time per age, with the individual
# observations overlaid as a swarm.
# seaborn 0.13 deprecates `palette` without `hue` (removal planned for v0.14);
# the x variable is therefore also assigned to `hue`, with the legend off.
# Requires seaborn >= 0.13.
# NOTE(review): Age is presumably numeric, in which case "viridis" may be
# applied as a continuous ramp and shades can differ slightly from the
# deprecated call — confirm the rendered colours are acceptable.
plt.figure(figsize=(12, 6))
sns.boxplot(
    x="Age",
    y="Avg_Daily_Screen_Time_hr",
    hue="Age",
    data=df,
    palette="viridis",   # colour palette for the boxes
    notch=True,          # adds notch for median confidence interval
    width=0.6,           # narrower boxes for clarity
    legend=False
)

# Overlay individual data points.
# NOTE(review): swarmplot can be slow and may warn about unplaceable points
# when a category holds many observations — verify on the full dataset.
sns.swarmplot(
    x="Age",
    y="Avg_Daily_Screen_Time_hr",
    data=df,
    color="black",       # points color
    alpha=0.6,           # transparency
    size=3
)

plt.title("Boxplot of Daily Screen Time by Age", fontsize=16, fontweight='bold')
plt.xlabel("Age", fontsize=12)
plt.ylabel("Average Daily Screen Time (hrs)", fontsize=12)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Boxplot: Screen Time by Gender
plt.figure(figsize=(8, 6))

# Notched box plot with custom colours.
# seaborn 0.13 deprecates `palette` without `hue` (removal planned for v0.14);
# assigning the x variable to `hue` and turning the legend off gives the same
# per-category colouring without the FutureWarning. Requires seaborn >= 0.13.
# NOTE(review): the palette lists two colours — assumes Gender has exactly two
# levels; confirm against the data.
sns.boxplot(
    x="Gender",
    y="Avg_Daily_Screen_Time_hr",
    hue="Gender",
    data=df,
    palette=["#66c2a5", "#fc8d62"],  # custom colors
    notch=True,
    width=0.5,
    legend=False
)

# Overlay data points.
# NOTE(review): swarmplot can be slow and may warn about unplaceable points
# when a category holds many observations — verify on the full dataset.
sns.swarmplot(
    x="Gender",
    y="Avg_Daily_Screen_Time_hr",
    data=df,
    color="black",
    alpha=0.6,
    size=3
)

plt.title("Boxplot of Daily Screen Time by Gender", fontsize=16, fontweight="bold")
plt.xlabel("Gender", fontsize=12)
plt.ylabel("Average Daily Screen Time (hrs)", fontsize=12)
plt.show()

In [None]:
# Histograms for continuous variables
import matplotlib.pyplot as plt
import seaborn as sns

# Switch seaborn to the "whitegrid" style for the figures drawn from here on.
sns.set(style="whitegrid")

# Columns treated as continuous for the distribution plots.
continuous_vars = ["Avg_Daily_Screen_Time_hr", "Age"]

# One figure per column: a 20-bin histogram with a KDE curve on top.
for col in continuous_vars:
    fig, ax = plt.subplots(figsize=(8,5))
    sns.histplot(df[col], kde=True, bins=20, color="#69b3a2", alpha=0.6, ax=ax)
    ax.set_title(f"Histogram & KDE of {col}", fontsize=14)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    plt.show()

In [None]:
# Filled density curves for each continuous column, split by Gender.
# Relies on `continuous_vars` defined by the histogram cell.
for col in continuous_vars:
    plt.figure(figsize=(8,5))
    ax = sns.kdeplot(x=col, hue="Gender", data=df, fill=True, alpha=0.5)
    ax.set_title(f"KDE of {col} by Gender", fontsize=14)
    ax.set_xlabel(col)
    ax.set_ylabel("Density")
    plt.show()

In [None]:
# Scatter of age against average daily screen time, coloured by Gender.
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(x='Age', y='Avg_Daily_Screen_Time_hr', hue='Gender', s=90, data=df, ax=ax)
ax.set_title("Scatterplot: Age vs Average Daily Screen Time", fontsize=14)
ax.set_xlabel("Age (years)")
ax.set_ylabel("Average Daily Screen Time (hours)")
ax.grid(True)
plt.show()