In [1]:
import pandas as pd
import numpy as np

# Load dataset
file_path = r"E:\NULLCLASS\datasets\Play Store Data.csv"  # Update this if needed
apps_df = pd.read_csv(file_path)

# ==============================
# 1️⃣ Convert Columns to Proper Data Types
# ==============================

# Convert 'Reviews' to a numeric column. Values that cannot be parsed are
# coerced to NaN (so the column is float, not integer); rows with a NaN
# 'Reviews' are removed by the dropna at the end of this section.
apps_df["Reviews"] = pd.to_numeric(apps_df["Reviews"], errors="coerce")

# Process 'Installs' column: strip ',' and '+' as literal characters (no regex
# needed) and map the whole-cell value 'Free' to '0'.
apps_df["Installs"] = (
    apps_df["Installs"]
    .astype(str)
    .str.replace(",", "", regex=False)  # Remove commas
    .str.replace("+", "", regex=False)  # Remove '+'
    .replace("Free", "0")  # Convert 'Free' to 0
)

# Convert 'Installs' to integers; anything still unparseable is counted as 0
apps_df["Installs"] = pd.to_numeric(apps_df["Installs"], errors="coerce").fillna(0).astype(int)

# Process 'Price' column (remove '$' and handle errors)
apps_df["Price"] = (
    apps_df["Price"]
    .astype(str)
    .str.replace("$", "", regex=False)  # Remove dollar sign
    .replace("Everyone", np.nan)  # Remove incorrect values
    .replace("", "0")  # Replace empty values with 0
)

# Keep only the rows whose 'Price' parses as a number. The explicit .copy()
# makes the filtered frame independent of the original, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
price_is_numeric = pd.to_numeric(apps_df["Price"], errors="coerce").notna()
apps_df = apps_df.loc[price_is_numeric].copy()

# Convert 'Price' to float
apps_df["Price"] = apps_df["Price"].astype(float)

# ==============================
# 2️⃣ Handle Missing Values
# ==============================

# Drop rows where key columns are missing (reassigned rather than mutated in
# place, so re-running the cell never edits a frame shown earlier).
# NOTE(review): 'Size' has not been converted to a number at this point, so
# only values that are already missing are dropped for it - confirm intended.
apps_df = apps_df.dropna(subset=["Category", "Reviews", "Rating", "Installs", "Type", "Size", "Price"])

# ==============================
# 3️⃣ Convert 'Size' to Numeric (MB)
# ==============================

def convert_size(size):
    """Convert a size string such as ``"19M"`` or ``"201k"`` to megabytes.

    Parameters
    ----------
    size : object
        Raw cell value. Only strings ending in a unit suffix ``M`` (megabytes)
        or ``K`` (kilobytes) are converted; the suffix is matched
        case-insensitively.

    Returns
    -------
    float
        The size in MB (kilobyte values are divided by 1024), or ``np.nan``
        when ``size`` is not a string, has no recognised unit suffix, or its
        numeric part cannot be parsed.
    """
    if not isinstance(size, str):
        return np.nan  # Assign NaN if size is missing

    text = size.strip().upper()
    # Unit suffix -> divisor that turns the numeric part into megabytes.
    divisors = {"M": 1.0, "K": 1024.0}
    divisor = divisors.get(text[-1:])
    if divisor is None:
        return np.nan

    try:
        return float(text[:-1]) / divisor
    except ValueError:
        # A unit suffix was present but the rest is not a number: treat the
        # value as missing instead of aborting the whole column conversion.
        return np.nan

# Parse every raw 'Size' value into a number of megabytes.
# NOTE(review): values convert_size cannot parse become NaN and stay in the
# frame, because the dropna on 'Size' ran before this conversion - confirm
# that keeping those rows is intended.
size_in_mb = apps_df["Size"].map(convert_size)
apps_df["Size"] = size_in_mb

# ==============================
# 4️⃣ Convert 'Last Updated' to Date Format
# ==============================

# Unparseable dates are coerced to NaT and the affected rows are discarded.
last_updated = pd.to_datetime(apps_df["Last Updated"], errors="coerce")
apps_df["Last Updated"] = last_updated
apps_df = apps_df.dropna(subset=["Last Updated"])

# ==============================
# 5️⃣ Save the Cleaned Data
# ==============================

# Write the cleaned frame without the index column, then report what was saved.
output_name = "cleaned_data.csv"
apps_df.to_csv(output_name, index=False)
print("✅ Cleaned dataset saved as 'cleaned_data.csv'!")
print("Final dataset shape:", apps_df.shape)
print(apps_df.head())


✅ Cleaned dataset saved as 'cleaned_data.csv'!
Final dataset shape: (9366, 13)
                                                 App        Category  Rating  \
0     Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN     4.1   
1                                Coloring book moana  ART_AND_DESIGN     3.9   
2  U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN     4.7   
3                              Sketch - Draw & Paint  ART_AND_DESIGN     4.5   
4              Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN     4.3   

    Reviews  Size  Installs  Type  Price Content Rating  \
0     159.0  19.0     10000  Free    0.0       Everyone   
1     967.0  14.0    500000  Free    0.0       Everyone   
2   87510.0   8.7   5000000  Free    0.0       Everyone   
3  215644.0  25.0  50000000  Free    0.0           Teen   
4     967.0   2.8    100000  Free    0.0       Everyone   

                      Genres Last Updated         Current Ver   Android Ver  
0      

In [7]:
import plotly.express as px

# Load cleaned data
df = pd.read_csv("cleaned_data.csv")

# Example: Generate a bar chart and save it as an HTML file.
# Sum installs per category first: passing the raw frame makes plotly draw one
# stacked bar segment per app row, which bloats the HTML file and obscures the
# per-category totals. sort=False keeps categories in order of first appearance.
installs_by_category = df.groupby("Category", as_index=False, sort=False)["Installs"].sum()
fig = px.bar(installs_by_category, x="Category", y="Installs", title="Installs by Category")
fig.write_html("installs_by_category.html")
print("✅ Graph saved as 'installs_by_category.html'")


✅ Graph saved as 'installs_by_category.html'
