In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import pytz

# Read the raw Play Store export into a DataFrame.
apps_df = pd.read_csv(r"E:\NULLCLASS\Play Store Data.csv")

# Discard every row that lacks a value in any column the analysis relies on.
# The drop is done in place so that the column assignments below operate on
# the original frame object.
required_columns = ['Installs', 'Type', 'Category', 'Android Ver', 'Size', 'Content Rating', 'App', 'Price']
apps_df.dropna(subset=required_columns, inplace=True)

# 'Installs' arrives as text: strip the ',' separators and the '+' character,
# then cast what remains to an integer count.
installs_text = apps_df['Installs'].str.replace(',', '')
installs_text = installs_text.str.replace('+', '', regex=False)
apps_df['Installs'] = installs_text.astype(int)

# 'Price' arrives as text: strip the '$' sign (when present) and cast to float.
price_text = apps_df['Price'].str.replace('$', '', regex=False)
apps_df['Price'] = price_text.astype(float)

# Revenue is computed as the listed price multiplied by the install count.
apps_df['Revenue'] = apps_df['Installs'] * apps_df['Price']

# Convert 'Size' to numeric (handling M and K values)
def convert_size(size):
    """Convert a textual app size such as ``"19M"`` or ``"201k"`` to megabytes.

    The unit is taken from the last character and matched case-insensitively,
    so both ``"K"`` and ``"k"`` are treated as kilobytes. Thousands separators
    (``","``) and surrounding whitespace are ignored.

    Args:
        size: Size label ending in ``M`` (megabytes) or ``K``/``k`` (kilobytes).

    Returns:
        The size in megabytes as a float, or ``None`` when ``size`` is not a
        string, carries no recognised unit suffix, or its numeric part cannot
        be parsed.
    """
    if not isinstance(size, str):
        return None

    text = size.strip().replace(',', '')
    if not text:
        return None

    # Only the trailing character is the unit; a substring test would also
    # match letters that merely appear somewhere inside the label.
    unit = text[-1].upper()
    if unit == 'M':
        divisor = 1
    elif unit == 'K':
        divisor = 1000  # Convert KB to MB
    else:
        return None  # No recognised unit: size is not available

    try:
        value = float(text[:-1])
    except ValueError:
        # Keep the "unavailable -> None" contract for malformed numbers too.
        return None
    return value / divisor

# Normalise 'Size' to megabytes; any value that is not a string becomes missing.
apps_df['Size'] = apps_df['Size'].apply(lambda x: convert_size(x) if isinstance(x, str) else None)

# Leading "major.minor" part of the 'Android Ver' text, as a float.
# expand=False makes str.extract return a Series directly, so the mask below
# stays a Series even when the frame holds a single row (a one-column
# DataFrame followed by .squeeze() would collapse to a scalar in that case).
# Rows whose text has no "digits.digits" pattern become NaN and fail the
# comparison.
# NOTE(review): only the first two version components are captured, so a
# value like "4.0.3" compares as 4.0 and is excluded by the strict '>' below.
android_version = apps_df['Android Ver'].str.extract(r'(\d+\.\d+)', expand=False).astype(float)

# Apply filters
filtered_apps = apps_df[
    (apps_df['Installs'] >= 10000) &
    (apps_df['Revenue'] >= 10000) &
    (android_version > 4.0) &
    (apps_df['Size'] > 15) &
    (apps_df['Content Rating'] == 'Everyone') &
    (apps_df['App'].str.len() <= 30)
]

# Get top 3 categories by number of apps (among the rows that passed the filters)
top_categories = filtered_apps['Category'].value_counts().head(3).index

# Filter data for top 3 categories
filtered_apps = filtered_apps[filtered_apps['Category'].isin(top_categories)]

# Group by Category and Type, calculating average installs & revenue per group
category_summary = filtered_apps.groupby(['Category', 'Type']).agg(
    Avg_Installs=('Installs', 'mean'),
    Avg_Revenue=('Revenue', 'mean')
).reset_index()

# Restrict execution to 1 PM - 2 PM IST
# The window bounds are derived from the current Asia/Kolkata time, so they
# always refer to today's date in that timezone.
ist = pytz.timezone('Asia/Kolkata')
current_time = datetime.now(ist)
allowed_start = current_time.replace(hour=13, minute=0, second=0, microsecond=0)
allowed_end = current_time.replace(hour=14, minute=0, second=0, microsecond=0)

if allowed_start <= current_time <= allowed_end:
    # Plot dual-axis chart: bars on the left y-axis, lines on the right y-axis
    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Bar chart for average installs
    sns.barplot(data=category_summary, x='Category', y='Avg_Installs', hue='Type', ax=ax1, alpha=0.7, palette='coolwarm')
    ax1.set_ylabel('Average Installs')
    ax1.set_xlabel('Category')
    ax1.set_title('Comparison of Average Installs and Revenue (Free vs Paid)')

    # Line chart for average revenue on a second y-axis sharing the same x-axis
    ax2 = ax1.twinx()
    sns.lineplot(data=category_summary, x='Category', y='Avg_Revenue', hue='Type', marker='o', ax=ax2, linestyle='dashed', palette='coolwarm')
    ax2.set_ylabel('Average Revenue ($)')

    # Show legend
    ax1.legend(loc='upper left', title="Type (Bar)")
    ax2.legend(loc='upper right', title="Type (Line)")

    # Rotate the category labels on ax1, which owns the visible x-axis.
    # plt.xticks() would act on the current axes, which is the twin created
    # last; its x tick labels are hidden, so the rotation would not show.
    ax1.tick_params(axis='x', labelrotation=45)

    # Horizontal grid lines follow the revenue (right-hand) axis.
    ax2.grid(axis='y', linestyle='--', alpha=0.5)
    plt.show()
else:
    print("The chart is only visible between 1 PM - 2 PM IST.")


The chart is only visible between 1 PM - 2 PM IST.
