In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from datetime import datetime
import pytz
import matplotlib.font_manager as fm

In [20]:
# Folder that holds the cleaned CSV exports. Kept in one place so the notebook
# can be pointed at another machine/location by editing a single line.
DATA_DIR = "E:/Data Analyst Intern Tasks/Jupyter/Cleaned Datasets"

# App-level table and user-review table; both are joined on `App` further down.
apps_df = pd.read_csv(f"{DATA_DIR}/Cleaned_GooglePlaystore.csv")
reviews_df = pd.read_csv(f"{DATA_DIR}/Cleaned_UserReviews.csv")

In [21]:
def clean_size(size_str):
    """Convert a Play Store size value to a number of megabytes.

    Parameters
    ----------
    size_str : str, int, float or missing value
        Raw size such as ``"19M"``, ``"512k"``, ``"Varies with device"``,
        an already-numeric value, or NaN/None.

    Returns
    -------
    float
        Size in MB. Values containing ``k`` are divided by 1024. Numeric
        inputs are returned unchanged. ``np.nan`` is returned for missing
        values, for ``"Varies with device"`` and for any text that cannot be
        parsed as a number (instead of raising).
    """
    if pd.isna(size_str):
        return np.nan
    if isinstance(size_str, (int, float)):
        # Already numeric (e.g. the cell was re-run): pass through untouched.
        return size_str

    text = str(size_str).strip()
    if text == 'Varies with device':
        return np.nan

    # Pick the unit divisor, then strip the unit marker before parsing.
    if 'M' in text:
        number_part, divisor = text.replace('M', ''), 1.0
    elif 'k' in text:
        number_part, divisor = text.replace('k', ''), 1024.0
    else:
        number_part, divisor = text, 1.0

    try:
        return float(number_part) / divisor
    except ValueError:
        # Unparseable text (e.g. a stray word containing 'M' or 'k') becomes
        # NaN so one bad cell cannot abort a whole `.apply` over the column.
        return np.nan


In [22]:
# Normalise the two columns the chart depends on: `Size` goes through
# clean_size, and `Reviews` is coerced to numeric with unparseable entries
# turned into NaN.
apps_df = apps_df.assign(
    Size=apps_df['Size'].apply(clean_size),
    Reviews=pd.to_numeric(apps_df['Reviews'], errors='coerce'),
)

In [23]:
# Inner join on the app name: only apps present in both tables survive.
merged_data = apps_df.merge(reviews_df, how='inner', on='App')

In [24]:
# This graph is available only in the evening, between 5 PM and 7 PM IST (17:00–19:00).

In [25]:
def filter_data(df, now=None):
    """Select the rows that qualify for the evening bubble chart.

    The chart is only meant to be shown between 17:00 and 19:00 IST (both
    bounds inclusive). Outside that window no rows are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Rating``, ``Category``, ``Reviews``,
        ``App``, ``Sentiment_Subjectivity`` and ``Installs``.
    now : datetime.datetime or datetime.time, optional
        Wall-clock time, already expressed in IST, to test against the
        window. When omitted, the current time in ``Asia/Kolkata`` is used.
        A ``datetime.time`` must be naive (no ``tzinfo``). Mainly useful for
        reproducing the chart or testing outside the window.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows. When the time is outside
        the window the result is an empty frame that keeps ``df``'s columns.
    """
    if now is None:
        now = datetime.now(pytz.timezone('Asia/Kolkata'))
    # Only the time of day matters; a datetime is reduced to its naive time.
    current_time = now.time() if isinstance(now, datetime) else now

    window_start = datetime.strptime('17:00:00', '%H:%M:%S').time()
    window_end = datetime.strptime('19:00:00', '%H:%M:%S').time()
    if not (window_start <= current_time <= window_end):
        # Empty, but with the same columns as the input.
        return df.iloc[0:0].copy()

    allowed_categories = [
        'GAME', 'BEAUTY', 'BUSINESS', 'COMICS', 'COMMUNICATION',
        'DATING', 'ENTERTAINMENT', 'SOCIAL', 'EVENTS'
    ]

    # Apps whose name contains the letter "s" (any case) are excluded.
    # na=True treats a missing name as "contains", so such rows are dropped
    # instead of making the `~` negation fail on NaN.
    name_has_s = df['App'].str.contains('s', case=False, na=True)

    # NOTE(review): the comparisons below assume `Installs` (like `Rating` and
    # `Reviews`) is already numeric in the cleaned dataset - confirm.
    keep = (
        (df['Rating'] > 3.5) &
        (df['Category'].isin(allowed_categories)) &
        (df['Reviews'] > 500) &
        (~name_has_s) &
        (df['Sentiment_Subjectivity'] > 0.5) &
        (df['Installs'] > 50000)
    )
    return df[keep].copy()

# Apply the chart filters to the merged frame. The result is empty when the
# current IST time is outside filter_data's window or when no row matches.
filtered_data = filter_data(merged_data)

In [26]:
if not filtered_data.empty:
    # Font setup. DejaVu Sans stays first so Latin text looks as before; any
    # installed font from the rest of the list is appended so that Matplotlib
    # versions with per-glyph font fallback can draw the Hindi/Tamil labels.
    # Only installed families are kept, to avoid "font not found" warnings.
    # NOTE(review): confirm the Devanagari/Tamil text renders correctly on the
    # target machine; this depends on the installed fonts and Matplotlib version.
    preferred_fonts = [
        'DejaVu Sans', 'Nirmala UI', 'Noto Sans Devanagari',
        'Noto Sans Tamil', 'Mangal', 'Latha'
    ]
    installed_fonts = {entry.name for entry in fm.fontManager.ttflist}
    plt.rcParams['font.family'] = (
        [name for name in preferred_fonts if name in installed_fonts]
        or ['DejaVu Sans']
    )

    # Translate categories
    category_translations = {
        'BEAUTY': 'सौंदर्य (Beauty)',     # Hindi
        'BUSINESS': 'வணிகம் (Business)',  # Tamil
        'DATING': 'Partnersuche (Dating)' # German
    }
    # NOTE(review): `Category_Display` is stored on the frame but is not
    # referenced by any plotting call below; only the footer mentions the
    # translations.
    filtered_data['Category_Display'] = filtered_data['Category'].replace(category_translations)

    # Color mapping: games in pink, every other category in sky blue.
    colors = np.where(filtered_data['Category'] == 'GAME', 'pink', 'skyblue').tolist()

    # Plot. A single layout mechanism (tight_layout, applied at the end) is
    # used; combining it with constrained_layout makes Matplotlib warn and
    # discard one of the two.
    fig, ax = plt.subplots(figsize=(16, 12))
    ax.scatter(
        x=filtered_data['Size'],
        y=filtered_data['Rating'],
        s=filtered_data['Installs'] / 10000,  # bubble area scales with installs
        c=colors,
        alpha=0.6,
        edgecolors='grey',
        linewidth=0.5
    )

    # Title and labels
    ax.set_title('App Size vs Rating (Bubble Size = Installs)\nFiltered by: Rating > 3.5, Reviews > 500, Installs > 50K',
                 pad=20, fontsize=14)
    ax.set_xlabel('App Size (MB)', fontsize=12)
    ax.set_ylabel('Average Rating', fontsize=12)

    # Legend
    game_patch = mpatches.Patch(color='pink', label='Game')
    other_patch = mpatches.Patch(color='skyblue', label='Other Categories')
    ax.legend(handles=[game_patch, other_patch], title='Categories', loc='upper right')

    # Informative footer
    fig.text(0.5, 0.01,
             "Note: Visible only between 5PM–7PM IST | सौंदर्य=Beauty | வணிகம்=Business | Partnersuche=Dating",
             ha="center", fontsize=10, bbox={"facecolor":"orange", "alpha":0.5, "pad":5})

    # Annotate large bubbles. Each distinct (app, size, rating) point is
    # labelled once, and points without finite coordinates are skipped, so
    # repeated rows for the same app do not stack identical labels.
    labelled_points = (
        filtered_data.loc[filtered_data['Installs'] > 1000000, ['App', 'Size', 'Rating']]
        .dropna(subset=['Size', 'Rating'])
        .drop_duplicates()
    )
    for app_name, size_mb, rating in labelled_points.itertuples(index=False, name=None):
        ax.annotate(
            app_name,
            (size_mb, rating),
            textcoords="offset points",
            xytext=(0, 5),
            ha='center',
            fontsize=8
        )

    ax.grid(True, alpha=0.3)
    # Leave a strip at the bottom of the figure for the footer text.
    fig.tight_layout(rect=(0, 0.04, 1, 1))
    plt.show()
else:
    # Either the time gate in filter_data rejected the request or the filters
    # matched nothing; both cases produce an empty frame.
    current_time = datetime.now(pytz.timezone('Asia/Kolkata')).strftime('%H:%M:%S')
    print(f"⏰ Current time is {current_time} IST. The chart is visible only between 17:00 and 19:00 IST.")
    print("Or no data matches the specified filters.")


⏰ Current time is 20:03:20 IST. The chart is visible only between 17:00 and 19:00 IST.
Or no data matches the specified filters.
