In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from datetime import datetime
import pytz

In [7]:
# Locations of the cleaned Play Store exports on the local machine
apps_csv_path = "E:/Data Analyst Intern Tasks/Jupyter/Cleaned Datasets/Cleaned_GooglePlaystore.csv"
reviews_csv_path = "E:/Data Analyst Intern Tasks/Jupyter/Cleaned Datasets/Cleaned_UserReviews.csv"

# Read the app listing table and the user-review table into DataFrames
apps_df = pd.read_csv(apps_csv_path)
reviews_df = pd.read_csv(reviews_csv_path)

In [9]:
# Data cleaning and preprocessing for Size column
def clean_size(size_str):
    """Convert a raw ``Size`` value into a number of megabytes.

    Parameters
    ----------
    size_str : str, int, float or None
        Raw value such as ``'19M'``, ``'201k'``, ``'Varies with device'``,
        an already-numeric value, or a missing value.

    Returns
    -------
    float or int
        Numeric inputs are returned unchanged. ``'<n>M'`` yields ``n``,
        ``'<n>k'`` yields ``n / 1024``, and a bare number string is parsed
        as-is. Missing values, ``'Varies with device'`` and any text that
        cannot be parsed as a number yield ``np.nan`` instead of raising.
    """
    if pd.isna(size_str):
        return np.nan
    if isinstance(size_str, (int, float)):
        return size_str

    # Coerce to text so non-str scalars (e.g. numpy integers) do not crash
    # the suffix checks below.
    text = str(size_str).strip()
    if text == 'Varies with device':
        return np.nan

    # Look at the unit *suffix* only; a substring test would also match a
    # stray 'M' or 'k' elsewhere in a malformed value.
    if text.endswith('M'):
        number, divisor = text[:-1], 1
    elif text.endswith('k'):
        number, divisor = text[:-1], 1024
    else:
        number, divisor = text, 1

    try:
        return float(number) / divisor
    except ValueError:
        # Malformed entries (e.g. '1,000+') become missing rather than
        # aborting the whole column conversion.
        return np.nan

# Normalise every raw Size entry to megabytes, element by element
apps_df['Size'] = apps_df['Size'].map(clean_size)

In [None]:
# This graph is only available in the evening, between 5 PM and 7 PM IST (17:00-19:00).

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from datetime import datetime
import pytz
import matplotlib.font_manager as fm

# Read the cleaned app listing and user-review tables from disk, in that order
apps_df, reviews_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "E:/Data Analyst Intern Tasks/Jupyter/Cleaned Datasets/Cleaned_GooglePlaystore.csv",
        "E:/Data Analyst Intern Tasks/Jupyter/Cleaned Datasets/Cleaned_UserReviews.csv",
    )
)

# Data cleaning and preprocessing for Size column
def clean_size(size_str):
    """Convert a raw ``Size`` value into a number of megabytes.

    Parameters
    ----------
    size_str : str, int, float or None
        Raw value such as ``'19M'``, ``'201k'``, ``'Varies with device'``,
        an already-numeric value, or a missing value.

    Returns
    -------
    float or int
        Numeric inputs are returned unchanged. ``'<n>M'`` yields ``n``,
        ``'<n>k'`` yields ``n / 1024``, and a bare number string is parsed
        as-is. Missing values, ``'Varies with device'`` and any text that
        cannot be parsed as a number yield ``np.nan``.
    """
    if pd.isna(size_str):
        return np.nan
    if isinstance(size_str, (int, float)):
        return size_str

    text = str(size_str).strip()
    if text == 'Varies with device':
        return np.nan

    # Decide the unit from the suffix only; a substring test would also
    # match a stray 'M' or 'k' inside a malformed value.
    if text.endswith('M'):
        number, divisor = text[:-1], 1
    elif text.endswith('k'):
        number, divisor = text[:-1], 1024
    else:
        number, divisor = text, 1

    # One guarded parse for every branch, so a malformed '...M' / '...k'
    # value is treated the same as any other unparsable text. Only
    # ValueError is caught: unrelated errors should still surface.
    try:
        return float(number) / divisor
    except ValueError:
        return np.nan

# Normalise every raw Size entry to megabytes, element by element
apps_df['Size'] = apps_df['Size'].map(clean_size)

# Force Reviews to a numeric dtype; values that cannot be parsed become NaN
apps_df['Reviews'] = pd.to_numeric(apps_df['Reviews'], errors='coerce')

# Keep only apps present in both tables, joined on the app name
merged_data = apps_df.merge(reviews_df, how='inner', on='App')

# Filter conditions
def filter_data(df, current_time=None):
    """Return the rows eligible for the bubble chart, or an empty frame.

    The chart is only available between 17:00 and 19:00 (inclusive) in the
    ``Asia/Kolkata`` time zone; outside that window an empty DataFrame is
    returned regardless of the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Rating``, ``Category``, ``Reviews``,
        ``App``, ``Sentiment_Subjectivity`` and ``Installs``.
    current_time : datetime.time, optional
        Wall-clock time to test against the window. Defaults to the current
        time in ``Asia/Kolkata``. Passing it explicitly makes the result
        reproducible (useful for re-running or testing the notebook).

    Returns
    -------
    pandas.DataFrame
        A copy of the rows that satisfy every filter, or ``pd.DataFrame()``
        when ``current_time`` is outside the window.
    """
    if current_time is None:
        current_time = datetime.now(pytz.timezone('Asia/Kolkata')).time()

    window_start = datetime.strptime('17:00:00', '%H:%M:%S').time()
    window_end = datetime.strptime('19:00:00', '%H:%M:%S').time()
    if not (window_start <= current_time <= window_end):
        return pd.DataFrame()  # Outside the availability window

    allowed_categories = ['GAME', 'BEAUTY', 'BUSINESS', 'COMICS', 'COMMUNICATION',
                          'DATING', 'ENTERTAINMENT', 'SOCIAL', 'EVENTS']

    # na=True marks a missing app name as "contains S", so such rows are
    # dropped by the negation below. Without it the result holds NaN and
    # the unary ~ raises TypeError.
    name_has_s = df['App'].str.contains('S', case=False, na=True)

    keep = (
        (df['Rating'] > 3.5)
        & df['Category'].isin(allowed_categories)
        & (df['Reviews'] > 500)
        & ~name_has_s
        & (df['Sentiment_Subjectivity'] > 0.5)
        & (df['Installs'] > 50000)
    )
    return df[keep].copy()

filtered_data = filter_data(merged_data)

# If data is available (and within time window), create the plot
if not filtered_data.empty:
    # Font families in priority order. Assigning an rcParam never raises for
    # a font that is not installed, so a try/except fallback cannot work;
    # a list lets matplotlib pick the first family it can actually find.
    plt.rcParams['font.family'] = ['Arial Unicode MS', 'DejaVu Sans']

    # Translate category names (using English fallbacks for display issues)
    category_translations = {
        'BEAUTY': 'Beauty (सुंदरता)',  # Hindi with English fallback
        'BUSINESS': 'Business (வணிக)',  # Tamil with English fallback
        'DATING': 'Dating'              # German (same as English)
    }

    # NOTE(review): this column is stored on the frame but not rendered by
    # the chart below — confirm whether it should appear in the legend.
    filtered_data['Category_Display'] = filtered_data['Category'].replace(category_translations)

    # The merge with the reviews table can repeat an app once per matching
    # review row. Draw each app once, and skip rows that have no position
    # or bubble size, so bubbles and labels are not stacked on themselves.
    plot_data = (
        filtered_data
        .drop_duplicates(subset='App')
        .dropna(subset=['Size', 'Rating', 'Installs'])
    )

    # Color mapping - Game in pink, others in blue
    colors = ['pink' if cat == 'GAME' else 'skyblue' for cat in plot_data['Category']]

    # A single layout engine (tight_layout, applied at the end) is used;
    # mixing constrained_layout with tight_layout makes them override each other.
    fig, ax = plt.subplots(figsize=(16, 12))

    # Create bubble chart
    ax.scatter(
        x=plot_data['Size'],
        y=plot_data['Rating'],
        s=plot_data['Installs']/10000,  # Scale down for visibility
        c=colors,
        alpha=0.6,
        edgecolors='grey',
        linewidth=0.5
    )

    # Add labels and title
    ax.set_title('App Size vs Rating (Bubble Size = Installs)\nFiltered: Rating > 3.5, Reviews > 500, Installs > 50k',
                 pad=20, fontsize=14)
    ax.set_xlabel('App Size (MB)', fontsize=12)
    ax.set_ylabel('Average Rating', fontsize=12)

    # Create legend with fixed location to avoid performance issues
    game_patch = mpatches.Patch(color='pink', label='Game')
    other_patch = mpatches.Patch(color='skyblue', label='Other Categories')
    ax.legend(handles=[game_patch, other_patch], title='Categories', loc='upper right')

    # Add informative text about filters (simplified to avoid font issues)
    fig.text(0.5, 0.01,
             "Note: Only shows between 5PM-7PM IST | Beauty=Beauty | Business=Business | Dating=Dating",
             ha="center", fontsize=10, bbox={"facecolor":"orange", "alpha":0.5, "pad":5})

    # Add app names as annotations for larger bubbles (very popular apps only)
    popular = plot_data[plot_data['Installs'] > 1000000]
    for app_name, size_mb, rating in zip(popular['App'], popular['Size'], popular['Rating']):
        ax.annotate(
            app_name,
            (size_mb, rating),
            textcoords="offset points",
            xytext=(0, 5),
            ha='center',
            fontsize=8
        )

    # Show plot; the rect leaves a strip at the bottom for the figure note
    ax.grid(True, alpha=0.3)
    fig.tight_layout(rect=(0, 0.04, 1, 1))
    plt.show()
else:
    # Display message if not in time window or no data matches filters
    current_time = datetime.now(pytz.timezone('Asia/Kolkata')).strftime('%H:%M:%S')
    print(f"Current time is {current_time} IST. Graph only available between 17:00-19:00 IST.")
    print("Or no data matches all the specified filters.")

Current time is 19:01:05 IST. Graph only available between 17:00-19:00 IST.
Or no data matches all the specified filters.
