In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from datetime import datetime, time
import pytz
import warnings
warnings.filterwarnings('ignore')  

In [2]:
# Time Validation Function
def is_within_time_window():
    """Tell whether the current time of day in India is inside the 18:00-21:00 slot.

    The clock is read in the ``Asia/Kolkata`` timezone and only the
    time-of-day part is compared; both ends of the slot are inclusive.

    Returns:
        bool: True when 18:00 <= current India time <= 21:00, else False.
    """
    window_opens, window_closes = time(18, 0), time(21, 0)

    now_in_india = datetime.now(pytz.timezone('Asia/Kolkata'))
    time_of_day = now_in_india.time()

    # Guard clause for "too early"; what remains is the upper-bound check.
    if time_of_day < window_opens:
        return False
    return time_of_day <= window_closes

In [3]:
# Data Loading and Preparation
def load_and_prepare_data(
    apps_path="E:/Data Analyst Intern Tasks/Jupyter/Cleaned Datasets/Cleaned_GooglePlaystore.csv",
    reviews_path="E:/Data Analyst Intern Tasks/Jupyter/Cleaned Datasets/Cleaned_UserReviews.csv",
):
    """Load the apps and reviews CSV files and derive a monthly period column.

    Args:
        apps_path: Anything ``pandas.read_csv`` accepts (path, URL or
            file-like object) for the apps table. Defaults to the location
            that used to be hard-coded, so existing zero-argument calls
            behave as before.
        reviews_path: Same, for the user-reviews table.

    Returns:
        tuple: ``(apps_df, reviews_df)``.

        ``apps_df`` has 'Last Updated' parsed as datetimes (input text is
        expected in day-month-year form, ``%d-%m-%Y``), rows whose date
        could not be parsed removed, and a 'YearMonth' column holding the
        monthly period of 'Last Updated'. The original row index is kept.
        ``reviews_df`` is returned exactly as read.
    """
    apps_raw = pd.read_csv(apps_path)
    reviews_df = pd.read_csv(reviews_path)

    # errors='coerce' turns unparsable dates into NaT so they can be dropped below.
    last_updated = pd.to_datetime(
        apps_raw['Last Updated'], format='%d-%m-%Y', errors='coerce'
    )

    # Explicit copy: the frame we add 'YearMonth' to is independent of apps_raw,
    # so the assignment below can never be a write onto a view.
    apps_df = (
        apps_raw
        .assign(**{'Last Updated': last_updated})
        .dropna(subset=['Last Updated'])
        .copy()
    )
    apps_df['YearMonth'] = apps_df['Last Updated'].dt.to_period('M')

    return apps_df, reviews_df

In [4]:
# Data Filtering
def filter_apps(apps_df, reviews_df):
    """Return the rows of ``apps_df`` that satisfy every selection rule.

    A row is kept only when all of the following hold:

    * 'App' is present and does not start with x, y or z (any case);
    * 'App' does not contain the letter s (any case);
    * 'Category' starts with E, C or B (any case);
    * 'Reviews' is greater than 500;
    * 'Installs' is greater than 0.

    Args:
        apps_df: Apps table with 'App', 'Category', 'Reviews' and 'Installs'
            columns. 'Reviews' and 'Installs' are compared with ``>``, so
            they are assumed to be numeric already -- TODO confirm against
            the cleaned dataset.
        reviews_df: Accepted for backward compatibility only. The previous
            implementation merged a per-app mean of 'Sentiment' into a frame
            that was never used; that merge has been removed because it had
            no effect on the result and raised on non-numeric sentiment.

    Returns:
        pandas.DataFrame: An independent copy of the matching rows, with the
        original index labels.
    """
    app_name = apps_df['App']

    # na=False makes every .str test yield a plain boolean Series; without it a
    # missing name produces NaN and the `~` negation below raises TypeError.
    has_name = app_name.notna()
    starts_with_xyz = app_name.str.lower().str.startswith(('x', 'y', 'z'), na=False)
    contains_s = app_name.str.contains('S', case=False, regex=False, na=False)
    category_ecb = apps_df['Category'].str.upper().str.startswith(('E', 'C', 'B'), na=False)

    keep = (
        has_name
        & ~starts_with_xyz
        & ~contains_s
        & category_ecb
        & (apps_df['Reviews'] > 500)
        & (apps_df['Installs'] > 0)  # exclude apps with 0 installs
    )

    return apps_df[keep].copy()

In [5]:
# Category Translation
def translate_categories(df):
    """Add a 'Translated_Category' column with localized category labels.

    Category names are matched case-insensitively, so 'Beauty', 'BEAUTY' and
    'beauty' all receive the same label. Categories without an entry in the
    table, missing values and non-string values are carried over unchanged.

    Args:
        df: DataFrame with a 'Category' column. It is modified in place: the
            'Translated_Category' column is added (or overwritten).

    Returns:
        pandas.DataFrame: The same ``df`` object that was passed in.
    """
    category_translations = {
        'Beauty': 'सौंदर्य',
        'Business': 'வணிகம்',
        'Dating': 'Dating'
    }
    # Case-folded lookup so matching does not depend on how the dataset
    # capitalizes its category names.
    lookup = {name.casefold(): label for name, label in category_translations.items()}

    def _translate(value):
        """Return the label for ``value``, or ``value`` itself when none applies."""
        if isinstance(value, str):
            return lookup.get(value.casefold(), value)
        return value

    df['Translated_Category'] = df['Category'].map(_translate)
    return df

In [6]:
# Time Series Analysis
def prepare_time_series_data(filtered_df):
    """Build a monthly installs table per category and its month-over-month growth.

    Args:
        filtered_df: DataFrame with 'Translated_Category', 'YearMonth'
            (monthly periods) and 'Installs' columns.

    Returns:
        tuple: ``(ts_data, growth, significant_growth)``.

        * ``ts_data`` -- summed installs, one row per month from the first to
          the last month present, one column per category. Months with no
          data for a category hold 0.
        * ``growth`` -- fractional change from the previous month. It is NaN
          for the first month and wherever the previous month is 0, because
          a percentage change from a zero base is undefined (it would
          otherwise come out as ``inf``).
        * ``significant_growth`` -- boolean frame, True where ``growth``
          exceeds 0.2 (20%).

        For empty input three empty DataFrames are returned.
    """
    if filtered_df.empty:
        no_data = pd.DataFrame()
        return no_data, no_data.copy(), no_data.astype(bool)

    ts_data = (
        filtered_df
        .groupby(['Translated_Category', 'YearMonth'])['Installs']
        .sum()
        .unstack(level=0)
    )
    if ts_data.empty:
        # e.g. every row had a missing category or month
        return ts_data, ts_data.copy(), ts_data.astype(bool)

    # Insert any month missing between the first and last observation, then
    # fill the gaps (and category/month combinations without data) with 0.
    # Reindexing over an explicit period range avoids resampling a PeriodIndex.
    all_months = pd.period_range(
        ts_data.index.min(), ts_data.index.max(), freq='M', name=ts_data.index.name
    )
    ts_data = ts_data.reindex(all_months).fillna(0)

    # No implicit padding is needed: gaps were already filled with 0 above.
    growth = ts_data.pct_change(fill_method=None)
    # Blank out changes measured from a zero base so a gap month followed by
    # data is not reported as infinite growth.
    growth = growth.where(ts_data.shift(1) != 0)
    significant_growth = growth > 0.2

    return ts_data, growth, significant_growth

In [7]:
# Visualization
def create_visualization(ts_data, significant_growth, growth=None):
    """Plot installs per category over time and highlight significant growth.

    One line is drawn per column of ``ts_data`` on a log-scaled y axis. For
    every month flagged in ``significant_growth`` the segment from the
    previous month is shaded, and the month is labelled with its percentage
    growth when the value plotted is positive and the growth is finite.

    Args:
        ts_data: DataFrame indexed by periods (``index.to_timestamp()`` is
            used for the x axis) with one column of install totals per
            category.
        significant_growth: Boolean DataFrame with the same columns and row
            order as ``ts_data``; True marks a month to highlight.
        growth: Optional DataFrame of fractional month-over-month change with
            the same columns and row order as ``ts_data``, used for the
            labels. When omitted it is computed from ``ts_data`` with
            ``pct_change``, so the function no longer depends on a
            module-level ``growth`` variable.

    Returns:
        module: ``matplotlib.pyplot``, so callers can invoke ``.show()`` on it.
    """
    if growth is None:
        growth = ts_data.pct_change(fill_method=None)

    fig, ax = plt.subplots(figsize=(16, 8))

    categories = list(ts_data.columns)
    # One colour per category, spread over the tab20 palette.
    colors = plt.cm.tab20(np.linspace(0, 1, len(categories)))
    # The x coordinates are shared by all categories, so convert them once.
    x = ts_data.index.to_timestamp() if categories else []

    for color, category in zip(colors, categories):
        y = ts_data[category]
        ax.plot(x, y, label=category, marker='o',
                color=color, linewidth=2.5, markersize=8)

        flagged = significant_growth[category].to_numpy()
        pct = growth[category].to_numpy()

        for i in range(1, len(significant_growth)):
            if not flagged[i]:
                continue

            # Shade the segment leading into the flagged month. Positional
            # access (.iloc) is used because the index holds periods, not
            # integers.
            ax.fill_between(
                x[i - 1:i + 1],
                y.iloc[i - 1:i + 1],
                color=color,
                alpha=0.2,
                edgecolor=color,
                linewidth=0.5
            )

            # Label only points that can appear on a log axis and whose
            # growth is a real number (a zero base gives inf).
            if y.iloc[i] > 0 and np.isfinite(pct[i]):
                ax.annotate(f"+{pct[i] * 100:.0f}%",
                            xy=(x[i], y.iloc[i]),
                            xytext=(5, 5), textcoords='offset points',
                            color=color, fontweight='bold')

    # Titles, labels and scale
    ax.set_title('Trend of Total Installs by Category\n(Highlighting >20% Month-over-Month Growth)',
                 fontsize=14, pad=20)
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Total Installs (log scale)', fontsize=12)
    ax.set_yscale('log')

    # Grid and spines
    ax.grid(True, which='both', linestyle='--', linewidth=0.5, alpha=0.7)
    for spine in ['top', 'right']:
        ax.spines[spine].set_visible(False)

    # Legend placed outside the plotting area
    legend = ax.legend(title='Category', bbox_to_anchor=(1.05, 1),
                       loc='upper left', framealpha=1)
    plt.setp(legend.get_title(), fontweight='bold')

    # Rotate and align x-axis labels
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    fig.tight_layout()

    # Watermark: the label says IST, so read the clock in that timezone
    # rather than in the machine's local time.
    generated_at = datetime.now(pytz.timezone('Asia/Kolkata'))
    ax.text(0.95, 0.02, f"Generated at {generated_at.strftime('%Y-%m-%d %H:%M')} IST",
            transform=ax.transAxes, fontsize=8, color='gray', alpha=0.7,
            ha='right', va='bottom')

    return plt

In [8]:
#  Main Execution Flow
if __name__ == "__main__":
    if not is_within_time_window():
        # Outside the permitted slot: report the current India time and skip the plot.
        current_time = datetime.now(pytz.timezone('Asia/Kolkata')).strftime('%H:%M')
        print(f"Current time is {current_time} IST. Graph display is only available between 18:00-21:00 IST.")
    else:
        print("Processing data...")

        # Load -> filter -> translate -> aggregate. The result names are kept
        # at module level unchanged, since other cells or functions may look
        # them up by name.
        apps_df, reviews_df = load_and_prepare_data()
        filtered_apps = filter_apps(apps_df, reviews_df)
        translated_apps = translate_categories(filtered_apps)
        ts_data, growth, significant_growth = prepare_time_series_data(translated_apps)

        # Render and display the chart.
        plot = create_visualization(ts_data, significant_growth)
        plot.show()

Current time is 01:07 IST. Graph display is only available between 18:00-21:00 IST.
