In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import pytz

# Read the pre-cleaned apps CSV into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer="cleaned_apps_data.csv")

# Confirmation banner followed by a preview of the first five rows
print("✅ Cleaned data loaded successfully!", df.head(), sep="\n")


✅ Cleaned data loaded successfully!
                                                 App        Category  Rating  \
0     Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN     4.1   
1                                Coloring book moana  ART_AND_DESIGN     3.9   
2  U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN     4.7   
3                              Sketch - Draw & Paint  ART_AND_DESIGN     4.5   
4              Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN     4.3   

   Reviews  Size  Installs  Type  Price Content Rating  \
0      159  19.0     10000  Free    0.0       Everyone   
1      967  14.0    500000  Free    0.0       Everyone   
2    87510   8.7   5000000  Free    0.0       Everyone   
3   215644  25.0  50000000  Free    0.0           Teen   
4      967   2.8    100000  Free    0.0       Everyone   

                      Genres Last Updated         Current Ver   Android Ver  \
0               Art & Design   2018-01-07              

In [3]:
# Snapshot the freshly loaded frame so the original load stays available.
# NOTE(review): the later cells visible here keep operating on `df`, not
# `apps_df` — confirm which frame downstream analysis is meant to use.
apps_df = df.copy()

# DataFrame.info() writes its summary to stdout itself and returns None;
# wrapping it in print() only appends a stray "None" line to the output.
apps_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8892 entries, 0 to 8891
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             8892 non-null   object 
 1   Category        8892 non-null   object 
 2   Rating          8892 non-null   float64
 3   Reviews         8892 non-null   int64  
 4   Size            7424 non-null   float64
 5   Installs        8892 non-null   int64  
 6   Type            8892 non-null   object 
 7   Price           8892 non-null   float64
 8   Content Rating  8892 non-null   object 
 9   Genres          8892 non-null   object 
 10  Last Updated    8892 non-null   object 
 11  Current Ver     8892 non-null   object 
 12  Android Ver     8892 non-null   object 
 13  Log_Installs    8892 non-null   float64
 14  Log_Reviews     8892 non-null   float64
 15  Rating_Group    8892 non-null   object 
 16  Revenue         8892 non-null   float64
 17  Year            8892 non-null   i

In [None]:
# Ensure consistent types on `df` before filtering:
#   - Rating       -> numeric (unparseable values become NaN)
#   - Size_M       -> numeric size derived from Size, with any 'M' suffix removed
#   - Last Updated -> datetime (unparseable values become NaT)
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Strip a trailing/embedded 'M' and let to_numeric(errors='coerce') turn every
# non-numeric token ('Varies with device', 'nan', 'None', ...) into NaN.
# A separate `.replace([...], None)` step is deliberately NOT used: its result
# is pandas-version dependent, and on older releases value=None falls back to
# pad-filling, which would silently copy the previous row's size into rows
# whose size is actually missing.
size_text = df['Size'].astype(str).str.replace('M', '', regex=False)
df['Size_M'] = pd.to_numeric(size_text, errors='coerce')

df['Last Updated'] = pd.to_datetime(df['Last Updated'], errors='coerce')

# info() prints directly and returns None, so it is not wrapped in print().
df.info()

# Last expression of the cell so the side-by-side preview is actually rendered.
df[['Size', 'Size_M']].head(10)




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8892 entries, 0 to 8891
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             8892 non-null   object        
 1   Category        8892 non-null   object        
 2   Rating          8892 non-null   float64       
 3   Reviews         8892 non-null   int64         
 4   Size            7424 non-null   float64       
 5   Installs        8892 non-null   int64         
 6   Type            8892 non-null   object        
 7   Price           8892 non-null   float64       
 8   Content Rating  8892 non-null   object        
 9   Genres          8892 non-null   object        
 10  Last Updated    8892 non-null   datetime64[ns]
 11  Current Ver     8892 non-null   object        
 12  Android Ver     8892 non-null   object        
 13  Log_Installs    8892 non-null   float64       
 14  Log_Reviews     8892 non-null   float64       
 15  Rati

In [8]:
# Keep only rows that satisfy all three conditions:
#   rating of at least 4.0, Size_M of at least 10, and last updated in January.
# Rows with NaN/NaT in any of these columns compare False and are dropped.
filtered_df = df.loc[
    df['Rating'].ge(4.0)
    & df['Size_M'].ge(10)
    & df['Last Updated'].dt.month.eq(1)
]

In [11]:
# Per-category summary of the filtered apps:
# mean rating, total review count, and total install count.
agg_df = (
    filtered_df
    .groupby('Category')
    .agg(
        Rating=('Rating', 'mean'),
        Reviews=('Reviews', 'sum'),
        Installs=('Installs', 'sum'),
    )
    .reset_index()
)


In [12]:
# Rank categories by total installs (largest first) and keep the first ten
top_10 = (
    agg_df
    .sort_values(by='Installs', ascending=False)
    .iloc[:10]
)
print(top_10)

            Category    Rating  Reviews   Installs
7             FAMILY  4.395455  4544623  182494820
19            SPORTS  4.342857  1982017  120511000
10              GAME  4.313333  2397589  115691000
5      ENTERTAINMENT  4.300000   869111   21000000
14   PERSONALIZATION  4.475000   155996   15060000
15       PHOTOGRAPHY  4.150000   563720   10500000
11         LIFESTYLE  4.380000    42809    5071000
4          EDUCATION  4.400000    57645    2000000
20             TOOLS  4.200000     8010    1010000
21  TRAVEL_AND_LOCAL  4.100000      974    1001000


In [13]:
# ------------------------------------------------------------
# Current wall-clock hour in India Standard Time (Asia/Kolkata)
# ------------------------------------------------------------
ist = pytz.timezone('Asia/Kolkata')
current_time = datetime.now(tz=ist)  # timezone-aware "now" in IST
current_hour = current_time.hour     # integer hour of day, 0-23

In [14]:
# Render the grouped bar chart only while the IST hour is 15 or 16
# (i.e. from 3 PM up to, but not including, 5 PM); otherwise print a notice.
if 15 <= current_hour < 17:
    fig, ax = plt.subplots(figsize=(12, 6))

    # Average rating is on a 0-5 scale while total reviews reach the millions.
    # On a single shared y-axis the rating bars collapse to an invisible
    # sliver, so the review totals get their own right-hand axis.
    ax_reviews = ax.twinx()

    x = range(len(top_10))
    width = 0.35

    # Left bar of each pair: average rating (left y-axis)
    rating_bars = ax.bar(
        [i - width/2 for i in x], top_10['Rating'],
        width=width, color='tab:blue', label='Average Rating'
    )
    # Right bar of each pair: total reviews (right y-axis)
    review_bars = ax_reviews.bar(
        [i + width/2 for i in x], top_10['Reviews'],
        width=width, color='tab:orange', label='Total Reviews'
    )

    ax.set_xticks(list(x))
    ax.set_xticklabels(top_10['Category'], rotation=45, ha='right')
    ax.set_title('Average Rating vs Total Reviews for Top 10 App Categories (Filtered)')
    ax.set_ylabel('Average Rating')
    ax_reviews.set_ylabel('Total Reviews')

    # The bars live on two different axes, so gather both handles explicitly
    # to show them in one combined legend.
    handles = [rating_bars, review_bars]
    ax.legend(handles, [h.get_label() for h in handles], loc='upper right')

    plt.tight_layout()
    plt.show()

else:
    print("Chart is not available. It can only be displayed between 3 PM and 5 PM IST.")


Chart is not available. It can only be displayed between 3 PM and 5 PM IST.
