In [108]:
# Interactive Choropleth Map for Global Installs by Category

'''
Create an interactive Choropleth map using Plotly to visualize global installs by Category. 
Apply filters to show data for only the top 5 app categories and highlight category where the number of installs exceeds 1 million.
The app category should not start with the characters “A,” “C,” “G,” or “S.” 
This graph should work only between 6 PM IST and 8 PM IST; apart from that time, we should not show it in the dashboard itself.

'''

'\nCreate an interactive Choropleth map using Plotly to visualize global installs by Category. \nApply filters to show data for only the top 5 app categories and highlight category where the number of installs exceeds 1 million.\nThe app category should not start with the characters “A,” “C,” “G,” or “S.” \nThis graph should work only between 6 PM IST and 8 PM IST; apart from that time, we should not show it in the dashboard itself.\n\n'

In [109]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [110]:
# Load the two pre-cleaned CSV exports (Play Store apps, then user reviews)
# from the same local folder; the reads happen in that order.
apps_df, reviews_df = (
    pd.read_csv(f"E:/Data Analyst Intern Tasks/Jupyter/{csv_name}")
    for csv_name in ("Cleaned_GooglePlaystore.csv", "Cleaned_UserReviews.csv")
)

In [111]:
# Preview the first rows of the apps table to sanity-check the load.
apps_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,9.21034,5.068904,Top rated app,0.0,2018
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122363,6.874198,Above average,0.0,2018
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,15.424948,11.379508,Top rated app,0.0,2018
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,17.727534,12.281384,Top rated app,0.0,2018
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,11.512925,6.874198,Top rated app,0.0,2018


In [112]:
# Preview the first rows of the reviews table to sanity-check the load.
reviews_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,0.9531
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,0.6597
2,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,0.6249
3,10 Best Foods for You,Best idea us,Positive,1.0,0.3,0.6369
4,10 Best Foods for You,Best way,Positive,1.0,0.3,0.6369


In [113]:
# List the column names available in the apps table.
apps_df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver', 'Log_Installs', 'Log_Reviews', 'Rating_Group', 'Revenue',
       'Year'],
      dtype='object')

In [114]:
# List the column names available in the reviews table.
reviews_df.columns

Index(['App', 'Translated_Review', 'Sentiment', 'Sentiment_Polarity',
       'Sentiment_Subjectivity', 'Sentiment_Score'],
      dtype='object')

In [115]:
# Check the dtype pandas inferred for each apps column (e.g. that Installs is numeric).
apps_df.dtypes

App                object
Category           object
Rating            float64
Reviews             int64
Size              float64
Installs            int64
Type               object
Price             float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
Log_Installs      float64
Log_Reviews       float64
Rating_Group       object
Revenue           float64
Year                int64
dtype: object

In [116]:
# Check the dtype pandas inferred for each reviews column.
reviews_df.dtypes

App                        object
Translated_Review          object
Sentiment                  object
Sentiment_Polarity        float64
Sentiment_Subjectivity    float64
Sentiment_Score           float64
dtype: object

In [117]:
# Count missing values per column in the apps table.
apps_df.isnull().sum()

App                  0
Category             0
Rating               0
Reviews              0
Size              1425
Installs             0
Type                 0
Price                0
Content Rating       0
Genres               0
Last Updated         0
Current Ver          4
Android Ver          2
Log_Installs         0
Log_Reviews          0
Rating_Group         0
Revenue              0
Year                 0
dtype: int64

In [118]:
# Count missing values per column in the reviews table.
reviews_df.isnull().sum()

App                       0
Translated_Review         0
Sentiment                 0
Sentiment_Polarity        0
Sentiment_Subjectivity    0
Sentiment_Score           0
dtype: int64

In [119]:
# Data cleaning: keep only the rows that have a Size value.
# Size is not critical for this analysis, so rows without it are removed
# rather than imputed.
# NOTE: a later cell re-reads the CSV into apps_df, which discards this filter.
apps_df = apps_df[apps_df['Size'].notna()]

In [120]:
# Re-count missing values to confirm the Size nulls are gone.
apps_df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       3
Android Ver       2
Log_Installs      0
Log_Reviews       0
Rating_Group      0
Revenue           0
Year              0
dtype: int64

In [121]:
# The 'Current Ver' and 'Android Ver' columns are not used in this task, so they are dropped below.
# This section builds the installs-by-category visualization.

import plotly.express as px

# Start again from the cleaned CSV on disk (this replaces the apps_df built
# in the earlier cells), keep rows that have an Installs value, and drop the
# two version columns that this task does not use.
apps_df = (
    pd.read_csv("E:/Data Analyst Intern Tasks/Jupyter/Cleaned_GooglePlaystore.csv")
    .dropna(subset=['Installs'])
    # errors='ignore' keeps this step safe if the columns are already absent.
    .drop(columns=['Current Ver', 'Android Ver'], errors='ignore')
)

In [122]:
# Create the bar chart visualization: total installs for the ten categories
# with the highest install counts.
installs_by_category = apps_df.groupby('Category')['Installs'].sum().nlargest(10)

fig = px.bar(
    # Horizontal bars need the numeric totals on x and the category names on y.
    # This also matches the `labels` mapping below and the '%{x:.2s}' text
    # template applied to this figure in a later cell.
    x=installs_by_category.values,
    y=installs_by_category.index,
    orientation='h',
    labels={'x': 'Installs', 'y': 'Category'},
    title='Top 10 Categories by Installs',
    color=installs_by_category.index,
    color_discrete_sequence=px.colors.sequential.Blues,
    width=800,
    height=500
)

In [123]:
# Dark theme: black canvas, white text, faint vertical grid lines, and a wide
# left margin so the category labels are not clipped. The legend is hidden.
fig.update_layout(
    showlegend=False,
    margin={'l': 100, 'r': 50, 't': 80, 'b': 50},
    font_color='white',
    title_font={'size': 20},
    paper_bgcolor='black',
    plot_bgcolor='black',
    xaxis={
        'title_font': {'size': 14},
        'tickfont': {'size': 12},
        'gridcolor': 'rgba(255, 255, 255, 0.1)',
    },
    yaxis={
        'title_font': {'size': 14},
        'tickfont': {'size': 12},
    },
)

In [124]:
# Label each bar with its x value in 2-significant-digit SI notation, placed
# outside the bar, and outline the bars in white.
fig.update_traces(
    marker={'line': {'color': 'white', 'width': 1}},
    textposition='outside',
    texttemplate='%{x:.2s}',
)

In [125]:
def save_plotly_html(fig, filename, title=None):
    """Write a Plotly figure to a standalone HTML file.

    Args:
        fig: Figure-like object exposing ``update_layout`` and ``write_html``.
        filename: Destination handed to ``fig.write_html`` as ``file``.
        title: Optional chart title. When truthy it is applied to ``fig``
            itself via ``update_layout`` before saving, so the caller's
            figure keeps the new title afterwards.

    Returns:
        None. A confirmation line is printed once the file is written.
    """
    export_options = {
        'file': filename,
        'full_html': True,
        # Reference plotly.js from the CDN instead of embedding it in the file.
        'include_plotlyjs': 'cdn',
        'default_height': '100%',
    }

    if title:
        fig.update_layout(title=title)

    fig.write_html(**export_options)
    print(f"Successfully saved visualization to {filename}")

# Export the chart under a more descriptive title, then render it inline.
# save_plotly_html applies the title to `fig` itself, so fig.show() displays
# the new title as well.
save_plotly_html(fig,'Installs by Category.html','Top 10 Categories by Installs - Google Play Store Analysis')
fig.show()

Successfully saved visualization to Installs by Category.html


In [126]:
# A second version of the same chart with a few styling changes.

# Keep rows where both Installs and Category are present, then total the
# installs per category and keep the ten largest.
apps_df = apps_df[apps_df[['Installs', 'Category']].notna().all(axis=1)]
installs_by_category = (
    apps_df
    .groupby('Category')['Installs']
    .sum()
    .nlargest(10)
)

In [127]:
# Compact horizontal bar chart: install totals on x, category names on y,
# one bar colour per category taken from the reversed Blues palette.
fig = px.bar(
    title='Top 10 App Categories by Installs',
    labels={'x':'Total Installs', 'y':'Category'},
    orientation='h',
    x=installs_by_category.values,
    y=installs_by_category.index,
    color=installs_by_category.index,
    color_discrete_sequence=px.colors.sequential.Blues_r,
    height=400,
    width=600,
)

In [128]:
# Light theme: white canvas, black text, and tight margins apart from the
# left side, which leaves room for the category labels.
fig.update_layout(
    margin={'l': 100, 'r': 20, 't': 50, 'b': 20},
    title_font={'size':18},
    font_color='black',
    paper_bgcolor='white',
    plot_bgcolor='white',
)

In [129]:
# Label each bar with its x value in 2-significant-digit SI notation,
# placed outside the bar.
fig.update_traces(
    textposition='outside',
    texttemplate='%{x:.2s}',
)

In [130]:
# Save the light-theme chart to HTML and render it inline.
# Note the underscore file name: this is a different file from the
# 'Installs by Category.html' written for the dark-theme chart.
fig.write_html("Installs_by_Category.html")
fig.show()

In [131]:
import pandas as pd
import plotly.express as px
from datetime import datetime, time
import pytz

In [132]:
# Function to check if current time is between 6 PM and 8 PM IST
def is_display_time(now=None):
    """Return True when the IST wall-clock time is within 18:00-20:00.

    Both ends are inclusive: 18:00:00 and exactly 20:00:00 are inside the
    window, anything later than 20:00:00 is outside.

    Args:
        now: Optional ``datetime`` to evaluate instead of the real clock,
            which makes the gate reproducible. A timezone-aware value is
            converted to IST; a naive value is taken to already be IST
            wall-clock time. Defaults to the current time.

    Returns:
        bool: Whether the (possibly injected) time falls in the window.
    """
    # Local import so the function does not depend on which names other
    # cells have imported from `datetime`.
    from datetime import datetime, time, timedelta, timezone

    # IST is a fixed UTC+05:30 offset with no daylight saving, so a stdlib
    # fixed-offset zone is sufficient and no tz database is required.
    ist = timezone(timedelta(hours=5, minutes=30), 'IST')

    if now is None:
        now = datetime.now(ist)
    elif now.tzinfo is not None:
        now = now.astimezone(ist)

    return time(18, 0) <= now.time() <= time(20, 0)

In [133]:
# Prepare the data for the choropleth map

'''steps -
a. Filter categories that don't start with A, C, G, or S 
b. Group by Category and sum Installs
c. Get top 5 categories by installs
d. Filter the original data for only these top 5 categories
e. Group by Category and Country 
f. Add a highlight column for categories with > 1 million installs

'''

"steps -\na. Filter categories that don't start with A, C, G, or S \nb. Group by Category and sum Installs\nc. Get top 5 categories by installs\nd. Filter the original data for only these top 5 categories\ne. Group by Category and Country \nf. Add a highlight column for categories with > 1 million installs\n\n"

In [134]:
def prepare_choropleth_data(apps=None, excluded_prefixes=('A', 'C', 'G', 'S'),
                            top_n=5, highlight_threshold=1000000):
    """Aggregate installs for the top categories into a frame for the map.

    Steps: drop categories whose name starts with an excluded prefix, total
    the installs per category, keep the ``top_n`` largest, then sum installs
    per (Country, Category) and flag the rows above ``highlight_threshold``.

    Args:
        apps: DataFrame with at least 'Category' and 'Installs' columns.
            Defaults to the notebook-level ``apps_df``. It is not modified.
        excluded_prefixes: Category name prefixes to leave out.
        top_n: Number of categories (by total installs) to keep.
        highlight_threshold: Rows with strictly more installs than this get
            ``Highlight`` set to True.

    Returns:
        DataFrame with columns 'Country', 'Category', 'Installs', 'Highlight',
        one row per kept category.
    """
    if apps is None:
        apps = apps_df

    # na=False makes a missing Category count as "does not start with" the
    # prefixes; such rows then drop out in the groupby below.
    excluded = apps['Category'].str.startswith(tuple(excluded_prefixes), na=False)
    eligible = apps[~excluded]

    top_categories = (
        eligible.groupby('Category')['Installs'].sum().nlargest(top_n).index
    )

    # The dataset has no country information, so every row gets a single
    # placeholder location. `.assign` returns a new frame, which avoids the
    # SettingWithCopyWarning raised by assigning a column on a filtered slice.
    # NOTE(review): 'World' is not an ISO-3 country code, so a choropleth using
    # the default locationmode is not expected to shade any region; real
    # per-country install data is needed for a meaningful map.
    map_data = (
        eligible[eligible['Category'].isin(top_categories)]
        .assign(Country='World')
        .groupby(['Country', 'Category'])['Installs']
        .sum()
        .reset_index()
    )

    # Add a highlight column for categories above the install threshold
    map_data['Highlight'] = map_data['Installs'] > highlight_threshold
    return map_data

In [135]:
# Create the interactive choropleth map
def create_choropleth_map():
    """Build the animated choropleth of installs for the top categories.

    The figure is only produced inside the display window checked by
    ``is_display_time()``; outside it a notice is printed and None is
    returned. Each category is one animation frame, and categories flagged
    in the ``Highlight`` column of the prepared data get a thicker border.

    Returns:
        The Plotly figure, or None when called outside the display window.
    """
    if not is_display_time():
        print("The choropleth map is only available between 6 PM and 8 PM IST.")
        return None

    map_data = prepare_choropleth_data()

    # Create the choropleth map. The title is set once, in update_layout below.
    fig = px.choropleth(
        map_data,
        locations='Country',
        color='Installs',
        hover_name='Category',
        animation_frame='Category',
        color_continuous_scale=px.colors.sequential.Plasma,
        scope='world',
        labels={'Installs': 'Total Installs'},
        height=600)

    fig.update_layout(
        geo=dict(
            showframe=False,
            showcoastlines=True,
            projection_type='equirectangular'),
        title={
            'text': "Global Installs by Top 5 Categories (Excluding A,C,G,S starters)",
            'y':0.95,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        margin={"r":0,"t":0,"l":0,"b":0})

    # Highlight the flagged categories (> 1 million installs in the prepared
    # data). A pandas Series cannot be used as an update_traces selector
    # value, and update_traces does not reach the traces stored in animation
    # frames, so the border is applied frame by frame, matching on the frame
    # name (the category).
    highlighted = {str(cat) for cat in map_data.loc[map_data['Highlight'], 'Category']}
    highlight_marker = dict(line=dict(width=2, color='DarkSlateGrey'))
    for frame in fig.frames:
        if str(frame.name) in highlighted:
            for trace in frame.data:
                trace.update(marker=highlight_marker)

    # The traces in fig.data are what is drawn before the animation runs;
    # NOTE(review): this assumes they mirror the first frame - confirm.
    if fig.frames and str(fig.frames[0].name) in highlighted:
        fig.update_traces(marker=highlight_marker)
    return fig

In [136]:
# Display the map if within the specified time.
# create_choropleth_map() performs the time check itself and prints the
# availability notice when outside the window, so it is called exactly once
# here. choropleth_fig is always (re)assigned, so a figure left over from an
# earlier run cannot linger once the window has closed.
choropleth_fig = create_choropleth_map()
if choropleth_fig is not None:
    choropleth_fig.show()

The choropleth map is only available between 6 PM and 8 PM IST.
