In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "browser"
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import webbrowser
import os

In [2]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\gunri\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
apps_df = pd.read_csv('Play Store Data.csv')
reviews_df = pd.read_csv('User Reviews.csv')

In [4]:
apps_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [5]:
reviews_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [6]:
#pd.read_csv() : csv files
#pd.read_excel() : excel files
#pd.read_json() : JSON files
#pd.read_sql() : SQL databases

In [7]:
#df.isnull(): Recognize missing values
#df.dropna(): Remove missing values
#df.fillna(): Fill in missing values

In [8]:
#df.duplicated(): Recognize duplicate values in rows
#df.drop_duplicates(): Remove duplicate values in rows

In [9]:
#STEP 2: DATA CLEANING
#handle missing values and duplicates
apps_df = apps_df.dropna(subset=['Rating'])
for column in apps_df.columns:
    apps_df[column].fillna(apps_df[column].mode()[0],inplace=True)
apps_df.drop_duplicates(inplace=True)
apps_df = apps_df[apps_df['Rating']<=5]
reviews_df.dropna(subset=['Translated_Review'],inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [10]:
apps_df.dtypes


App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [11]:
#DATA TRANSFORMATION
#Converting installs column to numeric by removing '+' and ',' characters
apps_df['Installs'] = apps_df['Installs'].str.replace(',','').str.replace('+','').astype(int)

#converting price column to numeric by removing '$' character
apps_df['Price'] = apps_df['Price'].str.replace('$','').astype(float)

In [12]:
apps_df.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs            int32
Type               object
Price             float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [13]:
#Merge data
merged_df = pd.merge(apps_df,reviews_df,on = 'App',how = 'inner')

In [14]:
merged_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,A kid's excessive ads. The types ads allowed a...,Negative,-0.25,1.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,It bad >:(,Negative,-0.725,0.833333
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,like,Neutral,0.0,0.0
3,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I love colors inspyering,Positive,0.5,0.6
4,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I hate,Negative,-0.8,0.9


In [15]:
#Size Conversion
def convert_size(size):
    if 'M' in size:
        return float(size.replace('M',''))
    elif 'k' in size:
        return float(size.replace('k',''))/1024
    else:
        return np.nan
apps_df['Size']=apps_df['Size'].apply(convert_size)


In [16]:
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [17]:
#DSA Example 1: Dictionary / Hash Map
# Count apps per category using dictionary (Hash Map)
def count_apps_by_category(apps):
    category_count = {}
    for app in apps:
        category = app['Category']
        if category not in category_count:
            category_count[category] = 0
        category_count[category] += 1
    return category_count

apps_list = apps_df.to_dict("records")
category_stats_dsa = count_apps_by_category(apps_list)
print("Apps per category (using DSA Hash Map):")
print(category_stats_dsa)

Apps per category (using DSA Hash Map):
{'ART_AND_DESIGN': 62, 'AUTO_AND_VEHICLES': 73, 'BEAUTY': 42, 'BOOKS_AND_REFERENCE': 177, 'BUSINESS': 270, 'COMICS': 58, 'COMMUNICATION': 307, 'DATING': 159, 'EDUCATION': 129, 'ENTERTAINMENT': 111, 'EVENTS': 45, 'FINANCE': 317, 'FOOD_AND_DRINK': 106, 'HEALTH_AND_FITNESS': 262, 'HOUSE_AND_HOME': 68, 'LIBRARIES_AND_DEMO': 65, 'LIFESTYLE': 305, 'GAME': 1074, 'FAMILY': 1718, 'MEDICAL': 302, 'SOCIAL': 244, 'SHOPPING': 202, 'PHOTOGRAPHY': 304, 'SPORTS': 286, 'TRAVEL_AND_LOCAL': 205, 'TOOLS': 734, 'PERSONALIZATION': 310, 'PRODUCTIVITY': 334, 'PARENTING': 50, 'WEATHER': 75, 'VIDEO_PLAYERS': 160, 'NEWS_AND_MAGAZINES': 214, 'MAPS_AND_NAVIGATION': 124}


In [18]:
#DSA Example 2: Sorting (Bubble Sort)
# Sort apps by Rating using Bubble Sort (DSA)
def bubble_sort(apps):
    n = len(apps)
    for i in range(n):
        for j in range(0, n-i-1):
            if apps[j]['Rating'] > apps[j+1]['Rating']:
                apps[j], apps[j+1] = apps[j+1], apps[j]
    return apps

# Take first 10 apps for demo
sample_apps = apps_list[:10]
sorted_apps = bubble_sort(sample_apps.copy())
print("Apps sorted by rating (using Bubble Sort):")
for app in sorted_apps:
    print(app['App'], app['Rating'])

Apps sorted by rating (using Bubble Sort):
Smoke Effect Photo Maker - Smoke Editor 3.8
Coloring book moana 3.9
Photo Editor & Candy Camera & Grid & ScrapBook 4.1
Infinite Painter 4.1
Pixel Draw - Number Art Coloring Book 4.3
Paper flowers instructions 4.4
Garden Coloring Book 4.4
Sketch - Draw & Paint 4.5
U Launcher Lite – FREE Live Cool Themes, Hide Apps 4.7
Kids Paint Free - Drawing Fun 4.7


In [19]:
#DSA Example 3: Searching (Linear & Binary Search)
# Linear Search for app by name
def linear_search(apps, target_name):
    for app in apps:
        if app['App'] == target_name:
            return app
    return None

# Binary Search (requires sorted list by name)
def binary_search(apps, target_name):
    low, high = 0, len(apps)-1
    while low <= high:
        mid = (low + high) // 2
        if apps[mid]['App'] == target_name:
            return apps[mid]
        elif apps[mid]['App'] < target_name:
            low = mid + 1
        else:
            high = mid - 1
    return None

print("Linear Search Example:")
print(linear_search(apps_list[:100], 'Textgram - write on photos'))

apps_sorted_by_name = sorted(apps_list[:100], key=lambda x: x['App'])
print("Binary Search Example:")
print(binary_search(apps_sorted_by_name, 'Textgram - write on photos'))

Linear Search Example:
{'App': 'Textgram - write on photos', 'Category': 'ART_AND_DESIGN', 'Rating': 4.4, 'Reviews': '295221', 'Size': nan, 'Installs': 10000000, 'Type': 'Free', 'Price': 0.0, 'Content Rating': 'Everyone', 'Genres': 'Art & Design', 'Last Updated': 'July 30, 2018', 'Current Ver': 'Varies with device', 'Android Ver': 'Varies with device'}
Binary Search Example:
{'App': 'Textgram - write on photos', 'Category': 'ART_AND_DESIGN', 'Rating': 4.4, 'Reviews': '295221', 'Size': nan, 'Installs': 10000000, 'Type': 'Free', 'Price': 0.0, 'Content Rating': 'Everyone', 'Genres': 'Art & Design', 'Last Updated': 'July 30, 2018', 'Current Ver': 'Varies with device', 'Android Ver': 'Varies with device'}


In [20]:
#DSA Example 4: Stack & Queue Simulation
# Stack example: Recently viewed apps
stack = []
def view_app(app_name):
    stack.append(app_name)

def recent_app():
    return stack[-1] if stack else None

view_app("Instagram")
view_app("Facebook")
view_app("Twitter")
print("Recently viewed app (using Stack):", recent_app())

# Queue example: Download requests
from collections import deque
queue = deque()

def request_download(app_name):
    queue.append(app_name)

def process_download():
    return queue.popleft() if queue else None

request_download("WhatsApp")
request_download("Spotify")
print("Processing download (using Queue):", process_download())
print("Next in queue:", process_download())

Recently viewed app (using Stack): Twitter
Processing download (using Queue): WhatsApp
Next in queue: Spotify


In [21]:
 #DSA Example 5: Graph Representation
# Graph example: Categories as nodes with edges between similar categories
graph = {}

def add_edge(cat1, cat2):
    if cat1 not in graph:
        graph[cat1] = []
    if cat2 not in graph:
        graph[cat2] = []
    graph[cat1].append(cat2)
    graph[cat2].append(cat1)

# Example: connect some categories
add_edge("GAME", "FAMILY")
add_edge("FAMILY", "LIFESTYLE")
add_edge("GAME", "ENTERTAINMENT")

print("Graph Representation of Categories:")
for node, edges in graph.items():
    print(node, "->", edges)

Graph Representation of Categories:
GAME -> ['FAMILY', 'ENTERTAINMENT']
FAMILY -> ['GAME', 'LIFESTYLE']
LIFESTYLE -> ['FAMILY']
ENTERTAINMENT -> ['GAME']


In [22]:
#Add new feautures
#Logarithmic Transformation
apps_df['Log_Installs'] = np.log1p(apps_df['Installs'])
#apps_df['Log_Reviews'] = np.log1p(apps_df['Reviews']) #this shows error because its datatype is still object and not numeric

In [23]:
apps_df['Reviews'] = apps_df["Reviews"].astype(int)

In [24]:
apps_df['Log_Reviews'] = np.log1p(apps_df['Reviews'])

In [25]:
apps_df.dtypes

App                object
Category           object
Rating            float64
Reviews             int32
Size              float64
Installs            int32
Type               object
Price             float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
Log_Installs      float64
Log_Reviews       float64
dtype: object

In [26]:
#categorise rating
def rating_group(rating):
    if rating >= 4:
        return 'Top rated app'
    elif rating >= 3:
        return 'Above average app'
    elif rating >= 2:
        return 'Average app'
    else:
        return 'Below average app'
apps_df['Rating_Group'] = apps_df['Rating'].apply(rating_group)        

In [27]:
#deriving a metric
#revenue column
apps_df['Revenue'] = apps_df['Price'] * apps_df['Installs']

In [28]:
#SENIMENT ANALYSIS
sia = SentimentIntensityAnalyzer()

In [29]:
#Polarity scores in SIA
#Postive, Negative, Neutral, Compound:-1 means very negative and +1 means very positive

In [30]:
review = "This app is amazing! I love the new feautures."
sentiment_score = sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.0, 'neu': 0.42, 'pos': 0.58, 'compound': 0.8516}


In [31]:
review = "This app is very bad! I hate the new feautures."
sentiment_score = sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.535, 'neu': 0.465, 'pos': 0.0, 'compound': -0.8427}


In [32]:
review = "This app is okay!"
sentiment_score = sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.0, 'neu': 0.578, 'pos': 0.422, 'compound': 0.2942}


In [33]:
reviews_df['Sentiment_Score'] = reviews_df['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

In [34]:
reviews_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,0.9531
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,0.6597
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,0.6249
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3,0.6369
5,10 Best Foods for You,Best way,Positive,1.0,0.3,0.6369


In [35]:
#extract only year from last updated column
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'],errors = 'coerce')

In [36]:
apps_df['Year'] = apps_df['Last Updated'].dt.year

In [37]:
apps_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,9.21044,5.075174,Top rated app,0.0,2018
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122365,6.875232,Above average app,0.0,2018
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,15.424949,11.37952,Top rated app,0.0,2018
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,17.727534,12.281389,Top rated app,0.0,2018
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,11.512935,6.875232,Top rated app,0.0,2018


In [61]:
apps_df.to_csv("cleaned_apps_data.csv", index=False)
print("✅ Cleaned data saved as cleaned_apps_data.csv")

✅ Cleaned data saved as cleaned_apps_data.csv


In [38]:
#PLOTLY VISUALISATION
#create directory to save html files
html_files_path="./"
if not os.path.exists(html_files_path):
    os.makedirs(html_files_path)

In [39]:
plot_containers=""

In [40]:
def save_plot_as_html(fig, filename, insight):
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    # Append the plot and its insight to plot_containers
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    fig.write_html(filepath, full_html=False, include_plotlyjs='inline')

In [41]:
plot_width = 400
plot_height = 300
plot_bg_color = 'black'
text_color = 'white'
title_font ={'size':16}
axis_font = {'size':12}

In [42]:
#Figure 1
category_counts = apps_df['Category'].value_counts().nlargest(10)
fig1 = px.bar(
    x = category_counts.index,
    y = category_counts.values,
    labels = {'x':'Category','y':'Count'},
    title = 'Top 10 App Categories',
    color = category_counts.index,
    color_discrete_sequence=px.colors.sequential.Plasma,
    width = 400,
    height = 300
)
fig1.update_layout(
    plot_bgcolor = 'black',
    paper_bgcolor = 'black',
    font_color = 'white',
    title_font={'size':16},
    xaxis={'title_font':{'size':12}},
    yaxis={'title_font':{'size':12}},
    margin = dict(l=20,r=20,t=40,b=20)
)
#fig1.update_traces(marker = dict(marker = dict(line = dict(color='white',width=1 ))))
save_plot_as_html(fig1,"Category Graph 1.html","The top categories on Play Store are dominated by tools,entertainment and education apps.")


In [43]:
#Figure 2
#Type Analysis Plot
#Analyze Distribution of free and paid apps
type_counts = apps_df['Type'].value_counts()
fig2 = px.pie(
    values = type_counts.values,
    names = type_counts.index,
    title = 'App Type Distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
    width = 400,
    height = 300
)
fig2.update_layout(
    plot_bgcolor = 'black',
    paper_bgcolor = 'black',
    font_color = 'white',
    title_font={'size':16},
    margin = dict(l=20,r=20,t=40,b=20)
)
#fig1.update_traces(marker = dict(marker = dict(line = dict(color='white',width=1 ))))
save_plot_as_html(fig2,"Type Graph 2.html","Most of the apps on Play Store are free to download, with only a small percentage being paid apps, indicating a strategy to attract a larger user base and monetize through alternative means such as in-app purchases or advertisements.")

In [44]:
#Figure 3
#Rating Distribution Plot
#To examine how the ratings are distributed across different apps
fig3 = px.histogram(
    apps_df,
    x = 'Rating',
    nbins = 20,
    title = 'Rating Distribution',
    color_discrete_sequence=['#636EFA'],
    width = 400,
    height = 300
)
fig3.update_layout(
    plot_bgcolor = 'black',
    paper_bgcolor = 'black',
    font_color = 'white',
    title_font={'size':16},
    xaxis=dict(title_font={'size':12}),
    yaxis=dict(title_font={'size':12}),
    margin = dict(l=20,r=20,t=40,b=20)
)
#fig1.update_traces(marker = dict(marker = dict(line = dict(color='white',width=1 ))))
save_plot_as_html(fig3,"Rating Graph 3.html","Ratings are skewed towards the higher end, suggesting that users tend to give positive feedback for apps they download.")

In [45]:
#Figure 4
#Analyse sentiment scores of user reviews
sentiment_counts = reviews_df['Sentiment_Score'].value_counts()
fig4=px.bar(
    x = sentiment_counts.index,
    y = sentiment_counts.values,
    labels = {'x':'Sentiment Score','y':'Count'},
    title = 'Sentiment Score Distribution',
    color = sentiment_counts.index,
    color_discrete_sequence=px.colors.sequential.RdPu,
    width = 400,
    height = 300
)
fig4.update_layout(
    plot_bgcolor = 'black',
    paper_bgcolor = 'black',
    font_color = 'white',
    title_font={'size':16},
    xaxis={'title_font':{'size':12}},
    yaxis={'title_font':{'size':12}},
    margin = dict(l=20,r=20,t=40,b=20)
)
#fig1.update_traces(marker = dict(marker = dict(line = dict(color='white',width=1 ))))
save_plot_as_html(fig4,"Sentiment Graph 4.html","Sentiments in reviews show a mix of positive and negative feedback, with a slight leaning towards positive sentiments")

In [46]:
#Figure 5
#VIsualizing which genre has more installs
installs_by_category = apps_df.groupby('Category')['Installs'].sum().nlargest(10)
fig5=px.bar(
    x = installs_by_category.index,
    y = installs_by_category.values,
    orientation='h',
    labels = {'x':'Installs','y':'Category'},
    title = 'Installs by Category',
    color = installs_by_category.index,
    color_discrete_sequence=px.colors.sequential.Blues,
    width = 400,
    height = 300
)
fig5.update_layout(
    plot_bgcolor = 'black',
    paper_bgcolor = 'black',
    font_color = 'white',
    title_font={'size':16},
    xaxis={'title_font':{'size':12}},
    yaxis={'title_font':{'size':12}},
    margin = dict(l=20,r=20,t=40,b=20)
)
#fig1.update_traces(marker = dict(marker = dict(line = dict(color='white',width=1 ))))
save_plot_as_html(fig5,"Installs Graph 5.html","The categories with the highest total installs are social and communication apps, indicating their widespread popularity and usage among users.")

In [47]:
#Figure 6
#number of app updates over the years
#Over the years means line plot
updates_per_year = apps_df['Last Updated'].dt.year.value_counts().sort_index()
fig6=px.line(
    x = updates_per_year.index,
    y = updates_per_year.values,
    labels = {'x':'Year','y':'Number of Updates'},
    title = 'App Updates Over the Years',
    color_discrete_sequence=px.colors.sequential.Viridis,
    width = 400,
    height = 300
)
fig6.update_layout(
    plot_bgcolor = 'black',
    paper_bgcolor = 'black',
    font_color = 'white',
    title_font={'size':16},
    xaxis={'title_font':{'size':12}},
    yaxis={'title_font':{'size':12}},
    margin = dict(l=20,r=20,t=40,b=20)
)
save_plot_as_html(fig6,"Updates Graph 6.html","Updates have generally increased over the years, reflecting ongoing app development and maintenance efforts.")

In [48]:
#Figure 7
#Comparing revenue genenerated by app category
revenue_by_category = apps_df.groupby('Category')['Revenue'].sum().nlargest(10)
fig7=px.bar(
    x = installs_by_category.index,
    y = installs_by_category.values,
    labels = {'x':'Category','y':'Revenue'},
    title = 'Revenue by Category',
    color = installs_by_category.index,
    color_discrete_sequence=px.colors.sequential.Greens,
    width = 400,
    height = 300
)
fig7.update_layout(
    plot_bgcolor = 'black',
    paper_bgcolor = 'black',
    font_color = 'white',
    title_font={'size':16},
    xaxis=dict(title_font={'size':12}),
    yaxis=dict(title_font={'size':12}),
    margin = dict(l=20,r=20,t=40,b=20)
)
save_plot_as_html(fig7,"Revenue Graph 7.html","The categories generating the highest revenue are games and business apps, indicating strong monetization strategies in these sectors.") 

In [49]:
#Figure 8
#Count the genre top 10 most common genres
genre_counts = apps_df['Genres'].str.split(';',expand=True).stack().value_counts().nlargest(10) #Separate multiple genres and then stack them into a single column and then counts occurences and then take top 10
fig8=px.bar(
    x = genre_counts.index,
    y = genre_counts.values,
    labels = {'x':'Genre','y':'Count'},
    title = 'Top 10 App Genres',
    color = genre_counts.index,
    color_discrete_sequence=px.colors.sequential.Plasma,
    width = 400,
    height = 300
)
fig8.update_layout(
    plot_bgcolor = 'black',
    paper_bgcolor = 'black',
    font_color = 'white',
    title_font={'size':16},
    xaxis=dict(title_font={'size':12}),
    yaxis=dict(title_font={'size':12}),
    margin = dict(l=20,r=20,t=40,b=20)
)
save_plot_as_html(fig8,"Genre Graph 8.html","The most common genres among apps are tools, entertainment, and education, reflecting user preferences for utility and leisure applications.")

In [50]:
#Figure 9
#Scatter plot
#Relationship between last update date and app rating
fig9=px.scatter(
    apps_df,
    x = 'Last Updated',
    y = 'Rating',
    color = 'Type',
    title = 'App Rating vs Last Updated Date',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    width = 400,
    height = 300
)
fig9.update_layout(
    plot_bgcolor = 'black',
    paper_bgcolor = 'black',
    font_color = 'white',
    title_font={'size':16},
    xaxis=dict(title_font={'size':12}),
    yaxis=dict(title_font={'size':12}),
    margin = dict(l=20,r=20,t=40,b=20)
)
save_plot_as_html(fig9,"Scatter Graph 9.html","There is a weak correlation between the recency of updates and app ratings, suggesting that frequent updates do not necessarily lead to higher user satisfaction.")

In [51]:
#Figure 10
#Box plot(USeful to identify outliers)
#compare ratings of paid and free apps
fig10=px.box(
    apps_df,
    x = 'Type',
    y = 'Rating',
    color = 'Type',
    title = 'App Ratings for paid vs free apps',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    width = 400,
    height = 300
)
fig10.update_layout(
    plot_bgcolor = 'black',
    paper_bgcolor = 'black',
    font_color = 'white',
    title_font={'size':16},
    xaxis=dict(title_font={'size':12}),
    yaxis=dict(title_font={'size':12}),
    margin = dict(l=20,r=20,t=40,b=20)
)  
save_plot_as_html(fig10,"Paid Free Graph 10.html","Paid apps tend to have slightly higher median ratings compared to free apps,suggesting that users expect more from paid applications.")

In [52]:
#embedding all plots in a single html file
plot_containers_split = plot_containers.split('</div>')

In [53]:
if len(plot_containers_split) > 1:
    final_plot=plot_containers_split[-2]+'</div>'
else:
    final_plot=plot_containers

In [54]:
#DASHBOARD
dashboard_html= """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name=viewport" content="width=device-width,initial-scale-1.0">
    <title> Google Play Store Review Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0,0,0,0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container: hover .insights {{
            display: block;
        }}
        </style>
        <script>
            function openPlot(filename) {{
                window.open(filename, '_blank');
                }}
        </script>
    </head>
    <body>
        <div class= "header">
            <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
            <h1>Google Play Store Reviews Analytics</h1>
            <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
        </div>
        <div class="container">
            {plots}
        </div>
    </body>
    </html>
    """

In [55]:
#Save the dashboard html file #Use container to fill in plots
final_html = dashboard_html.format(plots=plot_containers,plot_width=plot_width,plot_height=plot_height)

In [56]:
#Save the dashboard to a html file
dashboard_path = os.path.join(html_files_path,"Web Page.html")

In [57]:
#path has to open the html file in browser
with open(dashboard_path,"w",encoding="utf-8") as f:
    f.write(final_html)

In [58]:
#open automatically in browser
webbrowser.open('file://' + os.path.realpath(dashboard_path))

True

In [60]:
from datetime import datetime, timezone, timedelta
from plotly.subplots import make_subplots

# Create grouped bar chart for avg rating and total reviews for top 10 categories by installs
# with filters: Avg rating >= 4.0 (category-level), app Size >= 10 (MB), Last Updated in January
# Show this graph in dashboard only between 15:00 and 17:00 IST (inclusive start, exclusive end)

import plotly.graph_objects as go

# determine current time in IST
now_utc = datetime.now(timezone.utc)
now_ist = now_utc + timedelta(hours=5, minutes=30)
hour_ist = now_ist.hour

# prepare filtered dataset (size and January last-updated)
filtered_apps = apps_df[
    (apps_df['Size'].notna()) &
    (apps_df['Size'] >= 10.0) &
    (apps_df['Last Updated'].notna()) &
    (apps_df['Last Updated'].dt.month == 1)
].copy()

# aggregate metrics by category
if not filtered_apps.empty:
    agg = (
        filtered_apps
        .groupby('Category', as_index=False)
        .agg(
            Avg_Rating=('Rating', 'mean'),
            Total_Reviews=('Reviews', 'sum'),
            Total_Installs=('Installs', 'sum'),
            App_Count=('App', 'count')
        )
    )
    # take top 10 by installs then filter categories with avg rating >= 4.0
    top_by_installs = agg.sort_values('Total_Installs', ascending=False).head(10)
    top_filtered = top_by_installs[top_by_installs['Avg_Rating'] >= 4.0]
else:
    top_filtered = agg = top_by_installs = None

# decide whether to show the plot (15:00 <= hour < 17:00 IST)
show_allowed = (15 <= hour_ist < 17)

filename = "Grouped Rating Reviews Graph 11.html"
insight_text = "Comparison of average rating and total review count for top categories (filtered by size>=10MB and last updated in January)."

global plot_containers

if show_allowed and top_filtered is not None and not top_filtered.empty:
    # create grouped bar chart with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(
        go.Bar(
            x=top_filtered['Category'],
            y=top_filtered['Avg_Rating'],
            name='Average Rating',
            marker_color='cyan',
            text=top_filtered['Avg_Rating'].round(2),
            textposition='auto'
        ),
        secondary_y=False
    )
    fig.add_trace(
        go.Bar(
            x=top_filtered['Category'],
            y=top_filtered['Total_Reviews'],
            name='Total Reviews',
            marker_color='magenta',
            text=top_filtered['Total_Reviews'],
            textposition='auto'
        ),
        secondary_y=True
    )

    fig.update_layout(
        title='Avg Rating vs Total Reviews (Top Categories by Installs)',
        barmode='group',
        width=plot_width,
        height=plot_height,
        plot_bgcolor='black',
        paper_bgcolor='black',
        font_color='white',
        title_font=title_font,
        margin=dict(l=20, r=20, t=40, b=40)
    )
    fig.update_yaxes(title_text="Average Rating", range=[0,5], secondary_y=False, title_font=axis_font)
    fig.update_yaxes(title_text="Total Reviews", secondary_y=True, title_font=axis_font)

    # save and append to dashboard containers
    save_plot_as_html(fig, filename, insight_text)
else:
    # do not show the graph in dashboard; append a placeholder container explaining reason
    if not show_allowed:
        reason = f"Graph available only between 3PM and 5PM IST. Current IST time: {now_ist.strftime('%Y-%m-%d %H:%M:%S')}"
    elif top_filtered is None or (top_filtered is not None and top_filtered.empty):
        reason = "No data meets the filtering criteria (Size>=10MB, Last Updated in January, and Avg Rating>=4.0)."
    else:
        reason = "Graph not available."

    placeholder_html = f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot" style="height:{plot_height}px; width:{plot_width}px;">
            <div style="color:{text_color}; padding:10px;">
                <h3>Grouped Rating & Reviews (Hidden)</h3>
                <p>{reason}</p>
            </div>
        </div>
        <div class="insights">{insight_text}</div>
    </div>
    """
    plot_containers += placeholder_html