In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import webbrowser
import os
from datetime import datetime
import base64 
from wordcloud import WordCloud, STOPWORDS
import pytz
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
from PIL import Image
import pycountry

In [4]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [5]:
# Step 1: Load the Dataset
apps_df = pd.read_csv('Play Store Data.csv')
reviews_df = pd.read_csv('User Reviews.csv')

In [6]:
# Step 2: Data Cleaning
apps_df = apps_df.dropna(subset=['Rating'])
for column in apps_df.columns:
    apps_df[column].fillna(apps_df[column].mode()[0], inplace=True)
apps_df.drop_duplicates(inplace=True)
apps_df = apps_df[apps_df['Rating'] <= 5]
reviews_df.dropna(subset=['Translated_Review'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  apps_df[column].fillna(apps_df[column].mode()[0], inplace=True)


In [7]:
# Merge datasets on 'App' and handle non-matching apps
merged_df = pd.merge(apps_df, reviews_df, on='App', how='inner')

In [8]:
# Step 3: Data Transformation
apps_df['Reviews'] = apps_df['Reviews'].astype(int)
apps_df['Installs'] = apps_df['Installs'].str.replace(',', '').str.replace('+', '').astype(int)
apps_df['Price'] = apps_df['Price'].str.replace('$', '').astype(float)

In [9]:
def convert_size(size):
    if 'M' in size:
        return float(size.replace('M', ''))
    elif 'k' in size:
        return float(size.replace('k', '')) / 1024
    else:
        return np.nan

In [10]:
apps_df['Size'] = apps_df['Size'].apply(convert_size)

In [11]:
# Add log_installs and log_reviews columns
apps_df['Log_Installs'] = np.log1p(apps_df['Installs'])
apps_df['Log_Reviews'] = np.log1p(apps_df['Reviews'])

In [12]:
# Add Rating Group column
def rating_group(rating):
    if rating >= 4:
        return 'Top rated'
    elif rating >= 3:
        return 'Above average'
    elif rating >= 2:
        return 'Average'
    else:
        return 'Below average'

apps_df['Rating_Group'] = apps_df['Rating'].apply(rating_group)

In [13]:
# Add Revenue column
apps_df['Revenue'] = apps_df['Price'] * apps_df['Installs']

In [14]:
# Sentiment Analysis
sia = SentimentIntensityAnalyzer()
reviews_df['Sentiment_Score'] = reviews_df['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

In [15]:
# Extract year from 'Last Updated' and create 'Year' column
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'], errors='coerce')
apps_df['Year'] = apps_df['Last Updated'].dt.year

In [16]:
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,9.210440,5.075174,Top rated,0.0,2018
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122365,6.875232,Above average,0.0,2018
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,15.424949,11.379520,Top rated,0.0,2018
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,17.727534,12.281389,Top rated,0.0,2018
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,11.512935,6.875232,Top rated,0.0,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,2017-06-18,1.0.0,4.1 and up,6.216606,2.079442,Top rated,0.0,2017
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,2017-07-25,1.48,4.1 and up,8.517393,3.663562,Top rated,0.0,2017
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education,2018-07-06,1.0,4.1 and up,4.615121,1.609438,Top rated,0.0,2018
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,2015-01-19,Varies with device,Varies with device,6.908755,4.744932,Top rated,0.0,2015


In [17]:
fil = apps_df[(apps_df["Category"]=="GAME") & (apps_df["Rating"] > 3.5) & (apps_df["Installs"] > 50000)]
fil

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year
1653,ROBLOX,GAME,4.5,4447388,67.0,100000000,Free,0.0,Everyone 10+,Adventure;Action & Adventure,2018-07-31,2.347.225742,4.1 and up,18.420681,15.307828,Top rated,0.0,2018
1654,Subway Surfers,GAME,4.5,27722264,76.0,1000000000,Free,0.0,Everyone 10+,Arcade,2018-07-12,1.90.0,4.1 and up,20.723266,17.137746,Top rated,0.0,2018
1655,Candy Crush Saga,GAME,4.4,22426677,74.0,500000000,Free,0.0,Everyone,Casual,2018-07-05,1.129.0.2,4.1 and up,20.030119,16.925762,Top rated,0.0,2018
1656,Solitaire,GAME,4.7,254258,23.0,10000000,Free,0.0,Everyone,Card,2018-08-01,2.137.0,4.1 and up,16.118096,12.446109,Top rated,0.0,2018
1657,Bubble Shooter,GAME,4.5,148897,46.0,10000000,Free,0.0,Everyone,Casual,2018-07-17,1.20.1,4.0.3 and up,16.118096,11.911017,Top rated,0.0,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10784,Big Hunter,GAME,4.3,245455,84.0,10000000,Free,0.0,Everyone 10+,Action,2018-05-31,2.8.6,4.0 and up,16.118096,12.410873,Top rated,0.0,2018
10792,Soccer Clubs Logo Quiz,GAME,4.2,21661,16.0,1000000,Free,0.0,Everyone,Trivia,2018-05-24,1.3.81,4.0 and up,13.815512,9.983315,Top rated,0.0,2018
10793,Sid Story,GAME,4.4,28510,78.0,500000,Free,0.0,Teen,Card,2018-08-01,2.6.6,4.0.3 and up,13.122365,10.258045,Top rated,0.0,2018
10803,Fatal Raid - No.1 Mobile FPS,GAME,4.3,56496,81.0,1000000,Free,0.0,Teen,Action,2018-08-07,1.5.447,4.0 and up,13.815512,10.941943,Top rated,0.0,2018


In [18]:
fil.info()

<class 'pandas.core.frame.DataFrame'>
Index: 866 entries, 1653 to 10804
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             866 non-null    object        
 1   Category        866 non-null    object        
 2   Rating          866 non-null    float64       
 3   Reviews         866 non-null    int32         
 4   Size            752 non-null    float64       
 5   Installs        866 non-null    int32         
 6   Type            866 non-null    object        
 7   Price           866 non-null    float64       
 8   Content Rating  866 non-null    object        
 9   Genres          866 non-null    object        
 10  Last Updated    866 non-null    datetime64[ns]
 11  Current Ver     866 non-null    object        
 12  Android Ver     866 non-null    object        
 13  Log_Installs    866 non-null    float64       
 14  Log_Reviews     866 non-null    float64       
 15  Rating

In [None]:
import plotly.express as px

# Define the path for your HTML files
html_files_path = "./"

# Make sure the directory exists
if not os.path.exists(html_files_path):
    os.makedirs(html_files_path)

# Initialize plot_containers
plot_containers = ""

# Save each Plotly figure to an HTML file
def save_plot_as_html(fig, filename, insight):
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    # Append the plot and its insight to plot_containers
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    fig.write_html(filepath, full_html=False, include_plotlyjs='inline')

# Define your plots
plot_width = 400
plot_height = 300
plot_bg_color = 'black'
text_color = 'white'
title_font = {'size': 16}
axis_font = {'size': 12}

# Category Analysis Plot
category_counts = apps_df['Category'].value_counts().nlargest(10)
fig1 = px.bar(
    x=category_counts.index,
    y=category_counts.values,
    labels={'x': 'Category', 'y': 'Count'},
    title='Top Categories on Play Store',
    color=category_counts.index,
    color_discrete_sequence=px.colors.sequential.Plasma,
    width=plot_width,
    height=plot_height
)
fig1.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig1.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig1, "category_analysis.html", "The top categories on the Play Store are dominated by tools, entertainment, and productivity apps. This suggests users are looking for apps that either provide utility or offer leisure activities.")

# Type Analysis Plot
type_counts = apps_df['Type'].value_counts()
fig2 = px.pie(
    values=type_counts.values,
    names=type_counts.index,
    title='App Type Distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
    width=plot_width,
    height=plot_height
)
fig2.update_traces(textposition='inside', textinfo='percent+label')
fig2.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig2, "type_analysis.html", "Most apps on the Play Store are free, indicating a strategy to attract users first and monetize through ads or in-app purchases.")

# Rating Distribution Plot
fig3 = px.histogram(
    apps_df,
    x='Rating',
    nbins=20,
    title='Rating Distribution',
    color_discrete_sequence=['#636EFA'],
    width=plot_width,
    height=plot_height
)
fig3.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig3, "rating_distribution.html", "Ratings are skewed towards higher values, suggesting that most apps are rated favorably by users.")

sentiment_counts = reviews_df['Sentiment_Score'].value_counts()
fig4 = px.bar(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    labels={'x': 'Sentiment Score', 'y': 'Count'},
    title='Sentiment Distribution',
    color=sentiment_counts.index,
    color_discrete_sequence=px.colors.sequential.RdPu,
    width=plot_width,
    height=plot_height
)
fig4.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig4.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig4, "sentiment_distribution.html", "Sentiments in reviews show a mix of positive and negative feedback, with a slight lean towards positive sentiments.")

# Installs by Category Plot
installs_by_category = apps_df.groupby('Category')['Installs'].sum().nlargest(10)
fig5 = px.bar(
    x=installs_by_category.values,
    y=installs_by_category.index,
    orientation='h',
    labels={'x': 'Installs', 'y': 'Category'},
    title='Installs by Category',
    color=installs_by_category.index,
    color_discrete_sequence=px.colors.sequential.Blues,
    width=plot_width,
    height=plot_height
)
fig5.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig5.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig5, "installs_by_category.html", "The categories with the most installs are social and communication apps, which reflects their broad appeal and daily usage.")

# Updates Per Year Plot
updates_per_year = apps_df['Last Updated'].dt.year.value_counts().sort_index()
fig6 = px.line(
    x=updates_per_year.index,
    y=updates_per_year.values,
    labels={'x': 'Year', 'y': 'Number of Updates'},
    title='Number of Updates Over the Years',
    color_discrete_sequence=['#AB63FA'],
    width=plot_width,
    height=plot_height
)
fig6.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig6, "updates_per_year.html", "Updates have been increasing over the years, showing that developers are actively maintaining and improving their apps.")

# Revenue by Category Plot
revenue_by_category = apps_df.groupby('Category')['Revenue'].sum().nlargest(10)
fig7 = px.bar(
    x=revenue_by_category.index,
    y=revenue_by_category.values,
    labels={'x': 'Category', 'y': 'Revenue'},
    title='Revenue by Category',
    color=revenue_by_category.index,
    color_discrete_sequence=px.colors.sequential.Greens,
    width=plot_width,
    height=plot_height
)
fig7.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig7.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig7, "revenue_by_category.html", "Categories such as Business and Productivity lead in revenue generation, indicating their monetization potential.")

# Genre Count Plot
genre_counts = apps_df['Genres'].str.split(';', expand=True).stack().value_counts().nlargest(10)
fig8 = px.bar(
    x=genre_counts.index,
    y=genre_counts.values,
    labels={'x': 'Genre', 'y': 'Count'},
    title='Top Genres',
    color=genre_counts.index,
    color_discrete_sequence=px.colors.sequential.OrRd,
    width=plot_width,
    height=plot_height
)
fig8.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig8.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig8, "genres_counts.html", "Action and Casual genres are the most common, reflecting users' preference for engaging and easy-to-play games.")

# Impact of Last Update on Rating
fig9 = px.scatter(
    apps_df,
    x='Last Updated',
    y='Rating',
    color='Type',
    title='Impact of Last Update on Rating',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    width=plot_width,
    height=plot_height
)
fig9.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig9, "update_on_rating.html", "The scatter plot shows a weak correlation between the last update date and ratings, suggesting that more frequent updates don't always result in better ratings.")

# Ratings for Paid vs Free Apps
fig10 = px.box(
    apps_df,
    x='Type',
    y='Rating',
    color='Type',
    title='Ratings for Paid vs Free Apps',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    width=plot_width,
    height=plot_height
)
fig10.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig10, "ratings_paid_free.html", "Paid apps generally have higher ratings compared to free apps, suggesting that users expect higher quality from apps they pay for.")

#THE WORDCLOUD - TASK 1

merged_df = merged_df.dropna(subset=['Category', 'Rating', 'Translated_Review'])
filter_df = merged_df[(merged_df['Rating']>=4.8) & (merged_df['Category']=="HEALTH_AND_FITNESS")]
text = " ".join(filter_df['Translated_Review'].dropna())
stopwords = set(STOPWORDS)
custom_stops = set(filter_df['App'].unique())
stopwords.update(custom_stops)

wc = WordCloud(stopwords = stopwords, background_color = "white").generate(text)
# plt.imshow(wc, interpolation='bilinear')
# plt.axis('off')
# plt.show()

wordcloud_path = "wordcloud.png"
if os.path.exists(wordcloud_path):
    with open(wordcloud_path, "rb") as image_file:
        wordcloud_b64 = base64.b64encode(image_file.read()).decode()
    wordcloud_html = f""""
    <div class="plot_container" id="wordclod.png" onclick = "openPlot('wordcloud.png')">
        <div class="plot"><img src="data:image/png;base64,{wordcloud_b64}" alt="wordcloud" style="width:400px; height:300px; position:relative;"/></div>
        <div class="insights">The WordCloud shows the most frequent words from the reviews.</div>
    </div>"""
    plot_containers +=wordcloud_html
else:
    print("WordCloud  image not found. PLease check again")
    
#THE DUAL-AXIS CHART  --  TASK 2

# apps_df.head()
# apps_df.info()

top_3_cat = apps_df['Category'].value_counts().nlargest(3).index  #These are GAME, FAMILY, TOOLS
# top_3_cat
df = apps_df[apps_df['Category'].isin(top_3_cat)] #this stores only the data of top_3_catrgories games !!
# print('The {} apps are as follow'.format(df.Category.unique()))

#To ensure that the Andriod ver is in the required formt 
import re
def extract_major_minor(version_str):
    try:
        version_str = version_str.strip()
        # Look for a pattern like digits dot digits (e.g., "4.0" from "4.0.3 and up")
        match = re.search(r'(\d+\.\d+)', version_str)
        if match:
            return float(match.group(1))
        else:
            return None
    except Exception as e:
        print("Error:", e)
        return None

# Create a new numeric version column to only strore the numeric values(i.e, the float) of the andriod version!!!
apps_df['Android Ver Numeric'] = apps_df['Android Ver'].apply(extract_major_minor)
# print(apps_df['Android Ver Numeric'].head())
# apps_df 

df_dual = apps_df.copy()
# df_dual
df_dual = df_dual[
    (df_dual['Installs'] > 10000) &
    (df_dual['Revenue'] > 10000) &
    (df_dual['Android Ver Numeric'] > 4.0) &
    (df_dual['Size'] > 15.0) &
    (df_dual['Content Rating'] == 'Everyone') &
    (df_dual['App'].apply(lambda x: len(x) <=30))
]
top_3_cat = df_dual['Category'].value_counts().nlargest(3).index
df_dual = df_dual[df_dual['Category'].isin(top_3_cat)]

grouped_data = df_dual.groupby(['Category', 'Type']).agg({
    'Installs' : 'mean',
    'Revenue' : 'mean',
}).reset_index()

#time condition !!!
ist = pytz.timezone('Asia/Kolkata')
curr_time = datetime.now(ist).time()
start = datetime.strptime("13:00", "%H:%M").time()
end = datetime.strptime("14:00", "%H:%M").time()
if start <= curr_time < end:
    fig12 = make_subplots(specs=[[{"secondary_y":True}]])

    for app_type in grouped_data['Type'].unique():
        sub_df = grouped_data[grouped_data['Type']==app_type]
        fig12.add_trace(
            go.Bar(
                x = sub_df['Category'],
                y = sub_df['Installs'],
                name = f'Average Installs ({app_type})',
                marker = dict(color='blue' if app_type.lower()=='free' else 'green')
            ),
            secondary_y = False            
        )
        
        fig12.add_trace(
            go.Scatter(
                x = sub_df['Category'],
                y = sub_df['Revenue'],
                name = f'Average Revenue ({app_type})',
                mode = 'lines+markers',
                marker = dict(color='Orange' if app_type.lower()=='free' else 'red')
            ),
            secondary_y=True            
        )
    fig12.update_layout(
        title = "Installs and Revenue for Free vs Paid Apps for Top 3 categories",
        xaxis_title="App Category",
        legend_title="Legend",
        plot_bgcolor="black",
        paper_bgcolor="black",
        font_color="white",
        width=plot_width,
        height=plot_height,
        margin = dict(l=10, r=10, t=30, b=10)
    )
    fig12.update_yaxes(title_text="Average Installs", secondary_y=False)
    fig12.update_yaxes(title_text="Average Revenue", secondary_y=True)
    save_plot_as_html(fig12, "dual_axis_chart.html", "This dual-axis chart shows the relationship between the Category with the installs and revenue for the top-3 apps")

# THE CHOROPLETH MAP - TASK 3

''' FOR THE IMPLEMENTATION OF THE CHRORPLETH MAP, THE COUNTRIES DATASET IS MUST. BUT THE GIVEN DATASET OF THE GOOGLE PLAYSTORE DOESN'T CONTAIN THE 
COUTRIES COLUMN OR THE SEPARATE DATA FOR THE COUNTIRES. 
                                        !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
SINCE THE COUNTRIES DATASET ISN'T PRESENT, I'M IMPLEMENTING THIS MAP USING THE "pycountries" module. THIS MODULE HAS INBUILT COUNTRIES DATA, THAT 
HELPS US TO PLOT THE WORLD MAP (AN EMPTY WORLD MAP, BECAUSE WE CAN'T IMPLEMENT THE GIVEN DATA BECAUSE THE COUNTRIES AREN'T PRESENT IN DATASET.
                                        !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
I ALSO IMPLEMENTED ALL THE FILTERS GIVEN IN THE QUESTION, BUT I CAN'T PLOT IT BECACUSE THE COUNTRIES DATA ISN'T PRESENT.
FOR THIS REASON I'M PLOTTING AN EMPTY WORLD MAP. PLEASE CONSIDER THIS. 
                                        !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

'''
df = df[~df['Category'].str.startswith(('A', 'C', 'G', 'S'))]
# Aggregate installs by country and category
df_grouped = df.groupby(['Category']).agg({'Installs': 'sum'}).reset_index()
#HERE THE COUNTRY COLUMN TO BE ADDED, BUT NOT ADDED BECAUSE IT ISN'T IN THE DATASET !!

# Get top 5 categories by total installs
top_categories = df_grouped.groupby('Category')['Installs'].sum().nlargest(5).index
df_filtered = df_grouped[df_grouped['Category'].isin(top_categories)]

current_hour = datetime.now().hour

if 18<= current_hour < 20:
    country_codes = [country.alpha_3 for country in pycountry.countries]

    # Create dummy DataFrame
    dummy_df = pd.DataFrame({
        "Country": country_codes,
        "Installs": [0] * len(country_codes),
        "Category": ["Empty"] * len(country_codes)
    })

    # Create the choropleth map
    choropleth_fig = px.choropleth(
        dummy_df,
        locations="Country",
        locationmode="ISO-3",  # Using ISO-3 country codes
        color="Installs",
        color_continuous_scale="Viridis",
        title="Global Installs Map"
    )
    
    # Update layout
    choropleth_fig.update_layout(
        margin=dict(l=10, r=10, t=30, b=10),
        paper_bgcolor=plot_bg_color,
        plot_bgcolor=plot_bg_color,
        font=dict(color="white"),
        width=plot_width,
        height=plot_height
    )
    
    save_plot_as_html(choropleth_fig, "choropleth.html", "This shows the empty world map since the country column isn't provided in the dataset. The filters are implemented in the code")
    

#BUBBLE CHART - TASK 4 !!!

current_hour = datetime.now().hour
if 17<=current_hour<19:    #This one is only available from 5pm to 7 pm !!!
    #already filtered the data
    fig14 = px.scatter(
        fil,
        x = 'Size',
        y = 'Rating',
        size = 'Installs',
        color = 'Category',
        title = "Relationship between App Size and Rating",
        hover_name = "App",
        width = plot_width,
        height = plot_height,
    )
    fig14.update_layout(
        plot_bgcolor=plot_bg_color,
        paper_bgcolor = plot_bg_color,
        font_color = text_color,
        title_font = title_font,
        xaxis = dict(title_font = axis_font),
        yaxis = dict(title_font = axis_font),
        margin = dict(l=10, r=10, t=30, b=10)
    )
    save_plot_as_html(fig14, "BubbleChart.html", "This is the bubble chart showing the relationship between the app size and the Rating")

# TIME SERIES CHART - TASK 5

time_df = apps_df[(apps_df["Content Rating"]=="Teen") & (apps_df["Installs"] > 10000) & (apps_df['App'].str.startswith("E"))]
time_df["Month"] = time_df["Last Updated"].dt.to_period("M").dt.to_timestamp()
ts_data = time_df.groupby(['Category', 'Month'])['Installs'].sum().reset_index()

ist = pytz.timezone('Asia/Kolkata')
curr_time = datetime.now(ist).time()
start = datetime.strptime("18:00", "%H:%M").time()
end = datetime.strptime("21:00", "%H:%M").time() 
if start<= curr_time < end:
    fig15 = go.Figure()    
    categories = ts_data['Category'].unique()
    for cat in categories:
        cat_data = ts_data[ts_data['Category']==cat].sort_values("Month").copy()
        cat_data["percentage_change"] = cat_data['Installs'].pct_change()
        
        fig15.add_trace(
            go.Scatter(
                x=cat_data["Month"],
                y=cat_data["Installs"],
                mode="lines+markers",
                name=cat
            )
        )
        
        for i in range(1, len(cat_data)):
            if cat_data.iloc[i]["percentage_change"] > 0.2:
                x0 = cat_data.iloc[i-1]['Month']
                x1 = cat_data.iloc[i]['Month']
                fig15.add_vrect(
                    x0=x0, x1=x1,
                    fillcolor = "rgba(0, 255, 0, 0.2)",
                    layer="below",
                    line_width = 0,
                    annotation_text=">20% growth",
                    annotation_position="top left",
                )
                
    fig15.update_layout(
        xaxis_title="Time in Months",
        yaxis_title="Total Installs",
        plot_bgcolor="black",
        paper_bgcolor="black",
        font=dict(color="white"),
        width=plot_width,
        height = plot_height
    )
    save_plot_as_html(fig15, "Time_axis_chart.html","This time series chart shows the total install over time, segmented by app category, shaded regions indicate periods with over 20% monthly growth")


# Split plot_containers to handle the last plot properly
plot_containers_split = plot_containers.split('</div>')
if len(plot_containers_split) > 1:
    final_plot = plot_containers_split[-2] + '</div>'
else:
    final_plot = plot_containers  # Use plot_containers as default if splitting isn't sufficient

# HTML template for the dashboard
dashboard_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Reviews Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0, 0, 0, 0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights {{
            display: block;
        }}
    </style>
    <script>
        function openPlot(filename) {{
            window.open(filename, '_blank');
        }}
    </script>
</head>
<body>
    <div class="header">
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
        <h1>Google Play Store Reviews Analytics</h1>
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
    </div>
    <div class="container">
        {plots}
    </div>
</body>
</html>
"""

# Use these containers to fill in your dashboard HTML
final_html = dashboard_html.format(plots=plot_containers, plot_width=plot_width, plot_height=plot_height)

# Save the final dashboard to an HTML file
dashboard_path = os.path.join(html_files_path, "dashboard.html")
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

# Automatically open the generated HTML file in a web browser
webbrowser.open('file://' + os.path.realpath(dashboard_path))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



True