TRAINING TASKS:-

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import webbrowser
import os


In [2]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
# Step 1: Load the Dataset
app_df=pd.read_csv("apps.csv")
rvw_df=pd.read_csv("user_reviews.csv")

In [4]:
app_df.head()

Unnamed: 0.1,Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [5]:
rvw_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [6]:
# Step 2: Data Cleaning
app_df = app_df.dropna(subset=['Rating'])
for column in app_df.columns:
    app_df[column].fillna(app_df[column].mode()[0], inplace=True)
app_df.drop_duplicates(inplace=True)
app_df = app_df[app_df['Rating'] != 5]
rvw_df.dropna(subset=['Translated_Review'], inplace=True)

In [7]:
# Merge datasets on 'App' and handle non-matching apps
merged_df = pd.merge(app_df, rvw_df, on='App', how='inner')

In [8]:
# Step 3: Data Transformation
app_df['Reviews'] = app_df['Reviews'].astype(int)
app_df['Installs'] = app_df['Installs'].str.replace(',', '').str.replace('+', '').astype(int)
app_df['Price'] = app_df['Price'].str.replace('$', '').astype(float)

In [9]:
def convert_size(size):
    if 'M' in size:
        return float(size.replace('M', ''))
    elif 'k' in size:
        return float(size.replace('k', '')) / 1024
    else:
        return np.nan

In [10]:
merged_df.head()

Unnamed: 0.1,Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,A kid's excessive ads. The types ads allowed a...,Negative,-0.25,1.0
1,1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,It bad >:(,Negative,-0.725,0.833333
2,1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,like,Neutral,0.0,0.0
3,1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I love colors inspyering,Positive,0.5,0.6
4,1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I hate,Negative,-0.8,0.9


In [11]:
#lograrithmetic
#Add log_installs and log_reviews columns
app_df['Log_Installs']=np.log(app_df['Installs'])

In [12]:
app_df['Reviews']=app_df['Reviews'].astype(int)

In [13]:
app_df['Log_Reviews']=np.log(app_df['Reviews'])

In [14]:
app_df.dtypes

Unnamed: 0          int64
App                object
Category           object
Rating            float64
Reviews             int32
Size              float64
Installs            int32
Type               object
Price             float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
Log_Installs      float64
Log_Reviews       float64
dtype: object

In [15]:
# Add Rating Group column
def rating_group(rating):
    if rating>=4:
        return 'top rated app'
    elif rating >=3:
        return 'above average'
    elif rating>=2:
        return 'average'
    else:
        return 'below average'
app_df['Rating_Group']=app_df['Rating'].apply(rating_group)


In [16]:
## Add Revenue column
app_df['Revenue']=app_df['Price']*app_df['Installs']

In [17]:
# Sentiment Analysis
sia= SentimentIntensityAnalyzer()
rvw_df['Sentiment_Score'] = rvw_df['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

In [18]:
#polarity scores in SIA
#positive, negative , nuetral and compund

In [19]:
review='this app is amazing! i love the new features'
sentiment_score=sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.0, 'neu': 0.42, 'pos': 0.58, 'compound': 0.8516}


In [20]:
review='this app is very bad! i hate the new features'
sentiment_score=sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.535, 'neu': 0.465, 'pos': 0.0, 'compound': -0.8427}


In [21]:
review='this app is okay!'
sentiment_score=sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.0, 'neu': 0.578, 'pos': 0.422, 'compound': 0.2942}


In [22]:
rvw_df['sentiment_score']=rvw_df['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

In [23]:
rvw_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score,sentiment_score
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,0.9531,0.9531
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,0.6597,0.6597
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,0.6249,0.6249
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3,0.6369,0.6369
5,10 Best Foods for You,Best way,Positive,1.0,0.3,0.6369,0.6369


In [24]:
# Extract year from 'Last Updated' and create 'Year' column
app_df['Last Updated']=pd.to_datetime(app_df['Last Updated'],errors='coerce')

In [25]:
app_df['Year']=app_df['Last Updated'].dt.year

In [26]:
app_df.head()

Unnamed: 0.1,Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year
0,0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,9.21034,5.068904,top rated app,0.0,2018
1,1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122363,6.874198,above average,0.0,2018
2,2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,15.424948,11.379508,top rated app,0.0,2018
3,3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,17.727534,12.281384,top rated app,0.0,2018
4,4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,11.512925,6.874198,top rated app,0.0,2018


In [27]:
import plotly.express as px
fig=px.bar(x=["A","B","C"],y=[1,3,2],title="sample bar chat")
fig.show()

In [28]:
import os

# Define the path for your HTML files
html_files_path = "C:/Users/User/Desktop/google/html files"

# Make sure the directory exists
if not os.path.exists(html_files_path):
    os.makedirs(html_files_path)

# Initialize plot_containers
plot_containers = ""

# Save each Plotly figure to an HTML file
def save_plot_as_html(fig, filename, insight):
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    # Append the plot and its insight to plot_containers
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    fig.write_html(filepath, full_html=False, include_plotlyjs='inline')

# Define your plots
plot_width = 400
plot_height = 300
plot_bg_color = 'black'
text_color = 'white'
title_font = {'size': 16}
axis_font = {'size': 12}
# Category Analysis Plot
category_counts = app_df['Category'].value_counts().nlargest(10)
fig1 = px.bar(
    x=category_counts.index,
    y=category_counts.values,
    labels={'x': 'Category', 'y': 'Count'},
    title='Top Categories on Play Store',
    color=category_counts.index,
    color_discrete_sequence=px.colors.sequential.Plasma,
    width=plot_width,
    height=plot_height
)
fig1.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig1.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig1, "category_analysis.html", "The top categories on the Play Store are dominated by tools, entertainment, and productivity apps. This suggests users are looking for apps that either provide utility or offer leisure activities.")

# Type Analysis Plot
type_counts = app_df['Type'].value_counts()
fig2 = px.pie(
    values=type_counts.values,
    names=type_counts.index,
    title='App Type Distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
    width=plot_width,
    height=plot_height
)
fig2.update_traces(textposition='inside', textinfo='percent+label')
fig2.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig2, "type_analysis.html", "Most apps on the Play Store are free, indicating a strategy to attract users first and monetize through ads or in-app purchases.")

# Rating Distribution Plot
fig3 = px.histogram(
    app_df,
    x='Rating',
    nbins=20,
    title='Rating Distribution',
    color_discrete_sequence=['#636EFA'],
    width=plot_width,
    height=plot_height
)
fig3.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig3, "rating_distribution.html", "Ratings are skewed towards higher values, suggesting that most apps are rated favorably by users.")

sentiment_counts = rvw_df['Sentiment_Score'].value_counts()
fig4 = px.bar(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    labels={'x': 'Sentiment Score', 'y': 'Count'},
    title='Sentiment Distribution',
    color=sentiment_counts.index,
    color_discrete_sequence=px.colors.sequential.RdPu,
    width=plot_width,
    height=plot_height
)
fig4.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig4.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig4, "sentiment_distribution.html", "Sentiments in reviews show a mix of positive and negative feedback, with a slight lean towards positive sentiments.")

# Installs by Category Plot
installs_by_category = app_df.groupby('Category')['Installs'].sum().nlargest(10)
fig5 = px.bar(
    x=installs_by_category.values,
    y=installs_by_category.index,
    orientation='h',
    labels={'x': 'Installs', 'y': 'Category'},
    title='Installs by Category',
    color=installs_by_category.index,
    color_discrete_sequence=px.colors.sequential.Blues,
    width=plot_width,
    height=plot_height
)
fig5.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig5.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig5, "installs_by_category.html", "The categories with the most installs are social and communication apps, which reflects their broad appeal and daily usage.")

# Updates Per Year Plot
updates_per_year = app_df['Last Updated'].dt.year.value_counts().sort_index()
fig6 = px.line(
    x=updates_per_year.index,
    y=updates_per_year.values,
    labels={'x': 'Year', 'y': 'Number of Updates'},
    title='Number of Updates Over the Years',
    color_discrete_sequence=['#AB63FA'],
    width=plot_width,
    height=plot_height
)
fig6.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig6, "updates_per_year.html", "Updates have been increasing over the years, showing that developers are actively maintaining and improving their apps.")

# Revenue by Category Plot
revenue_by_category = app_df.groupby('Category')['Revenue'].sum().nlargest(10)
fig7 = px.bar(
    x=revenue_by_category.index,
    y=revenue_by_category.values,
    labels={'x': 'Category', 'y': 'Revenue'},
    title='Revenue by Category',
    color=revenue_by_category.index,
    color_discrete_sequence=px.colors.sequential.Greens,
    width=plot_width,
    height=plot_height
)
fig7.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig7.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig7, "revenue_by_category.html", "Categories such as Business and Productivity lead in revenue generation, indicating their monetization potential.")

# Genre Count Plot
genre_counts = app_df['Genres'].str.split(';', expand=True).stack().value_counts().nlargest(10)
fig8 = px.bar(
    x=genre_counts.index,
    y=genre_counts.values,
    labels={'x': 'Genre', 'y': 'Count'},
    title='Top Genres',
    color=genre_counts.index,
    color_discrete_sequence=px.colors.sequential.OrRd,
    width=plot_width,
    height=plot_height
)
fig8.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig8.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig8, "genres_counts.html", "Action and Casual genres are the most common, reflecting users' preference for engaging and easy-to-play games.")

# Impact of Last Update on Rating
fig9 = px.scatter(
    app_df,
    x='Last Updated',
    y='Rating',
    color='Type',
    title='Impact of Last Update on Rating',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    width=plot_width,
    height=plot_height
)
fig9.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig9, "update_on_rating.html", "The scatter plot shows a weak correlation between the last update date and ratings, suggesting that more frequent updates don't always result in better ratings.")

# Ratings for Paid vs Free Apps
fig10 = px.box(
    app_df,
    x='Type',
    y='Rating',
    color='Type',
    title='Ratings for Paid vs Free Apps',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    width=plot_width,
    height=plot_height
)
fig10.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig10, "ratings_paid_free.html", "Paid apps generally have higher ratings compared to free apps, suggesting that users expect higher quality from apps they pay for.")

# Split plot_containers to handle the last plot properly
plot_containers_split = plot_containers.split('</div>')
if len(plot_containers_split) > 1:
    final_plot = plot_containers_split[-2] + '</div>'
else:
    final_plot = plot_containers  # Use plot_containers as default if splitting isn't sufficient


In [29]:
# Define your HTML template string
dashboard_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Review Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0,0,0,0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights {{
            display: block;
        }}
    </style>
    <script>
        function openPlot(filename) {{
            window.open(filename, '_blank');
        }}
    </script>
</head>
<body>
    <div class="header">
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
        <h1>Google Play Store Reviews Analytics</h1>
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
    </div>
    <div class="container">
        {plots}
    </div>
</body>
</html>
"""


final_html="C:/Users/User/Desktop/google/dashboard.html"
# Format it with your dynamic variables
final_html = dashboard_html.format(plots=plot_containers, plot_width=plot_width, plot_height=plot_height)

# Save it to file
dashboard_path = os.path.join(html_files_path, "dashboard.html")
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

# Open in browser
import webbrowser, os
webbrowser.open('file://' + os.path.realpath(dashboard_path))

True

INTERNSHIP TASKS:-

TASK 1:-

In [30]:
import plotly.express as px

# Filter for paid apps only
paid_apps = app_df[app_df['Type'] == 'Paid'].copy()
paid_apps['Revenue'] = paid_apps['Installs'] * paid_apps['Price']

# Plot
fig1 = px.scatter(
    paid_apps,
    x='Installs',
    y='Revenue',
    color='Category',
    title='Revenue vs Installs for Paid Apps',
    trendline='ols',
    hover_data=['App']
)
fig1.show()

TASK 2:-

In [31]:
import datetime
import pytz
import plotly.graph_objects as go

# Function to check if current time is within a given IST time range
def is_between_ist(start_hour, end_hour):
    ist = pytz.timezone('Asia/Kolkata')
    now_ist = datetime.datetime.now(ist)
    return start_hour <= now_ist.hour < end_hour

if is_between_ist(15, 17):  # Between 3 PM and 5 PM IST

    # Apply filters
    filtered = app_df[
        (app_df['Rating'] >= 4.0) &
        (app_df['Size'] >= 10 * 1024 * 1024) &  # 10 MB in bytes
        (app_df['Last Updated'].dt.month == 1)  # Only January
    ]

    if not filtered.empty:
        # Identify top 10 categories by total installs
        top_categories = (
            filtered.groupby('Category')['Installs'].sum()
            .nlargest(10).index
        )
        top_data = filtered[filtered['Category'].isin(top_categories)]

        # Compute metrics
        grouped = top_data.groupby('Category').agg({
            'Rating': 'mean',
            'Reviews': 'sum'
        }).reset_index()

        # Create grouped bar chart
        fig2 = go.Figure()

        fig2.add_trace(go.Bar(
            x=grouped['Category'],
            y=grouped['Rating'],
            name='Average Rating',
            marker_color='steelblue'
        ))

        fig2.add_trace(go.Bar(
            x=grouped['Category'],
            y=grouped['Reviews'],
            name='Total Reviews',
            marker_color='darkorange'
        ))

        fig2.update_layout(
            title="Top 10 Categories: Avg Rating vs Total Reviews (Filtered)",
            xaxis_title="App Category",
            yaxis_title="Value",
            barmode='group',
            plot_bgcolor=plot_bg_color,
            paper_bgcolor=plot_bg_color,
            font=dict(color=text_color),
            margin=dict(l=10, r=10, t=50, b=40)
        )

        save_plot_as_html(
            fig2,
            "task2_grouped_bar.html",
            "This chart compares average ratings and total reviews for the top 10 categories (filtered by high ratings, size ≥10MB, and updated in January)."
        )
task_html_files_path="C:/Users/User/Desktop/google/task_html_files"
fig2.write_html(os.path.join(task_html_files_path, "task2_grouped_bar.html"), full_html=False, include_plotlyjs='cdn')
save_path = os.path.join(task_html_files_path, "task2_grouped_bar.html")
fig2.write_html(save_path, full_html=False, include_plotlyjs='cdn')
print(f" Task 2 saved: {save_path}")



 Task 2 saved: C:/Users/User/Desktop/google/task_html_files\task2_grouped_bar.html


TASK 3:-

In [32]:
if is_between_ist(18, 20):  # 6 PM to 8 PM IST
    import plotly.express as px

    # Filter out unwanted categories
    valid_categories = app_df[~app_df['Category'].str.startswith(('A', 'C', 'G', 'S'))]

    # Get top 5 by total installs
    top5_categories = valid_categories.groupby('Category')['Installs'].sum().nlargest(5).index
    top5_df = valid_categories[valid_categories['Category'].isin(top5_categories)].copy()

    # Assign random or mock countries if real ones aren't present
    # In real case, replace with actual `Country` column
    country_mapping = ['US', 'IN', 'BR', 'GB', 'CA']  # Example countries
    top5_df['Country'] = [country_mapping[i % len(country_mapping)] for i in range(len(top5_df))]

    # Aggregate installs by country-category
    category_country_installs = top5_df.groupby(['Country', 'Category'])['Installs'].sum().reset_index()

    # Plot Choropleth map
    fig3 = px.choropleth(
        category_country_installs,
        locations='Country',
        color='Installs',
        hover_name='Category',
        color_continuous_scale='Viridis',
        title='Global Installs by Category (Top 5 Only)',
    )

    fig3.update_layout(
        plot_bgcolor=plot_bg_color,
        paper_bgcolor=plot_bg_color,
        font=dict(color=text_color),
        margin=dict(l=10, r=10, t=40, b=10)
    )

    # Save plot to HTML
    save_plot_as_html(
        fig3,
        "choropleth_top5.html",
        "Top 5 app categories by installs (excluding A, C, G, S). Countries with high installs are highlighted."
    )
save_path = os.path.join(task_html_files_path, "task3_choropleth_map.html")
fig3.write_html(save_path, full_html=False, include_plotlyjs='cdn')
print(f" Task 3 saved: {save_path}")


 Task 3 saved: C:/Users/User/Desktop/google/task_html_files\task3_choropleth_map.html


TASK 4:-

In [33]:
if is_between_ist(13, 14):  # 1 PM to 2 PM IST
    filtered = app_df[
        (app_df['Installs'] >= 10000) &
        (app_df['Price'] * app_df['Installs'] >= 10000) &
        (app_df['Android Ver'].str.extract(r'(\d+\.\d+)').astype(float).fillna(0)[0] > 4.0) &
        (app_df['Size'] >= 15 * 1024 * 1024) &
        (app_df['Content Rating'] == 'Everyone') &
        (app_df['App'].str.len() <= 30)
    ].copy()

    # Calculate revenue
    filtered['Revenue'] = filtered['Price'] * filtered['Installs']

    # Get top 3 categories by installs
    top3_cats = filtered.groupby('Category')['Installs'].sum().nlargest(3).index
    top3_df = filtered[filtered['Category'].isin(top3_cats)]

    # Group by Type and Category
    agg = top3_df.groupby(['Category', 'Type']).agg({
        'Installs': 'mean',
        'Revenue': 'mean'
    }).reset_index()

    # Pivot for dual-axis
    fig4 = px.bar(
        agg,
        x='Category',
        y='Installs',
        color='Type',
        barmode='group',
        labels={'Installs': 'Average Installs'},
        title='Average Installs and Revenue for Free vs Paid Apps (Top 3 Categories)'
    )

    # Add revenue as secondary y-axis
    for typ in agg['Type'].unique():
        sub = agg[agg['Type'] == typ]
        fig4.add_scatter(
            x=sub['Category'],
            y=sub['Revenue'],
            mode='lines+markers',
            name=f'Revenue ({typ})',
            yaxis='y2'
        )

    fig4.update_layout(
        yaxis=dict(title='Avg Installs'),
        yaxis2=dict(title='Avg Revenue', overlaying='y', side='right'),
        plot_bgcolor=plot_bg_color,
        paper_bgcolor=plot_bg_color,
        font=dict(color=text_color),
        margin=dict(l=10, r=10, t=40, b=10)
    )

    # Save plot
    save_plot_as_html(
        fig4,
        "dual_axis_free_vs_paid.html",
        "Comparison of average installs and revenue for Free vs Paid apps in top 3 categories."
    )
save_path = os.path.join(task_html_files_path, "task4_dual_axis_free_vs_paid.html")
fig4.write_html(save_path, full_html=False, include_plotlyjs='cdn')
print(f" Task 4 saved: {save_path}")


 Task 4 saved: C:/Users/User/Desktop/google/task_html_files\task4_dual_axis_free_vs_paid.html


TASK 5:-

In [34]:
if is_between_ist(16, 18):  # 4 PM to 6 PM IST
    # Apply filters
    filtered = app_df[
        (app_df['App'].str.contains('C', case=False)) &
        (app_df['Reviews'] >= 10) &
        (app_df['Rating'] < 4.0)
    ].copy()

    # Count apps per category
    category_counts = filtered['Category'].value_counts()
    valid_categories = category_counts[category_counts > 50].index

    # Filter for those categories only
    violin_df = filtered[filtered['Category'].isin(valid_categories)]

    # Plot violin
    fig5 = px.violin(
        violin_df,
        x='Category',
        y='Rating',
        box=True,
        points='all',
        title='Rating Distribution for App Categories (with “C” in name & Rating < 4.0)',
        color='Category'
    )

    fig5.update_layout(
        plot_bgcolor=plot_bg_color,
        paper_bgcolor=plot_bg_color,
        font=dict(color=text_color),
        margin=dict(l=10, r=10, t=40, b=10)
    )

    # Save it
    save_plot_as_html(
        fig5,
        "violin_rating_distribution.html",
        "This violin plot shows rating distributions for categories with >50 apps where the app name includes 'C' and the rating is below 4.0."
    )
save_path = os.path.join(task_html_files_path, "task5_violin_rating_distribution.html")
fig5.write_html(save_path, full_html=False, include_plotlyjs='cdn')
print(f" Task 5 saved: {save_path}")
  

 Task 5 saved: C:/Users/User/Desktop/google/task_html_files\task5_violin_rating_distribution.html


TASK 6:-

In [35]:
if is_between_ist(14, 16):  # 2 PM to 4 PM IST
    one_year_ago = pd.Timestamp.now() - pd.DateOffset(years=1)

    # Apply filters
    filtered = app_df[
        (app_df['Last Updated'] >= one_year_ago) &
        (app_df['Installs'] >= 100_000) &
        (app_df['Reviews'] > 1_000) &
        (~app_df['Genres'].str.startswith(tuple(['A', 'F', 'E', 'G', 'I', 'K'])))
    ].copy()

    # Select relevant columns and compute correlation
    corr_matrix = filtered[['Installs', 'Rating', 'Reviews']].corr()

    # Plot heatmap
    fig6 = px.imshow(
        corr_matrix,
        text_auto=True,
        color_continuous_scale='Viridis',
        title='Correlation Matrix: Installs, Ratings, and Review Counts'
    )

    fig6.update_layout(
        plot_bgcolor=plot_bg_color,
        paper_bgcolor=plot_bg_color,
        font=dict(color=text_color),
        margin=dict(l=10, r=10, t=40, b=10)
    )

    # Save it
    save_plot_as_html(
        fig6,
        "correlation_heatmap.html",
        "This heatmap highlights correlations between installs, ratings, and reviews for high-performing, recently updated apps."
    )
save_path = os.path.join(task_html_files_path, "task6_heatmap_correlation.html")
fig6.write_html(save_path, full_html=False, include_plotlyjs='cdn')
print(f" Task 6 saved: {save_path}")


 Task 6 saved: C:/Users/User/Desktop/google/task_html_files\task6_heatmap_correlation.html


TASK 7:-

In [36]:
if is_between_ist(18, 21):  # 6 PM to 9 PM IST
    # Step 1: Filter data
    filtered = app_df[
        (app_df['Content Rating'] == 'Teen') &
        (app_df['App'].str.startswith('E')) &
        (app_df['Installs'] > 10_000)
    ].copy()

    # Step 2: Extract month-year from 'Last Updated' for time trend
    filtered['Month'] = filtered['Last Updated'].dt.to_period('M').dt.to_timestamp()

    # Step 3: Group and aggregate installs by Category & Month
    trend_df = filtered.groupby(['Category', 'Month'])['Installs'].sum().reset_index()

    # Step 4: Calculate MoM % growth
    trend_df['MoM_Growth'] = trend_df.groupby('Category')['Installs'].pct_change() * 100

    # Step 5: Create line plot with shaded area for MoM growth > 20%
    fig7 = px.line(
        trend_df,
        x='Month',
        y='Installs',
        color='Category',
        title='Monthly Install Trends by Category (Teen, Apps starting with E)'
    )

    # Highlight significant growth
    for category in trend_df['Category'].unique():
        cat_data = trend_df[trend_df['Category'] == category]
        growth_periods = cat_data[cat_data['MoM_Growth'] > 20]

        fig7.add_scatter(
            x=growth_periods['Month'],
            y=growth_periods['Installs'],
            mode='markers',
            marker=dict(size=8, color='orange', symbol='circle'),
            name=f'Significant Growth ({category})',
            showlegend=False
        )

    fig7.update_layout(
        plot_bgcolor=plot_bg_color,
        paper_bgcolor=plot_bg_color,
        font=dict(color=text_color),
        margin=dict(l=10, r=10, t=40, b=10)
    )

    # Save it
    save_plot_as_html(
        fig7,
        "install_trend_line_chart.html",
        "This chart visualizes monthly installs for Teen-rated apps starting with 'E'. Orange dots highlight >20% month-over-month growth."
    )
save_path = os.path.join(task_html_files_path, "task7_time_series_installs.html")
fig7.write_html(save_path, full_html=False, include_plotlyjs='cdn')
print(f" Task 7 saved: {save_path}")


 Task 7 saved: C:/Users/User/Desktop/google/task_html_files\task7_time_series_installs.html
