In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import webbrowser
import os
from google.colab import files

In [4]:
# Download NLTK lexicon
nltk.download('vader_lexicon')

# Upload both CSV files
uploaded = files.upload()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Saving User Reviews.csv to User Reviews.csv
Saving Play Store Data.csv to Play Store Data.csv


In [5]:
# Load datasets
apps_df = pd.read_csv('Play Store Data.csv')
reviews_df = pd.read_csv('User Reviews.csv')


In [6]:
# Data cleaning
apps_df = apps_df.dropna(subset=['Rating'])
for column in apps_df.columns:
    apps_df[column].fillna(apps_df[column].mode()[0], inplace=True)
apps_df.drop_duplicates(inplace=True)
apps_df = apps_df[apps_df['Rating'] <= 5]
reviews_df.dropna(subset=['Translated_Review'], inplace=True)

apps_df['Installs'] = apps_df['Installs'].str.replace(',', '').str.replace('+', '').astype(int)
apps_df['Price'] = apps_df['Price'].str.replace('$', '').astype(float)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  apps_df[column].fillna(apps_df[column].mode()[0], inplace=True)


In [7]:
# Merge datasets
merged_df = pd.merge(apps_df, reviews_df, on='App', how='inner')


In [10]:
# Convert size
def convert_size(size):
    if 'M' in size:
        return float(size.replace('M', ''))
    elif 'k' in size:
        return float(size.replace('k', '')) / 1024
    else:
        return np.nan
apps_df['Size'] = apps_df['Size'].apply(convert_size)

In [11]:
# Log transforms
apps_df['Log_Installs'] = np.log(apps_df['Installs'])
apps_df['Reviews'] = apps_df['Reviews'].astype(int)
apps_df['Log_Reviews'] = np.log(apps_df['Reviews'])

In [12]:
# Rating group
def rating_group(rating):
    if rating >= 4:
        return 'Top rated app'
    elif rating >= 3:
        return 'Above average'
    elif rating >= 2:
        return 'Average'
    else:
        return 'Below Average'
apps_df['Rating_Group'] = apps_df['Rating'].apply(rating_group)

In [13]:
# Revenue
apps_df['Revenue'] = apps_df['Price'] * apps_df['Installs']

# Sentiment analysis
sia = SentimentIntensityAnalyzer()
reviews_df['Sentiment_Score'] = reviews_df['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

# Date conversion
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'], errors='coerce')
apps_df['Year'] = apps_df['Last Updated'].dt.year

# Dashboard setup
html_files_path = "./"
os.makedirs(html_files_path, exist_ok=True)
plot_containers = ""

In [14]:
def save_plot_as_html(fig, filename, insight):
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    fig.write_html(filepath, full_html=False, include_plotlyjs='inline')
# Style
plot_width = 400
plot_height = 300
plot_bg_color = 'black'
text_color = 'white'
title_font = {'size': 16}
axis_font = {'size': 12}

In [15]:
# Plot 1: Category Distribution
category_counts = apps_df['Category'].value_counts().nlargest(10)
fig1 = px.bar(x=category_counts.index, y=category_counts.values, labels={'x':'Category','y':'Count'},
              title='Top Categories on Play Store', color=category_counts.index,
              color_discrete_sequence=px.colors.sequential.Plasma, width=plot_width, height=plot_height)
fig1.update_layout(plot_bgcolor=plot_bg_color, paper_bgcolor=plot_bg_color, font_color=text_color,
                   title_font=title_font, xaxis=dict(title_font=axis_font), yaxis=dict(title_font=axis_font),
                   margin=dict(l=10,r=10,t=30,b=10))
save_plot_as_html(fig1, "Category Graph 1.html", "Top categories: Tools, Entertainment, Productivity")

In [16]:
# Plot 2: Type Distribution
type_counts = apps_df['Type'].value_counts()
fig2 = px.pie(values=type_counts.values, names=type_counts.index, title='App Type Distribution',
              color_discrete_sequence=px.colors.sequential.RdBu, width=plot_width, height=plot_height)
fig2.update_layout(plot_bgcolor=plot_bg_color, paper_bgcolor=plot_bg_color, font_color=text_color,
                   title_font=title_font, margin=dict(l=10,r=10,t=30,b=10))
save_plot_as_html(fig2, "Type Graph 2.html", "Most apps are Free, monetized via ads or purchases")


In [17]:
# Plot 3: Rating Histogram
fig3 = px.histogram(apps_df, x='Rating', nbins=20, title='Rating Distribution',
                   color_discrete_sequence=['#636EFA'], width=plot_width, height=plot_height)
fig3.update_layout(plot_bgcolor=plot_bg_color, paper_bgcolor=plot_bg_color, font_color=text_color,
                   title_font=title_font, xaxis=dict(title_font=axis_font), yaxis=dict(title_font=axis_font),
                   margin=dict(l=10,r=10,t=30,b=10))
save_plot_as_html(fig3, "Rating Graph 3.html", "Ratings are skewed positively")

In [18]:
# Plot 4: Sentiment Distribution
sentiment_counts = reviews_df['Sentiment_Score'].value_counts()
fig4 = px.bar(x=sentiment_counts.index, y=sentiment_counts.values,
              labels={'x':'Sentiment Score','y':'Count'}, title='Sentiment Distribution',
              color=sentiment_counts.index, color_discrete_sequence=px.colors.sequential.RdPu,
              width=plot_width, height=plot_height)
fig4.update_layout(plot_bgcolor=plot_bg_color, paper_bgcolor=plot_bg_color, font_color=text_color,
                   title_font=title_font, xaxis=dict(title_font=axis_font), yaxis=dict(title_font=axis_font),
                   margin=dict(l=10,r=10,t=30,b=10))
save_plot_as_html(fig4, "Sentiment Graph 4.html", "Most reviews lean slightly positive")

In [19]:
# Plot 5: Installs by Category
installs_by_category = apps_df.groupby('Category')['Installs'].sum().nlargest(10)
fig5 = px.bar(x=installs_by_category.index, y=installs_by_category.values,
              labels={'x':'Category','y':'Installs'}, title='Installs by Category',
              color=installs_by_category.index, color_discrete_sequence=px.colors.sequential.Blues,
              width=plot_width, height=plot_height)
fig5.update_layout(plot_bgcolor=plot_bg_color, paper_bgcolor=plot_bg_color, font_color=text_color,
                   title_font=title_font, xaxis=dict(title_font=axis_font), yaxis=dict(title_font=axis_font),
                   margin=dict(l=10,r=10,t=30,b=10))
save_plot_as_html(fig5, "Installs Graph 5.html", "Communication & Social apps lead in installs")

In [20]:
# Plot 6: Updates per Year
updates_per_year = apps_df['Last Updated'].dt.year.value_counts().sort_index()
fig6 = px.line(x=updates_per_year.index, y=updates_per_year.values,
               labels={'x': 'Year', 'y': 'Number of Updates'}, title='Number of Updates Over the Years',
               color_discrete_sequence=['#AB63FA'], width=plot_width, height=plot_height)
fig6.update_layout(plot_bgcolor=plot_bg_color, paper_bgcolor=plot_bg_color, font_color=text_color,
                   title_font=title_font, xaxis=dict(title_font=axis_font), yaxis=dict(title_font=axis_font),
                   margin=dict(l=10, r=10, t=30, b=10))
save_plot_as_html(fig6, "Updates Graph 6.html", "Updates show active maintenance by developers")

In [21]:
# Plot 7: Revenue by Category
revenue_by_category = apps_df.groupby('Category')['Revenue'].sum().nlargest(10)
fig7 = px.bar(x=revenue_by_category.index, y=revenue_by_category.values,
              labels={'x':'Category','y':'Revenue'}, title='Revenue by Category',
              color=revenue_by_category.index, color_discrete_sequence=px.colors.sequential.Greens,
              width=plot_width, height=plot_height)
fig7.update_layout(plot_bgcolor=plot_bg_color, paper_bgcolor=plot_bg_color, font_color=text_color,
                   title_font=title_font, xaxis=dict(title_font=axis_font), yaxis=dict(title_font=axis_font),
                   margin=dict(l=10,r=10,t=30,b=10))
save_plot_as_html(fig7, "Revenue Graph 7.html", "Business & Productivity apps earn the most")

In [22]:
# Plot 8: Top Genres
genre_counts = apps_df['Genres'].str.split(';', expand=True).stack().value_counts().nlargest(10)
fig8 = px.bar(x=genre_counts.index, y=genre_counts.values,
              labels={'x':'Genre','y':'Count'}, title='Top Genres',
              color=genre_counts.index, color_discrete_sequence=px.colors.sequential.OrRd,
              width=plot_width, height=plot_height)
fig8.update_layout(plot_bgcolor=plot_bg_color, paper_bgcolor=plot_bg_color, font_color=text_color,
                   title_font=title_font, xaxis=dict(title_font=axis_font), yaxis=dict(title_font=axis_font),
                   margin=dict(l=10,r=10,t=30,b=10))
save_plot_as_html(fig8, "Genre Graph 8.html", "Action and Casual genres dominate")

In [23]:
# Plot 9: Update Date vs Rating
fig9 = px.scatter(apps_df, x='Last Updated', y='Rating', color='Type',
                  title='Impact of Last Update on Rating',
                  color_discrete_sequence=px.colors.qualitative.Vivid,
                  width=plot_width, height=plot_height)
fig9.update_layout(plot_bgcolor=plot_bg_color, paper_bgcolor=plot_bg_color, font_color=text_color,
                   title_font=title_font, xaxis=dict(title_font=axis_font), yaxis=dict(title_font=axis_font),
                   margin=dict(l=10,r=10,t=30,b=10))
save_plot_as_html(fig9, "Update Graph 9.html", "Frequent updates don’t guarantee higher ratings")

In [24]:
# Plot 10: Ratings - Paid vs Free
fig10 = px.box(apps_df, x='Type', y='Rating', color='Type',
               title='Rating for Paid vs Free Apps',
               color_discrete_sequence=px.colors.qualitative.Pastel,
               width=plot_width, height=plot_height)
fig10.update_layout(plot_bgcolor=plot_bg_color, paper_bgcolor=plot_bg_color, font_color=text_color,
                    title_font=title_font, xaxis=dict(title_font=axis_font), yaxis=dict(title_font=axis_font),
                    margin=dict(l=10,r=10,t=30,b=10))
save_plot_as_html(fig10, "Paid Free Graph 10.html", "Paid apps tend to have better ratings")

In [25]:
# Final dashboard HTML
print("Number of plots:", plot_containers.count('plot-container'))

dashboard_html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width,initial-scale=1.0">
    <title>Google Play Store Review Analytics</title>
    <style>
        body {{ font-family: Arial, sans-serif; background-color: #333; color: #fff; margin: 0; padding: 0; }}
        .header {{ display: flex; align-items: center; justify-content: center; padding: 20px; background-color: #444 }}
        .header img {{ margin: 0 10px; height: 50px; }}
        .container {{ display: flex; flex-wrap: wrap; justify-content: center; padding: 20px; }}
        .plot-container {{ border: 2px solid #555; margin: 10px; padding: 10px; width: {plot_width}px; height: {plot_height}px; overflow: hidden; position: relative; cursor: pointer; }}
        .insights {{ display: none; position: absolute; right: 10px; top: 10px; background-color: rgba(0,0,0,0.7); padding: 5px; border-radius: 5px; color: #fff; }}
        .plot-container:hover .insights {{ display: block; }}
    </style>
    <script>
        function openPlot(filename) {{ window.open(filename, '_blank'); }}
    </script>
</head>
<body>
    <div class="header">
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
        <h1>Google Play Store Reviews Analytics</h1>
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
    </div>
    <div class="container">
        {plot_containers}
    </div>
</body>
</html>
"""

dashboard_path = os.path.join(html_files_path, "web_page.html")
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(dashboard_html)

# Launch in browser
webbrowser.open('file://' + os.path.realpath(dashboard_path))

Number of plots: 30


False

In [26]:
from google.colab import files

# Download the dashboard HTML file
files.download(dashboard_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Assinged tasks

In [9]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import pytz

# --- Make a working copy of merged_df ---
df = merged_df.copy()

# --- Convert Last Updated to datetime ---
df['Last Updated'] = pd.to_datetime(df['Last Updated'], errors='coerce')

# --- Clean Size column to MB ---
def convert_size(x):
    if isinstance(x, str):
        x = x.strip()
        if x.endswith("M"):
            return float(x.replace("M", ""))
        elif x.endswith("k"):
            return float(x.replace("k", "")) / 1024  # KB → MB
        else:
            return None  # e.g. "Varies with device"
    return x

df["Size"] = df["Size"].apply(convert_size)

# --- Get IST hour ---
ist = pytz.timezone("Asia/Kolkata")
now = datetime.now(ist)
hour = now.hour

# =========================
# TASK 1: Grouped Bar Chart
# =========================
if 15 <= hour < 17:  # 3PM–5PM IST
    task1 = df.copy()
    task1 = task1[(task1['Rating'] >= 4.0) &
                  (task1['Size'] >= 10) &
                  (task1['Last Updated'].dt.month == 1)]

    top10 = (task1.groupby("Category")
                   .agg({"Rating": "mean", "Reviews": "sum", "Installs": "sum"})
                   .sort_values("Installs", ascending=False)
                   .head(10)
                   .reset_index())

    fig1 = go.Figure(data=[
        go.Bar(name="Avg Rating", x=top10["Category"], y=top10["Rating"]),
        go.Bar(name="Total Reviews", x=top10["Category"], y=top10["Reviews"])
    ])
    fig1.update_layout(barmode="group", title="Top 10 Categories: Rating & Reviews")
    fig1.show()

# =========================
# TASK 2: Choropleth Map
# =========================
if 18 <= hour < 20:  # 6PM–8PM IST
    task2 = df.copy()
    task2 = task2[~task2["Category"].str.startswith(("A", "C", "G", "S"))]
    top5 = (task2.groupby("Category")["Installs"]
                  .sum()
                  .sort_values(ascending=False)
                  .head(5)
                  .reset_index())

    # ✅ If Country column exists → choropleth, else → bar chart
    if "Country" in df.columns:
        task2 = task2[task2["Category"].isin(top5["Category"])]
        fig2 = px.choropleth(task2, locations="Country", color="Category",
                             hover_name="Category", animation_frame=None,
                             title="Global Installs by Category (Top 5)")
    else:
        fig2 = px.bar(top5, x="Category", y="Installs",
                      title="Top 5 Categories by Installs (No Country data)")
    fig2.show()

# =========================
# TASK 3: Dual-Axis Chart
# =========================
if 13 <= hour < 14:  # 1PM–2PM IST
    task3 = df.copy()
    task3 = task3[
        (task3["Installs"] >= 10000) &
        (task3["Revenue"] >= 10000) &
        (task3["Android Ver"].astype(str).str.extract(r"(\d+\.\d+)").astype(float) > 4.0).values &
        (task3["Size"] >= 15) &
        (task3["Content Rating"] == "Everyone") &
        (task3["App"].str.len() <= 30)
    ]

    top3 = task3.groupby("Category")["Installs"].sum().nlargest(3).index
    task3 = task3[task3["Category"].isin(top3)]

    grouped = task3.groupby(["Category", "Type"]).agg({
        "Installs": "mean",
        "Revenue": "mean"
    }).reset_index()

    fig3 = make_subplots(specs=[[{"secondary_y": True}]])
    for t in ["Free", "Paid"]:
        sub = grouped[grouped["Type"] == t]
        fig3.add_trace(go.Bar(x=sub["Category"], y=sub["Installs"], name=f"{t} Installs"), secondary_y=False)
        fig3.add_trace(go.Scatter(x=sub["Category"], y=sub["Revenue"], name=f"{t} Revenue"), secondary_y=True)
    fig3.update_layout(title="Free vs Paid Apps: Installs & Revenue")
    fig3.show()

# =========================
# TASK 4: Time Series Line Chart
# =========================
if 18 <= hour < 21:  # 6PM–9PM IST
    task4 = df.copy()
    task4 = task4[
        (~task4["App"].str.lower().str.startswith(("x", "y", "z"))) &
        (task4["Category"].str.startswith(("E", "C", "B"))) &
        (task4["Reviews"] > 500) &
        (~task4["App"].str.contains("S"))
    ]

    # ✅ Translations
    translations = {"Beauty": "सौंदर्य", "Business": "வணிகம்", "Dating": "Dating (Deutsch)"}
    task4["Category"] = task4["Category"].replace(translations)

    trend = (task4.groupby([pd.Grouper(key="Last Updated", freq="M"), "Category"])["Installs"]
                  .sum()
                  .reset_index())

    fig4 = px.line(trend, x="Last Updated", y="Installs", color="Category",
                   title="Monthly Installs Trend by Category")
    fig4.show()

# =========================
# TASK 5: Bubble Chart
# =========================
if 17 <= hour < 19:  # 5PM–7PM IST
    task5 = df.copy()
    allowed = ["Game", "Beauty", "Business", "Comics", "Communication", "Dating", "Entertainment", "Social", "Events"]
    task5 = task5[
        (task5["Rating"] > 3.5) &
        (task5["Category"].isin(allowed)) &
        (task5["Reviews"] > 500) &
        (~task5["App"].str.contains("S")) &
        (task5["Sentiment_Subjectivity"] > 0.5) &
        (task5["Installs"] > 50000)
    ]

    # ✅ Translations
    translations = {"Beauty": "सौंदर्य", "Business": "வணிகம்", "Dating": "Dating (Deutsch)"}
    task5["Category"] = task5["Category"].replace(translations)

    fig5 = px.scatter(task5, x="Size", y="Rating", size="Installs",
                      color="Category", title="Bubble Chart: Size vs Rating vs Installs")

    # Highlight Game category bubbles in pink
    fig5.update_traces(marker=dict(color="pink"), selector=dict(mode="markers", legendgroup="Game"))
    fig5.show()
