In [56]:
# Import Required Libraries

# Libraries for Data Analytics
import pandas as pd
import numpy as np

# Libraries for Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import plotly.figure_factory as ff

import warnings
warnings.filterwarnings('ignore')

# Libraries for Machine Learning (ML), Deep Learning (DL) and Natural Language Processing (NLP)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
import tensorflow as tf
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Other Libraries
import webbrowser
import os
from datetime import datetime
from datetime import timedelta
import pytz

# Downloading the Required NLP Package
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to C:\Users\Anirban
[nltk_data]     Majumder\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [57]:
# Load the datasets
ps_data = pd.read_csv('Play Store Data.csv')
reviews_df = pd.read_csv('User Reviews.csv')

In [58]:
# Visualize the datasets
ps_data.head(20)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5.6M,"50,000+",Free,0,Everyone,Art & Design,"March 26, 2017",1.0,2.3 and up
6,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,19M,"50,000+",Free,0,Everyone,Art & Design,"April 26, 2018",1.1,4.0.3 and up
7,Infinite Painter,ART_AND_DESIGN,4.1,36815,29M,"1,000,000+",Free,0,Everyone,Art & Design,"June 14, 2018",6.1.61.1,4.2 and up
8,Garden Coloring Book,ART_AND_DESIGN,4.4,13791,33M,"1,000,000+",Free,0,Everyone,Art & Design,"September 20, 2017",2.9.2,3.0 and up
9,Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,3.1M,"10,000+",Free,0,Everyone,Art & Design;Creativity,"July 3, 2018",2.8,4.0.3 and up


In [59]:
reviews_df.head(20)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3
5,10 Best Foods for You,Best way,Positive,1.0,0.3
6,10 Best Foods for You,Amazing,Positive,0.6,0.9
7,10 Best Foods for You,,,,
8,10 Best Foods for You,"Looking forward app,",Neutral,0.0,0.0
9,10 Best Foods for You,It helpful site ! It help foods get !,Neutral,0.0,0.0


In [60]:
# Check the Datatypes of the Datasets
ps_data.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [61]:
reviews_df.dtypes

App                        object
Translated_Review          object
Sentiment                  object
Sentiment_Polarity        float64
Sentiment_Subjectivity    float64
dtype: object

In [62]:
# Data Cleaning

ps_data = ps_data.dropna(subset=['Rating'])
for column in ps_data.columns:
    ps_data[column].fillna(ps_data[column].mode()[0], inplace = True)
ps_data.drop_duplicates(inplace=True)
ps_data=ps_data[ps_data['Rating']<=5]
reviews_df.dropna(subset=['Translated_Review'],inplace=True)

In [63]:
# Convert the 'Installs' column to numeric
ps_data['Installs'] = ps_data['Installs'].str.replace(',','').str.replace('+','').astype(int)

# Convert the 'Price' column to numeric by removing the currency sign
ps_data['Price'] = ps_data['Price'].str.replace('$','').astype(float)

# Convert the 'Size' column from object to numeric, by removing the size unit
def modify_size(size):
    if 'M' in size:
        return float(size.replace('M', ''))
    elif 'k' in size:
        return float(size.replace('k', '')) / 1024
    else:
        return np.nan

ps_data['Size'] = ps_data['Size'].apply(modify_size)

# Preprocess other columns

# Convert the 'Reviews' column to a numeric column
ps_data['Reviews'] = ps_data['Reviews'].astype(int)

# Loagrithmic operations
ps_data['Log_Installs'] = np.log(ps_data['Installs'])
ps_data['Log_Reviews'] = np.log(ps_data['Reviews'])

# Create Rating Group
def rating_group(rating):
    if rating >= 4:
        return 'Top Rated App'
    elif rating >= 3:
        return 'Above Average App'
    elif rating >= 2:
        return 'Average App'
    else:
        return 'Below Average App'

ps_data['Rating Group'] = ps_data['Rating'].apply(rating_group)

# Revenue Column
ps_data['Revenue'] = ps_data['Installs'] * ps_data['Price']

# Imputing the Null Values
numerical_cols = ps_data.select_dtypes(include=['number']).columns
categorical_cols = ps_data.select_dtypes(include=['object']).columns

# Impute numerical columns with median
ps_data[numerical_cols] = ps_data[numerical_cols].fillna(ps_data[numerical_cols].median())

# Impute categorical columns with mode
ps_data[categorical_cols] = ps_data[categorical_cols].apply(lambda x: x.fillna(x.mode()[0]))

In [64]:
# Checking the Datatypes after Data Cleaning for Play Store Dataset
ps_data.dtypes

App                object
Category           object
Rating            float64
Reviews             int32
Size              float64
Installs            int32
Type               object
Price             float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
Log_Installs      float64
Log_Reviews       float64
Rating Group       object
Revenue           float64
dtype: object

In [65]:
# Merge both the Datasets
df = pd.merge(ps_data, reviews_df, on='App', how='inner')

# Visualize the Dataset
df.head(20)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,...,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating Group,Revenue,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,2.0.0,4.0.3 and up,13.122363,6.874198,Above Average App,0.0,A kid's excessive ads. The types ads allowed a...,Negative,-0.25,1.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,2.0.0,4.0.3 and up,13.122363,6.874198,Above Average App,0.0,It bad >:(,Negative,-0.725,0.833333
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,2.0.0,4.0.3 and up,13.122363,6.874198,Above Average App,0.0,like,Neutral,0.0,0.0
3,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,2.0.0,4.0.3 and up,13.122363,6.874198,Above Average App,0.0,I love colors inspyering,Positive,0.5,0.6
4,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,2.0.0,4.0.3 and up,13.122363,6.874198,Above Average App,0.0,I hate,Negative,-0.8,0.9
5,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,2.0.0,4.0.3 and up,13.122363,6.874198,Above Average App,0.0,omgggggg,Neutral,0.0,0.0
6,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,2.0.0,4.0.3 and up,13.122363,6.874198,Above Average App,0.0,It cute.,Positive,0.5,1.0
7,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,2.0.0,4.0.3 and up,13.122363,6.874198,Above Average App,0.0,I love,Positive,0.5,0.6
8,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,2.0.0,4.0.3 and up,13.122363,6.874198,Above Average App,0.0,Love,Positive,0.5,0.6
9,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,2.0.0,4.0.3 and up,13.122363,6.874198,Above Average App,0.0,I love enjoyable fun,Positive,0.433333,0.466667


In [66]:
# Sentiment Analysis
sia = SentimentIntensityAnalyzer()

In [67]:
# Sentiment Analysis Score
reviews_df['Sentiment_Score'] = reviews_df['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])
reviews_df.head(20)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,0.9531
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,0.6597
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,0.6249
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3,0.6369
5,10 Best Foods for You,Best way,Positive,1.0,0.3,0.6369
6,10 Best Foods for You,Amazing,Positive,0.6,0.9,0.5859
8,10 Best Foods for You,"Looking forward app,",Neutral,0.0,0.0,0.0
9,10 Best Foods for You,It helpful site ! It help foods get !,Neutral,0.0,0.0,0.7256
10,10 Best Foods for You,good you.,Positive,0.7,0.6,0.4404
11,10 Best Foods for You,Useful information The amount spelling errors ...,Positive,0.2,0.1,0.4404


In [68]:
# Saving the Plotly Graphs as HTML Files

# Define the desired directory path
directory_path = r"C:\Users\Anirban Majumder\OneDrive - RICE Group\Desktop\Programming & Skills\GooglePlayStore_Analytics\HTML_Files"

# Check if the directory already exists
if not os.path.exists(directory_path):
    # Create the directory
    os.makedirs(directory_path)

# Save each Plotly figure as an HTML file
plot_container = ""

def save_plot_as_html (fig, filename, insight):
    global plot_container
    filepath = os.path.join(directory_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    
    # Append the plot and its insights to the 'plot_container'
    plot_container += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    
    fig.write_html (filepath, full_html=False, include_plotlyjs='inline')

In [69]:
# Plot General Information

plot_width=400
plot_height=300
plot_bg_color='black'
text_color='white'
title_font={'size':16}
axis_font={'size':12}

In [70]:
#Graph 1

category_counts = ps_data['Category'].value_counts().nlargest(10)

# Creating Bar Graph
fig1=px.bar(
    x=category_counts.index,
    y=category_counts.values,
    labels={'x':'Category','y':'Count'},
    title='Top Categories on Play Store',
    color=category_counts.index,
    color_discrete_sequence=px.colors.sequential.Plasma,
    width=plot_width,
    height=plot_height
)

# Adjusting Layout for Better Visualization
fig1.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10,r=10,t=30,b=10)
)

# Show the Plot
fig1.show()

#Save the plot as an HTML file
save_plot_as_html(fig1,"Category Graph 1.html","The top categories on the Play Store are dominated by tools, entertainment, and productivity apps")

In [71]:
# Graph 2

type_counts = ps_data['Type'].value_counts()

# Create the Pie Chart
fig2=px.pie(
    values=type_counts.values,
    names=type_counts.index,
    title='App Type Distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
    width=plot_width,
    height=plot_height
)

# Adjust the Layout for Better Visualization
fig2.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    margin=dict(l=10,r=10,t=30,b=10)
)

# Show the plot
fig2.show()

# Save the plot as an HTML file
save_plot_as_html(fig2,"Type Graph 2.html","Most apps on the Playstore are free, indicating a strategy to attract users first and monetize through ads or in app purchases")

In [72]:
# Graph 3

# Creating a Histogram
fig3=px.histogram(
    ps_data,
    x='Rating',
    nbins=20,
    title='Rating Distribution',
    color_discrete_sequence=['#636EFA'],
    width=plot_width,
    height=plot_height
)

# Adjusting the Layout for a Better Visualization
fig3.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10,r=10,t=30,b=10)
)

# Display the plot
fig3.show()

# Save the plot as an HTML file
save_plot_as_html(fig3,"Rating Graph 3.html","Ratings are skewed towards higher values, suggesting that most apps are rated favorably by users")

In [73]:
# Graph 4

sentiment_counts=reviews_df['Sentiment_Score'].value_counts()

# Creating a Bar Plot
fig4=px.bar(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    labels={'x':'Sentiment Score','y':'Count'},
    title='Sentiment Distribution',
    color=sentiment_counts.index,
    color_discrete_sequence=px.colors.sequential.RdPu,
    width=plot_width,
    height=plot_height,
)

# Adjusting the Layout for a Better Visualization
fig4.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10,r=10,t=30,b=10)
)

# Show the Plot
fig4.show()

# Save the Plot as an HTML file
save_plot_as_html(fig4,"Sentiment Graph 4.html","Sentiments in reviews show a mix of positive and negative feedback, with a slight lean towards positive sentiments")

In [74]:
# Graph 5

installs_by_category = ps_data.groupby('Category')['Installs'].sum().nlargest(10)

# Creating a Bar Chart
fig5 = px.bar(
    x=installs_by_category.values,
    y=installs_by_category.index,
    orientation='h',
    labels={'x': 'Installs', 'y': 'Category'},
    title='Installs by Category',
    color=installs_by_category.index,
    color_discrete_sequence=px.colors.sequential.Blues,
    width=plot_width,
    height=plot_height
)

# Adjusting the Layout for a Better Visualization
fig5.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)

# Display the Plot
fig5.show()

# Save the Plot as an HTML File
fig5.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig5, "installs_by_category.html", "The categories with the most installs are social and communication apps, which reflects their broad appeal and daily usage.")

In [75]:
# Graph 6

# Converting to DateTime Format
ps_data['Last Updated'] = pd.to_datetime(ps_data['Last Updated'], errors='coerce')
ps_data['Year'] = ps_data['Last Updated'].dt.year

updates_per_year = ps_data['Last Updated'].dt.year.value_counts().sort_index()

# Creating a Line Chart
fig6 = px.line(
    x=updates_per_year.index,
    y=updates_per_year.values,
    labels={'x': 'Year', 'y': 'Number of Updates'},
    title='Number of Updates Over the Years',
    color_discrete_sequence=['#AB63FA'],
    width=plot_width,
    height=plot_height
)

# Adjusting the Layout for a Better Visualization
fig6.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)

# Display the Plot
fig6.show()

# Save the Plot as an HTML File
save_plot_as_html(fig6, "updates_per_year.html", "Updates have been increasing over the years, showing that developers are actively maintaining and improving their apps.")

In [77]:
# Graph 7

revenue_by_category = ps_data.groupby('Category')['Revenue'].sum().nlargest(10)

# Creating a Bar Chart
fig7 = px.bar(
    x=revenue_by_category.index,
    y=revenue_by_category.values,
    labels={'x': 'Category', 'y': 'Revenue'},
    title='Revenue by Category',
    color=revenue_by_category.index,
    color_discrete_sequence=px.colors.sequential.Greens,
    width=plot_width,
    height=plot_height
)

# Adjusting the Layout for a Better Visualization
fig7.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)

# Show the Plot
fig7.show()

# Save the Plot as an HTML File
fig7.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig7, "revenue_by_category.html", "Categories such as Business and Productivity lead in revenue generation, indicating their monetization potential.")

In [78]:
# Graph 8

genre_counts = ps_data['Genres'].str.split(';', expand=True).stack().value_counts().nlargest(10)

# Creating a Bar Chart
fig8 = px.bar(
    x=genre_counts.index,
    y=genre_counts.values,
    labels={'x': 'Genre', 'y': 'Count'},
    title='Top Genres',
    color=genre_counts.index,
    color_discrete_sequence=px.colors.sequential.OrRd,
    width=plot_width,
    height=plot_height
)

# Adjusting the Layout for a Better Visualization
fig8.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)

# Show the Plot
fig8.show()

# Save the Plot as an HTML File
fig8.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig8, "genres_counts.html", "Action and Casual genres are the most common, reflecting users' preference for engaging and easy-to-play games.")

In [79]:
# Graph 9

# Creating a Scatter Plot
fig9 = px.scatter(
    ps_data,
    x='Last Updated',
    y='Rating',
    color='Type',
    title='Impact of Last Update on Rating',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    width=plot_width,
    height=plot_height
)

# Adjusting the Layout for a Better Visualization
fig9.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)

# Show the Plot
fig9.show()

# Save the Plot as an HTML File
save_plot_as_html(fig9, "update_on_rating.html", "The scatter plot shows a weak correlation between the last update date and ratings, suggesting that more frequent updates don't always result in better ratings.")

In [80]:
# Graph 10

# Creating a Box Plot
fig10 = px.box(
    ps_data,
    x='Type',
    y='Rating',
    color='Type',
    title='Ratings for Paid vs Free Apps',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    width=plot_width,
    height=plot_height
)

# Adjust the Layout for a Better Visualization
fig10.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)

# Show the Plot
fig10.show()

# Save the Plot as an HTML File
save_plot_as_html(fig10, "ratings_paid_free.html", "Paid apps generally have higher ratings compared to free apps, suggesting that users expect higher quality from apps they pay for.")

In [81]:
ps_data.to_csv('ps_data.csv')

In [82]:
ps_data.dtypes

App                       object
Category                  object
Rating                   float64
Reviews                    int32
Size                     float64
Installs                   int32
Type                      object
Price                    float64
Content Rating            object
Genres                    object
Last Updated      datetime64[ns]
Current Ver               object
Android Ver               object
Log_Installs             float64
Log_Reviews              float64
Rating Group              object
Revenue                  float64
Year                       int32
dtype: object

In [83]:
ps_data.value_counts()

App                                               Category            Rating  Reviews  Size  Installs  Type  Price  Content Rating  Genres                       Last Updated  Current Ver  Android Ver   Log_Installs  Log_Reviews  Rating Group   Revenue  Year
+Download 4 Instagram Twitter                     SOCIAL              4.5     40467    22.0  1000000   Free  0.00   Everyone        Social                       2018-08-02    5.03         4.1 and up    13.815511     10.608242    Top Rated App  0.0      2018    1
Nightenfell: Shared AR                            GAME                4.5     20       65.0  1000      Paid  0.99   Everyone 10+    Action                       2018-03-27    1.0.0        7.0 and up    6.907755      2.995732     Top Rated App  990.0    2018    1
Nick                                              FAMILY              4.2     123322   25.0  10000000  Free  0.00   Everyone 10+    Entertainment;Music & Video  2018-01-24    2.0.8        4.4 and up    16.118096     

In [84]:
# Task 1:

"""
    Create a Scatter Plot to visualize the relationship between revenue and the number of installs for paid apps only.
    Add a trendline to show the correlation and the color code the points based on app categories.
"""

# Filter Paid Apps from the Dataset
paid_apps = df[df['Type'] == 'Paid'].copy()

# Calculate the Revenue generated
paid_apps['Revenue'] = paid_apps['Installs'] * paid_apps['Price']

# Fit a Trendline
if len(paid_apps) > 1:
    trend = np.polyfit(paid_apps['Installs'], paid_apps['Revenue'], 1)
    line = np.poly1d(trend)
    paid_apps['Trendline'] = line(paid_apps['Installs'])
else:
    paid_apps['Trendline'] = np.nan 

# Creating the Scatter Plot
figTask1 = px.scatter(
    paid_apps,
    x = 'Installs',
    y = 'Revenue',
    color = 'Category',
    title = 'Relationship between Revenue and Installs',
    labels = {'Installs' : 'Number of Installs', 'Revenue' : 'Revenue'},
    hover_data = ['App'],
    opacity = 0.7,
    color_discrete_sequence = px.colors.qualitative.Pastel
)

# Adding the Trendline
if 'Trendline' in paid_apps.columns and not paid_apps['Trendline'].isna().all():
    figTask1.add_scatter(
        x=paid_apps['Installs'],
        y=paid_apps['Trendline'],
        mode='lines',
        name='Trendline',
        line=dict(color='red', dash='dash')
    )

# Update Layout
figTask1.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    height = plot_height,
    width = plot_width,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)

# Show the Plot
figTask1.show()

# Save the Plot as an HTML File
save_plot_as_html(figTask1, "scatter & trendline.html", "Revenue generated by Apps of Sports category per installs is the highest and Personalization has the highest cumulative revenue as well as the highest number of installation.")

In [None]:
# Task 2:
"""
    Create a dual-axis chart comparing the average installs and revenue for free vs paid apps within the top 3 categories.
    Apply filters to exclude apps with fewer than 10,000 installs and revenue below $10,000 and 
        Android Version should be more than 4.0 as well as size should be more than 15M and
        content rating should be Everyone and app name should not have more than 30 characters including space and special characters.
    This graph should work only between 1PM IST to 2PM IST apart from that time we should not show the graph in the dashboard itself.
"""

# Load the dataset
file_path = "C:/Users/Anirban Majumder/OneDrive - RICE Group/Desktop/Programming & Skills/GooglePlayStore_Analytics/ps_data.csv"
df = pd.read_csv(file_path)

# Data Cleaning and Filtering
df = df.dropna(subset=['Installs', 'Revenue', 'Android Ver', 'Size', 'Content Rating', 'App', 'Category', 'Type'])

# Convert Installs & Revenue to numeric values
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')
df['Revenue'] = pd.to_numeric(df['Revenue'], errors='coerce')

# Filter based on conditions
df = df[(df['Installs'] >= 10000) & (df['Revenue'] >= 10000) & (df['Content Rating'] == 'Everyone') & (df['App'].str.len() <= 30)]

# Convert Android Version to numeric (extracting only major version)
df['Android Ver'] = df['Android Ver'].astype(str).str.extract(r'(\d+\.\d+)').astype(float)
df = df[df['Android Ver'] > 4.0]

# Convert Size to numeric (removing 'M' and converting to float)
df['Size'] = df['Size'].astype(str).str.replace('M', '', regex=True)
df['Size'] = pd.to_numeric(df['Size'], errors='coerce')
df = df[df['Size'] > 15]

# Get top 3 categories based on total installs
top_categories = df.groupby('Category')['Installs'].sum().nlargest(3).index
df_top = df[df['Category'].isin(top_categories)]

# Aggregate data
grouped_df = df_top.groupby(['Category', 'Type']).agg(avg_installs=('Installs', 'mean'), avg_revenue=('Revenue', 'mean')).reset_index()

# Get the current time in IST
ist = pytz.timezone('Asia/Kolkata')
current_time = datetime.now(ist).time()

# Define allowed time range (1 PM - 2 PM IST)
start_time = datetime.strptime("13:00:00", "%H:%M:%S").time()
end_time = datetime.strptime("14:00:00", "%H:%M:%S").time()

# Check if current time is within the allowed range
if start_time <= current_time <= end_time:
    
    # Create a Plotly Dual-Axis Chart
    fig2Task = go.Figure()

    # Bar Chart for Revenue
    for category in top_categories:
        cat_data = grouped_df[grouped_df["Category"] == category]
        fig2Task.add_trace(go.Bar(
            x=cat_data["Type"], 
            y=cat_data["avg_revenue"], 
            name=f'Average Revenue - {category}', 
            marker_color='blue', 
            yaxis='y1'
        ))

    # Line Chart for Install
    for category in top_categories:
        cat_data = grouped_df[grouped_df["Category"] == category]
        fig2Task.add_trace(go.Scatter(
            x=cat_data["Type"],
            y=cat_data["avg_installs"],
            name=f'Average Install - {category}',
            mode='lines+markers',
            marker=dict(size=8, symbol='circle', color='red'),
            yaxis='y2'
        ))

    # Layout Configuration
    fig2Task.update_layout(
        title="Installs & Revenue for Free vs. Paid Apps in Top 3 Categories",
        plot_bgcolor=plot_bg_color,
        paper_bgcolor=plot_bg_color,
        font_color=text_color,
        title_font=title_font,
        height = plot_height,
        width = plot_width,
        xaxis=dict(title_font=axis_font, title='App Type'),
        yaxis=dict(title_font=axis_font, title = 'Average Installs', side = 'left'),
        yaxis2=dict(title_font=axis_font, title = 'Average Revenue', side = 'right'),
        margin=dict(l=10, r=10, t=30, b=10)
        )

    # Show the figure
    fig2Task.show()
    
else:
    print("Graph not available at this time! It will be available only between 1 PM IST to 2 PM IST.")

# Save the Plot as an HTML File
save_plot_as_html(fig2Task, "Install.html", "Photography apps are the most installed.")

In [None]:
# Task 3:

"""
    Use a grouped bar chart to compare the average rating and total review count for the top 10 app categories by number of installs.
    Filter out any categories where the average rating is below 4.0 and size below 10M and last update should be Jan month.
    This graph should work only between 3PM IST to 5PM IST, apart from that time, we should not show this graph in dashboard itself.
"""

# Load the dataset
file_path = "C:/Users/Anirban Majumder/OneDrive - RICE Group/Desktop/Programming & Skills/GooglePlayStore_Analytics/ps_data.csv"
df = pd.read_csv(file_path)

# Data Cleaning and Filtering
df = df.dropna(subset=['Installs', 'Reviews', 'Rating', 'Size', 'Last Updated', 'Category'])

# Convert Installs, Reviews, and Rating to numeric
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')
df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Filter based on conditions
df = df[(df['Rating'] >= 4.0) & (df['Size'].astype(str).str.replace('M', '', regex=True).astype(float) > 10)]

# Convert 'Last Updated' to datetime and filter for January month
df['Last Updated'] = pd.to_datetime(df['Last Updated'], errors='coerce')
df = df[df['Last Updated'].dt.month == 1]

# Get top 10 categories by total installs
top_categories = df.groupby('Category')['Installs'].sum().nlargest(10).index
df_top = df[df['Category'].isin(top_categories)]

# Aggregate data
grouped_df = df_top.groupby('Category').agg(avg_rating=('Rating', 'mean'), total_reviews=('Reviews', 'sum')).reset_index()

# Get the current time in IST
ist = pytz.timezone('Asia/Kolkata')
current_time = datetime.now(ist).time()

# Define allowed time range (3 PM - 5 PM IST)
start_time = datetime.strptime("15:00:00", "%H:%M:%S").time()
end_time = datetime.strptime("17:00:00", "%H:%M:%S").time()

# Initialize fig3Task as None
fig3Task = None

# Check if current time is within the allowed range
if start_time <= current_time <= end_time:
    
    # Create a grouped bar chart using Plotly
    fig3Task = go.Figure()

    # Plot for Ratings
    fig3Task.add_trace(go.Bar(x=grouped_df["Category"], y=grouped_df["avg_rating"], name="Avg Rating", marker_color="blue", yaxis="y1"))

    # Plot for Reviews
    fig3Task.add_trace(go.Bar(x=grouped_df["Category"], y=grouped_df["total_reviews"], name="Total Reviews", marker_color="red", yaxis="y2"))

    # Layout Configuration
    fig3Task.update_layout(
        title="Average Rating & Total Reviews for Top 10 App Categories",
        plot_bgcolor=plot_bg_color,
        paper_bgcolor=plot_bg_color,
        font_color=text_color,
        title_font=title_font,
        height=plot_height,
        width=plot_width,
        xaxis=dict(title_font=axis_font, title='App Category'),
        yaxis=dict(title_font=axis_font, title='Average Rating', side='left'),
        yaxis2=dict(title_font=axis_font, title='Total Reviews', side='right', overlaying='y', showgrid=False),
        margin=dict(l=10, r=10, t=30, b=10)
    )

    # Show the figure
    fig3Task.show()

    # Save the Plot as an HTML File
    def save_plot_as_html(fig, filename, description):
        fig.write_html(filename)
        print(f"Graph saved as {filename}. {description}")

    save_plot_as_html(fig3Task, "average_rating_total_reviews.html", "Personalization category has the highest ratings while Family category has the highest reviews.")
    
else:
    print("Graph not available at this time! It will be available only between 3 PM IST to 5 PM IST.")

Graph saved as average_rating_total_reviews.html. Personalization category has the highest ratings while Family category has the highest reviews.


In [None]:
# Task 4:

"""
    Create an interactive Choropleth map using Plotly to visualize global installs by Category.
    Apply filters to show data for only the top 5 app categories and highlight category where the number of installs exceeds 1 million.
    The app category should not start with the characters “A,” “C,” “G,” or “S.”
    This graph should work only between 6 PM IST and 8 PM IST; apart from that time, we should not show it in the dashboard itself.
"""

# Load the dataset
file_path = "C:/Users/Anirban Majumder/OneDrive - RICE Group/Desktop/Programming & Skills/GooglePlayStore_Analytics/ps_data.csv"
df = pd.read_csv(file_path)

# Data Cleaning and Filtering
df = df.dropna(subset=['Category', 'Installs'])

# Convert Installs to numeric
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')

# Filter categories that do not start with A, C, G, or S
df = df[~df["Category"].str.startswith(('A', 'C', 'G', 'S'))]

# Select top 5 categories based on total installs
top_categories = df.groupby('Category')['Installs'].sum().nlargest(5).index
df = df[df['Category'].isin(top_categories)]

# Highlight installs > 1 million
df["Highlight"] = df["Installs"].apply(lambda x: "Above 1M" if x > 1000000 else "Below 1M")

# Get current IST time
ist = pytz.timezone('Asia/Kolkata')
current_time = datetime.now(ist).time()

# Define allowed time between 6 PM and 8 PM IST
allowed_start = datetime.strptime("18:00", "%H:%M").time()
allowed_end = datetime.strptime("20:00", "%H:%M").time()

# Display map only if within the allowed time range
if allowed_start <= current_time <= allowed_end:
    fig4Task = px.treemap(
        df,
        path=["Category"],
        values="Installs",
        color="Highlight",
        hover_data=["Installs"],
        title="Top 5 App Categories by Global Installs (Filtered)"
    )
    fig4Task.show()
else:
    print("The Treemap is only available between 6 PM IST and 8 PM IST.")

# Save the Plot as an HTML File
    def save_plot_as_html(fig, filename, description):
        fig4Task.write_html(filename)
        print(f"Graph saved as {filename}. {description}")

    save_plot_as_html(fig4Task, "AppCategories.html", "Productivity category has the highest install.")

In [None]:
# Task 5:

"""
    Generate a heatmap to show the correlation matrix between installs, ratings, and review counts.
    Filter the data to include only apps that have been updated within the last year and have at least 100,000 installs and
        reviews count should be more than 1k and genres name should not be Starting with characters A , F , E , G , I , K .
    This graph should work only between 2 PM IST to 4 PM IST apart from that time we should not show this graph in dashboard itself.
"""

# Load dataset
file_path = "C:/Users/Anirban Majumder/OneDrive - RICE Group/Desktop/Programming & Skills/GooglePlayStore_Analytics/ps_data.csv"
df = pd.read_csv(file_path)

# Convert 'Last Updated' to datetime format
df['Last Updated'] = pd.to_datetime(df['Last Updated'], errors='coerce')

# Get the current date and filter for apps updated within the last year
one_year_ago = datetime.now() - timedelta(days=365)
df = df[df['Last Updated'] >= one_year_ago]

# Convert necessary columns to numeric
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')
df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Apply filters
filtered_df = df[(df['Installs'] >= 100000) & (df['Reviews'] > 1000) & (~df['Genres'].str.startswith(('A', 'F', 'E', 'G', 'I', 'K')))]

# Select relevant columns for correlation analysis
corr_df = filtered_df[['Installs', 'Rating', 'Reviews']].dropna()
correlation_matrix = corr_df.corr()

# Get current IST time
ist = pytz.timezone('Asia/Kolkata')
current_time = datetime.now(ist).time()
allowed_start = datetime.strptime("14:00", "%H:%M").time()
allowed_end = datetime.strptime("16:00", "%H:%M").time()

# Display heatmap only if within the allowed time range
if allowed_start <= current_time <= allowed_end:
    fig5Task = ff.create_annotated_heatmap(
        z=correlation_matrix.values,
        x=correlation_matrix.columns.tolist(),
        y=correlation_matrix.index.tolist(),
        colorscale='Viridis'
    )
    fig5Task.update_layout(title="Correlation Heatmap: Installs, Ratings, and Reviews")
    fig5Task.show()
else:
    print("The heatmap is only available between 2 PM IST and 4 PM IST.")

# Save the Plot as an HTML File
    def save_plot_as_html(fig, filename, description):
        fig5Task.write_html(filename)
        print(f"Graph saved as {filename}. {description}")

    save_plot_as_html(fig5Task, "Heatmap.html", "Productivity category has the highest install.")

In [None]:
# Task 6:

"""
    Create a violin plot to visualize the distribution of ratings for each app category,
        but only include categories with more than 50 apps and
        app name should contain letter “C” and exclude apps with fewer than 10 reviews and rating should be less 4.0.
    This graph should work only between 4 PM IST to 6 PM IST apart from that time we should not show this graph in dashboard itself.
"""

# Load dataset
file_path = "C:/Users/Anirban Majumder/OneDrive - RICE Group/Desktop/Programming & Skills/GooglePlayStore_Analytics/ps_data.csv"
df = pd.read_csv(file_path)

# Data Cleaning
# Convert 'Last Updated' to datetime format
df['Last Updated'] = pd.to_datetime(df['Last Updated'], errors='coerce')

# Convert necessary columns to numeric
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')
df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Apply filters
filtered_df = df[(df['Reviews'] >= 10) & (df['Rating'] < 4.0) & (df['App'].str.contains('C', na=False, case=False))]

# Get categories with more than 50 apps
category_counts = filtered_df['Category'].value_counts()
valid_categories = category_counts[category_counts > 50].index
filtered_df = filtered_df[filtered_df['Category'].isin(valid_categories)]

# Get current IST time
ist = pytz.timezone('Asia/Kolkata')
current_time = datetime.now(ist).time()
allowed_start = datetime.strptime("14:00", "%H:%M").time()
allowed_end = datetime.strptime("18:00", "%H:%M").time()

# Display violin plot only if within the allowed time range
if allowed_start <= current_time <= allowed_end:
    fig6Task = px.violin(
        filtered_df, 
        x='Category', 
        y='Rating', 
        box=True, 
        points="all",
        title="Distribution of Ratings by App Category",
        color='Category')
    fig6Task.show()
else:
    print("The violin plot is only available between 4 PM IST and 6 PM IST.")

# Save the Plot as an HTML File
    def save_plot_as_html(fig, filename, description):
        fig6Task.write_html(filename)
        print(f"Graph saved as {filename}. {description}")

    save_plot_as_html(fig6Task, "Ratings Distribution.html", "Games category has the highest ratings.")

In [None]:
# Task 7:

"""
    Plot a time series line chart to show the trend of total installs over time, segmented by app category.
    Highlight periods of significant growth by shading the areas under the curve where the increase in installs exceeds 20% month-over-month and
        content rating should be teen and app name should start with letter ‘E’ and installs should be more than 10k as well as 
        this graph should work only between 6 PM IST to 9 PM IST apart from that time we should not show this graph in dashboard itself.
"""

# Load dataset
file_path = "C:/Users/Anirban Majumder/OneDrive - RICE Group/Desktop/Programming & Skills/GooglePlayStore_Analytics/ps_data.csv"
df = pd.read_csv(file_path)

# Data Cleaning
# Convert 'Last Updated' to datetime format
df['Last Updated'] = pd.to_datetime(df['Last Updated'], errors='coerce')

# Convert necessary columns to numeric
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')
df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Apply filters
filtered_df = df[(df['Installs'] > 10000) & (df['Content Rating'].str.lower() == 'teen') & (df['App'].str.startswith('E', na=False))]

# Aggregate installs over time by category
df_grouped = filtered_df.groupby([df['Last Updated'].dt.to_period("M"), 'Category'])['Installs'].sum().reset_index()
df_grouped['Last Updated'] = df_grouped['Last Updated'].astype(str)

# Calculate month-over-month growth
df_grouped['Install Growth'] = df_grouped.groupby('Category')['Installs'].pct_change() * 100

# Get current IST time
ist = pytz.timezone('Asia/Kolkata')
current_time = datetime.now(ist).time()
allowed_start = datetime.strptime("18:00", "%H:%M").time()
allowed_end = datetime.strptime("21:00", "%H:%M").time()

# Display time series plot only if within the allowed time range
if allowed_start <= current_time <= allowed_end:
    fig7Task = px.line(
        df_grouped, 
        x='Last Updated', 
        y='Installs', 
        color='Category', 
        title="Total Installs Trend by Category",
        labels={'Installs': 'Total Installs', 'Last Updated': 'Month'})
    
    # Highlight significant growth
    df_grouped['Highlight'] = df_grouped['Install Growth'] > 20
    fig7Task.add_traces(
        px.area(df_grouped[df_grouped['Highlight']], x='Last Updated', y='Installs', color='Category').data
    )
    
    fig7Task.show()

else:
    print("The time series chart is only available between 6 PM IST and 9 PM IST.")

# Save the Plot as an HTML File
    def save_plot_as_html(fig, filename, description):
        fig7Task.write_html(filename)
        print(f"Graph saved as {filename}. {description}")

    save_plot_as_html(fig7Task, "Trends.html", "Games category has the highest installation.")

In [91]:
# Split plot_containers to handle the last plot properly

plot_container_split = plot_container.split('</div>')
if len(plot_container_split) > 1:
    final_plot = plot_container_split[-2] + '</div>'
else:
    final_plot = plot_container

In [92]:
# Dashboard HTML Template

dashboard_html= """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name=viewport" content="width=device-width,initial-scale-1.0">
    <title> Google Play Store Review Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify_content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0,0,0,0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container: hover .insights {{
            display: block;
        }}
        </style>
        <script>
            function openPlot(filename) {{
                window.open(filename, '_blank');
                }}
        </script>
    </head>
    <body>
        <div class= "header">
            <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
            <h1>Google Play Store Reviews Analytics</h1>
            <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
        </div>
        <div class="container">
            {plots}
        </div>
    </body>
    </html>
    """

In [93]:
# Formatting the HTML Content
final_html=dashboard_html.format(plots=plot_container,plot_width=plot_width,plot_height=plot_height)

# Creating the File Path
dashboard_path=os.path.join(directory_path, "web page.html")

# Creating HTML File
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

# Opening the Local HTML File using the Default Browser
webbrowser.open('file://' + os.path.realpath(dashboard_path))

True