In [1]:
#Analysis1
import tkinter as tk
from tkinter import ttk
from tkinter import messagebox
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk


In [2]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aditi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
# Step 1: Load the Dataset
apps_df = pd.read_csv('Play Store Data.csv')
reviews_df = pd.read_csv('User Reviews.csv')

In [4]:
# Step 2: Data Cleaning - Corrected Version
apps_df = apps_df.dropna(subset=['Rating'])

# Handle missing values more safely without chained assignment
for column in apps_df.columns:
    # Calculate mode once to avoid repeated calculations
    col_mode = apps_df[column].mode()
    if not col_mode.empty:  # Check if mode exists
        mode_value = col_mode[0]
        # Use direct assignment instead of in-place fillna
        apps_df[column] = apps_df[column].fillna(mode_value)
    else:
        # If no mode (all values NaN), fill with empty string or 0
        apps_df[column] = apps_df[column].fillna('' if apps_df[column].dtype == 'object' else 0)

# Remove duplicates and invalid ratings
apps_df = apps_df.drop_duplicates()
apps_df = apps_df[apps_df['Rating'] <= 5]

# Clean reviews data
reviews_df = reviews_df.dropna(subset=['Translated_Review'])
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6M,500+,Free,0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [5]:
# Merge datasets on 'App' and handle non-matching apps
merged_df = pd.merge(apps_df, reviews_df, on='App', how='inner')
merged_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,A kid's excessive ads. The types ads allowed a...,Negative,-0.250000,1.000000
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,It bad >:(,Negative,-0.725000,0.833333
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,like,Neutral,0.000000,0.000000
3,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I love colors inspyering,Positive,0.500000,0.600000
4,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I hate,Negative,-0.800000,0.900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59119,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0M,"1,000,000+",Free,0,Everyone,Communication,"July 6, 2018",5.2,5.0 and up,Nice broser slow browsing speed... make 8mbps ...,Positive,0.100000,0.492308
59120,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0M,"1,000,000+",Free,0,Everyone,Communication,"July 6, 2018",5.2,5.0 and up,The thing I found missing simple bookmark draw...,Positive,0.225000,0.426786
59121,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0M,"1,000,000+",Free,0,Everyone,Communication,"July 6, 2018",5.2,5.0 and up,Great Relief unwanted pop ups showing up. What...,Positive,0.650000,0.625000
59122,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0M,"1,000,000+",Free,0,Everyone,Communication,"July 6, 2018",5.2,5.0 and up,Hoped found new go-to; LOVE Firefox PC. Aside ...,Positive,0.345455,0.484848


In [6]:
#data transformation
apps_df['Installs'] = (
    apps_df['Installs']
    .astype(str)                          # Convert all values to strings (including NaN)
    .str.replace('[+,]', '', regex=True)  # Remove '+' and ','
    .replace({'': '0', 'nan': '0'})       # Replace empty strings and 'nan' (from NaN) with '0'
    .astype(int)                          # Convert to integer
)

# Convert 'Price' (handles NaN and invalid values)
apps_df['Price'] = (
    apps_df['Price']
    .astype(str)                          # Convert all values to strings
    .str.replace('$', '', regex=False)    # Remove '$'
    .replace({'': '0', 'nan': '0'})       # Replace empty strings and 'nan' with '0'
    .astype(float)                        # Convert to float
)

In [7]:
apps_df.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs            int32
Type               object
Price             float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [8]:
import numpy as np

def convert_size(size):
    if isinstance(size, str):  # Check if the value is a string
        if 'M' in size:
            return float(size.replace('M', ''))
        elif 'k' in size:
            return float(size.replace('k', '')) / 1024  # Convert KB to MB
        elif 'Varies with device' in size:
            return np.nan  # Handle missing sizes
    return np.nan  # Default for non-string or invalid entries

# Apply the function to the 'Size' column
apps_df['Size'] = apps_df['Size'].apply(convert_size)

In [9]:
apps_df['Size'] = apps_df['Size'].apply(convert_size)
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,,500,Free,0.0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up
10836,Sya9a Maroc - FR,FAMILY,4.5,38,,5000,Free,0.0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,,100,Free,0.0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [10]:
print(apps_df['Reviews'].dtype)


object


In [11]:
# Convert to numeric, forcing errors to NaN (non-numeric values will be handled)
apps_df['Reviews'] = pd.to_numeric(apps_df['Reviews'], errors='coerce')


In [12]:
import numpy as np

apps_df['Log_Installs'] = np.log1p(apps_df['Installs'])
apps_df['Log_Reviews'] = np.log1p(apps_df['Reviews'])



In [13]:
# Add Rating Group column
def rating_group(rating):
    if rating >= 4:
        return 'Top rated'
    elif rating >= 3:
        return 'Above average'
    elif rating >= 2:
        return 'Average'
    else:
        return 'Below average'

apps_df['Rating_Group'] = apps_df['Rating'].apply(rating_group)

In [14]:
# Add Revenue column
apps_df['Revenue'] = apps_df['Price'] * apps_df['Installs']
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,9.210440,5.075174,Top rated,0.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,13.122365,6.875232,Above average,0.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,15.424949,11.379520,Top rated,0.0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,17.727534,12.281389,Top rated,0.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,11.512935,6.875232,Top rated,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,,500,Free,0.0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up,6.216606,2.079442,Top rated,0.0
10836,Sya9a Maroc - FR,FAMILY,4.5,38,,5000,Free,0.0,Everyone,Education,"July 25, 2017",1.48,4.1 and up,8.517393,3.663562,Top rated,0.0
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,,100,Free,0.0,Everyone,Education,"July 6, 2018",1.0,4.1 and up,4.615121,1.609438,Top rated,0.0
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device,6.908755,4.744932,Top rated,0.0


In [15]:
# Sentiment Analysis
sia = SentimentIntensityAnalyzer()
reviews_df['Sentiment_Score'] = reviews_df['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])
reviews_df

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.000000,0.533333,0.9531
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.250000,0.288462,0.6597
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.400000,0.875000,0.6249
4,10 Best Foods for You,Best idea us,Positive,1.000000,0.300000,0.6369
5,10 Best Foods for You,Best way,Positive,1.000000,0.300000,0.6369
...,...,...,...,...,...,...
64222,Housing-Real Estate & Property,Most ads older many agents ..not much owner po...,Positive,0.173333,0.486667,-0.6486
64223,Housing-Real Estate & Property,"If photos posted portal load, fit purpose. I'm...",Positive,0.225000,0.447222,0.7430
64226,Housing-Real Estate & Property,"Dumb app, I wanted post property rent give opt...",Negative,-0.287500,0.250000,-0.7269
64227,Housing-Real Estate & Property,I property business got link SMS happy perform...,Positive,0.800000,1.000000,0.7783


In [16]:
# Extract year from 'Last Updated' and create 'Year' column
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'], errors='coerce')
apps_df['Year'] = apps_df['Last Updated'].dt.year

In [17]:
import matplotlib
matplotlib.use('Agg')  # Set backend before other imports
import tkinter as tk
from tkinter import ttk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Generate comprehensive sample data
np.random.seed(42)
num_apps = 100

data = {
    'Category': np.random.choice(['Game', 'Social', 'Tools', 'Business', 'Education', 'Finance'], num_apps),
    'Type': np.random.choice(['Free', 'Paid'], num_apps, p=[0.75, 0.25]),
    'Rating': np.round(np.random.normal(4.2, 0.5, num_apps).clip(1, 5), 1),
    'Installs': np.random.randint(1000, 10000000, num_apps),
    'Last Updated': pd.to_datetime('2020-01-01') + pd.to_timedelta(np.random.randint(0, 1000, num_apps), unit='D'),
    'Price': np.round(np.abs(np.random.normal(2.99, 2, num_apps)), 2),
    'Reviews': np.random.randint(10, 50000, num_apps),
    'Genres': np.random.choice(['Action;Adventure', 'Puzzle;Brain', 'Social;Communication', 
                              'Productivity;Tools', 'Finance;Business', 'Education;Learning'], num_apps)
}
apps_df = pd.DataFrame(data)
apps_df['Year'] = apps_df['Last Updated'].dt.year
apps_df['Log_Reviews'] = np.log10(apps_df['Reviews'])
apps_df['Log_Installs'] = np.log10(apps_df['Installs'])
apps_df['Revenue'] = apps_df['Price'] * apps_df['Installs'] * 0.3  # Estimated revenue

# Generate review sentiment data
reviews_data = {
    'Sentiment_Score': np.round(np.random.normal(0.2, 0.5, num_apps*10).clip(-1, 1), 2)
}
reviews_df = pd.DataFrame(reviews_data)

class AppDashboard(tk.Tk):
    def __init__(self):
        super().__init__()
        self.title("Google Play Store Analysis Dashboard")
        self.geometry("1400x900")
        self.configure(bg='#f0f0f0')
        
        # Configure main window grid
        self.grid_columnconfigure(0, weight=1)
        self.grid_rowconfigure(0, weight=1)
        
        # Create main container with scrollbars
        container = ttk.Frame(self)
        container.grid(row=0, column=0, sticky="nsew")
        
        canvas = tk.Canvas(container, bg='#f0f0f0', highlightthickness=0)
        scrollbar = ttk.Scrollbar(container, orient="vertical", command=canvas.yview)
        scrollable_frame = ttk.Frame(canvas)
        
        scrollable_frame.bind(
            "<Configure>",
            lambda e: canvas.configure(
                scrollregion=canvas.bbox("all")
            )
        )
        
        canvas.create_window((0, 0), window=scrollable_frame, anchor="nw")
        canvas.configure(yscrollcommand=scrollbar.set)
        
        # Pack the canvas and scrollbar
        canvas.pack(side="left", fill="both", expand=True)
        scrollbar.pack(side="right", fill="y")
        
        # Configure grid for visualizations
        for i in range(3):
            scrollable_frame.grid_columnconfigure(i, weight=1, uniform="cols")
        
        # Create visualizations in 3x2 grid
        self.create_category_analysis(scrollable_frame, 0, 0)
        self.create_type_analysis(scrollable_frame, 0, 1)
        self.create_rating_analysis(scrollable_frame, 0, 2)
        self.create_installation_analysis(scrollable_frame, 1, 0)
        self.create_revenue_analysis(scrollable_frame, 1, 1)
        self.create_sentiment_analysis(scrollable_frame, 1, 2)
        self.create_ml_model(scrollable_frame, 2, 0, columnspan=3)

    def create_visualization_frame(self, parent, row, column, rowspan=1, columnspan=1):
        """Helper function to create consistent frames for visualizations"""
        frame = ttk.Frame(parent, padding=10, relief="groove", borderwidth=2)
        frame.grid(row=row, column=column, rowspan=rowspan, columnspan=columnspan, 
                  sticky="nsew", padx=5, pady=5)
        frame.grid_propagate(False)
        return frame

    def create_category_analysis(self, parent, row, column):
        frame = self.create_visualization_frame(parent, row, column)
        
        category_counts = apps_df['Category'].value_counts().nlargest(10)
        fig, ax = plt.subplots(figsize=(8, 5))
        
        sns.barplot(
            x=category_counts.values, 
            y=category_counts.index,
            hue=category_counts.index,  # Added hue parameter
            palette="Spectral",
            legend=False,  # Disabled legend
            ax=ax
        )
        ax.set_title('Top 10 App Categories', fontsize=12, pad=10)
        ax.set_xlabel('Number of Apps', fontsize=10)
        ax.set_ylabel('Category', fontsize=10)
        plt.tight_layout()
        
        canvas = FigureCanvasTkAgg(fig, master=frame)
        canvas.draw()
        canvas.get_tk_widget().pack(fill="both", expand=True)
        plt.close(fig)

    def create_type_analysis(self, parent, row, column):
        frame = self.create_visualization_frame(parent, row, column)
        
        type_counts = apps_df['Type'].value_counts()
        fig, ax = plt.subplots(figsize=(6, 4))
        
        ax.pie(
            type_counts,
            labels=type_counts.index,
            autopct='%1.1f%%',
            startangle=90,
            colors=['#66b3ff', '#ff9999'],
            textprops={'fontsize': 10}
        )
        ax.set_title('Free vs Paid Apps', fontsize=12, pad=10)
        plt.tight_layout()
        
        canvas = FigureCanvasTkAgg(fig, master=frame)
        canvas.draw()
        canvas.get_tk_widget().pack(fill="both", expand=True)
        plt.close(fig)

    def create_rating_analysis(self, parent, row, column):
        frame = self.create_visualization_frame(parent, row, column)
        
        fig, ax = plt.subplots(figsize=(8, 4))
        
        sns.histplot(
            apps_df['Rating'],
            bins=20,
            kde=True,
            color='skyblue',
            ax=ax
        )
        ax.set_title('App Rating Distribution', fontsize=12, pad=10)
        ax.set_xlabel('Rating', fontsize=10)
        ax.set_ylabel('Count', fontsize=10)
        plt.tight_layout()
        
        canvas = FigureCanvasTkAgg(fig, master=frame)
        canvas.draw()
        canvas.get_tk_widget().pack(fill="both", expand=True)
        plt.close(fig)

    def create_installation_analysis(self, parent, row, column):
        frame = self.create_visualization_frame(parent, row, column)
        
        installs_by_category = apps_df.groupby('Category')['Installs'].sum().nlargest(5)
        fig, ax = plt.subplots(figsize=(8, 4))
        
        sns.barplot(
            x=installs_by_category.values,
            y=installs_by_category.index,
            hue=installs_by_category.index,  # Added hue parameter
            palette="viridis",
            legend=False,  # Disabled legend
            ax=ax
        )
        ax.set_title('Top 5 Categories by Installations', fontsize=12, pad=10)
        ax.set_xlabel('Total Installations', fontsize=10)
        ax.set_ylabel('Category', fontsize=10)
        plt.tight_layout()
        
        canvas = FigureCanvasTkAgg(fig, master=frame)
        canvas.draw()
        canvas.get_tk_widget().pack(fill="both", expand=True)
        plt.close(fig)

    def create_revenue_analysis(self, parent, row, column):
        frame = self.create_visualization_frame(parent, row, column)
        
        revenue_by_category = apps_df.groupby('Category')['Revenue'].sum().nlargest(5)
        fig, ax = plt.subplots(figsize=(8, 4))
        
        sns.barplot(
            x=revenue_by_category.values,
            y=revenue_by_category.index,
            hue=revenue_by_category.index,  # Added hue parameter
            palette="rocket",
            legend=False,  # Disabled legend
            ax=ax
        )
        ax.set_title('Top 5 Revenue Generating Categories', fontsize=12, pad=10)
        ax.set_xlabel('Estimated Revenue', fontsize=10)
        ax.set_ylabel('Category', fontsize=10)
        plt.tight_layout()
        
        canvas = FigureCanvasTkAgg(fig, master=frame)
        canvas.draw()
        canvas.get_tk_widget().pack(fill="both", expand=True)
        plt.close(fig)

    def create_sentiment_analysis(self, parent, row, column):
        frame = self.create_visualization_frame(parent, row, column)
        
        sentiment_counts = reviews_df['Sentiment_Score'].apply(
            lambda x: 'Positive' if x > 0.1 else ('Negative' if x < -0.1 else 'Neutral')
        ).value_counts()
        
        fig, ax = plt.subplots(figsize=(6, 4))
        
        ax.pie(
            sentiment_counts,
            labels=sentiment_counts.index,
            autopct='%1.1f%%',
            startangle=90,
            colors=['#99ff99', '#ff9999', '#66b3ff'],
            textprops={'fontsize': 10}
        )
        ax.set_title('User Review Sentiment', fontsize=12, pad=10)
        plt.tight_layout()
        
        canvas = FigureCanvasTkAgg(fig, master=frame)
        canvas.draw()
        canvas.get_tk_widget().pack(fill="both", expand=True)
        plt.close(fig)

    def create_ml_model(self, parent, row, column, columnspan=1):
        frame = self.create_visualization_frame(parent, row, column, columnspan=columnspan)
        
        # Prepare data
        X = apps_df[['Log_Reviews', 'Log_Installs', 'Price']]
        y = apps_df['Rating']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Train model
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        # Create plot
        fig, ax = plt.subplots(figsize=(10, 4))
        
        ax.scatter(y_test, y_pred, alpha=0.5)
        ax.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
        ax.set_title(f'Rating Prediction Model\nMSE: {mse:.2f}, R²: {r2:.2f}', fontsize=12, pad=10)
        ax.set_xlabel('Actual Rating', fontsize=10)
        ax.set_ylabel('Predicted Rating', fontsize=10)
        plt.tight_layout()
        
        canvas = FigureCanvasTkAgg(fig, master=frame)
        canvas.draw()
        canvas.get_tk_widget().pack(fill="both", expand=True)
        plt.close(fig)

if __name__ == "__main__":
    app = AppDashboard()
    app.mainloop()

In [18]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import webbrowser
import os

In [19]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aditi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [20]:
# Step 1: Load the Dataset
apps_df = pd.read_csv('Play Store Data.csv')
reviews_df = pd.read_csv('User Reviews.csv')

In [21]:
# Step 2: Data Cleaning - Corrected Version
apps_df = apps_df.dropna(subset=['Rating'])

# Handle missing values more safely without chained assignment
for column in apps_df.columns:
    # Calculate mode once to avoid repeated calculations
    col_mode = apps_df[column].mode()
    if not col_mode.empty:  # Check if mode exists
        mode_value = col_mode[0]
        # Use direct assignment instead of in-place fillna
        apps_df[column] = apps_df[column].fillna(mode_value)
    else:
        # If no mode (all values NaN), fill with empty string or 0
        apps_df[column] = apps_df[column].fillna('' if apps_df[column].dtype == 'object' else 0)

# Remove duplicates and invalid ratings
apps_df = apps_df.drop_duplicates()
apps_df = apps_df[apps_df['Rating'] <= 5]

# Clean reviews data
reviews_df = reviews_df.dropna(subset=['Translated_Review'])
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6M,500+,Free,0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [22]:
# Merge datasets on 'App' and handle non-matching apps
merged_df = pd.merge(apps_df, reviews_df, on='App', how='inner')
merged_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,A kid's excessive ads. The types ads allowed a...,Negative,-0.250000,1.000000
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,It bad >:(,Negative,-0.725000,0.833333
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,like,Neutral,0.000000,0.000000
3,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I love colors inspyering,Positive,0.500000,0.600000
4,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I hate,Negative,-0.800000,0.900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59119,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0M,"1,000,000+",Free,0,Everyone,Communication,"July 6, 2018",5.2,5.0 and up,Nice broser slow browsing speed... make 8mbps ...,Positive,0.100000,0.492308
59120,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0M,"1,000,000+",Free,0,Everyone,Communication,"July 6, 2018",5.2,5.0 and up,The thing I found missing simple bookmark draw...,Positive,0.225000,0.426786
59121,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0M,"1,000,000+",Free,0,Everyone,Communication,"July 6, 2018",5.2,5.0 and up,Great Relief unwanted pop ups showing up. What...,Positive,0.650000,0.625000
59122,Firefox Focus: The privacy browser,COMMUNICATION,4.4,36981,4.0M,"1,000,000+",Free,0,Everyone,Communication,"July 6, 2018",5.2,5.0 and up,Hoped found new go-to; LOVE Firefox PC. Aside ...,Positive,0.345455,0.484848


In [23]:
#data transformation
apps_df['Installs'] = (
    apps_df['Installs']
    .astype(str)                          # Convert all values to strings (including NaN)
    .str.replace('[+,]', '', regex=True)  # Remove '+' and ','
    .replace({'': '0', 'nan': '0'})       # Replace empty strings and 'nan' (from NaN) with '0'
    .astype(int)                          # Convert to integer
)

# Convert 'Price' (handles NaN and invalid values)
apps_df['Price'] = (
    apps_df['Price']
    .astype(str)                          # Convert all values to strings
    .str.replace('$', '', regex=False)    # Remove '$'
    .replace({'': '0', 'nan': '0'})       # Replace empty strings and 'nan' with '0'
    .astype(float)                        # Convert to float
)

In [24]:
apps_df.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs            int32
Type               object
Price             float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [25]:
import numpy as np

def convert_size(size):
    if isinstance(size, str):  # Check if the value is a string
        if 'M' in size:
            return float(size.replace('M', ''))
        elif 'k' in size:
            return float(size.replace('k', '')) / 1024  # Convert KB to MB
        elif 'Varies with device' in size:
            return np.nan  # Handle missing sizes
    return np.nan  # Default for non-string or invalid entries

# Apply the function to the 'Size' column
apps_df['Size'] = apps_df['Size'].apply(convert_size)

In [26]:
apps_df['Size'] = apps_df['Size'].apply(convert_size)
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,,500,Free,0.0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up
10836,Sya9a Maroc - FR,FAMILY,4.5,38,,5000,Free,0.0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,,100,Free,0.0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [27]:
# Convert to numeric, forcing errors to NaN (non-numeric values will be handled)
apps_df['Reviews'] = pd.to_numeric(apps_df['Reviews'], errors='coerce')
import numpy as np

apps_df['Log_Installs'] = np.log1p(apps_df['Installs'])
apps_df['Log_Reviews'] = np.log1p(apps_df['Reviews'])



In [28]:
# Add Rating Group column
def rating_group(rating):
    if rating >= 4:
        return 'Top rated'
    elif rating >= 3:
        return 'Above average'
    elif rating >= 2:
        return 'Average'
    else:
        return 'Below average'

apps_df['Rating_Group'] = apps_df['Rating'].apply(rating_group)
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,9.210440,5.075174,Top rated
1,Coloring book moana,ART_AND_DESIGN,3.9,967,,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,13.122365,6.875232,Above average
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,15.424949,11.379520,Top rated
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,17.727534,12.281389,Top rated
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,11.512935,6.875232,Top rated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,,500,Free,0.0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up,6.216606,2.079442,Top rated
10836,Sya9a Maroc - FR,FAMILY,4.5,38,,5000,Free,0.0,Everyone,Education,"July 25, 2017",1.48,4.1 and up,8.517393,3.663562,Top rated
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,,100,Free,0.0,Everyone,Education,"July 6, 2018",1.0,4.1 and up,4.615121,1.609438,Top rated
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device,6.908755,4.744932,Top rated


In [29]:
# Add Revenue column
apps_df['Revenue'] = apps_df['Price'] * apps_df['Installs']
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,9.210440,5.075174,Top rated,0.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,13.122365,6.875232,Above average,0.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,15.424949,11.379520,Top rated,0.0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,17.727534,12.281389,Top rated,0.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,11.512935,6.875232,Top rated,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,,500,Free,0.0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up,6.216606,2.079442,Top rated,0.0
10836,Sya9a Maroc - FR,FAMILY,4.5,38,,5000,Free,0.0,Everyone,Education,"July 25, 2017",1.48,4.1 and up,8.517393,3.663562,Top rated,0.0
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,,100,Free,0.0,Everyone,Education,"July 6, 2018",1.0,4.1 and up,4.615121,1.609438,Top rated,0.0
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device,6.908755,4.744932,Top rated,0.0


In [30]:
# Sentiment Analysis
sia = SentimentIntensityAnalyzer()
reviews_df['Sentiment_Score'] = reviews_df['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])
reviews_df

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.000000,0.533333,0.9531
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.250000,0.288462,0.6597
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.400000,0.875000,0.6249
4,10 Best Foods for You,Best idea us,Positive,1.000000,0.300000,0.6369
5,10 Best Foods for You,Best way,Positive,1.000000,0.300000,0.6369
...,...,...,...,...,...,...
64222,Housing-Real Estate & Property,Most ads older many agents ..not much owner po...,Positive,0.173333,0.486667,-0.6486
64223,Housing-Real Estate & Property,"If photos posted portal load, fit purpose. I'm...",Positive,0.225000,0.447222,0.7430
64226,Housing-Real Estate & Property,"Dumb app, I wanted post property rent give opt...",Negative,-0.287500,0.250000,-0.7269
64227,Housing-Real Estate & Property,I property business got link SMS happy perform...,Positive,0.800000,1.000000,0.7783


In [31]:
# Extract year from 'Last Updated' and create 'Year' column
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'], errors='coerce')
apps_df['Year'] = apps_df['Last Updated'].dt.year
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,9.210440,5.075174,Top rated,0.0,2018
1,Coloring book moana,ART_AND_DESIGN,3.9,967,,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122365,6.875232,Above average,0.0,2018
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,15.424949,11.379520,Top rated,0.0,2018
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,17.727534,12.281389,Top rated,0.0,2018
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,11.512935,6.875232,Top rated,0.0,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,,500,Free,0.0,Everyone,Education,2017-06-18,1.0.0,4.1 and up,6.216606,2.079442,Top rated,0.0,2017
10836,Sya9a Maroc - FR,FAMILY,4.5,38,,5000,Free,0.0,Everyone,Education,2017-07-25,1.48,4.1 and up,8.517393,3.663562,Top rated,0.0,2017
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,,100,Free,0.0,Everyone,Education,2018-07-06,1.0,4.1 and up,4.615121,1.609438,Top rated,0.0,2018
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,2015-01-19,Varies with device,Varies with device,6.908755,4.744932,Top rated,0.0,2015


In [32]:
import plotly.express as px

# Define the path for your HTML files
html_files_path = "./"

# Make sure the directory exists
if not os.path.exists(html_files_path):
    os.makedirs(html_files_path)

# Initialize plot_containers
plot_containers = ""

# Save each Plotly figure to an HTML file
def save_plot_as_html(fig, filename, insight):
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    # Append the plot and its insight to plot_containers
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    fig.write_html(filepath, full_html=False, include_plotlyjs='inline')

# Define your plots
plot_width = 400
plot_height = 300
plot_bg_color = 'black'
text_color = 'white'
title_font = {'size': 16}
axis_font = {'size': 12}

# Category Analysis Plot
category_counts = apps_df['Category'].value_counts().nlargest(10)
fig1 = px.bar(
    x=category_counts.index,
    y=category_counts.values,
    labels={'x': 'Category', 'y': 'Count'},
    title='Top Categories on Play Store',
    color=category_counts.index,
    color_discrete_sequence=px.colors.sequential.Plasma,
    width=plot_width,
    height=plot_height
)
fig1.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig1.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig1, "category_analysis.html", "The top categories on the Play Store are dominated by tools, entertainment, and productivity apps. This suggests users are looking for apps that either provide utility or offer leisure activities.")

# Type Analysis Plot
type_counts = apps_df['Type'].value_counts()
fig2 = px.pie(
    values=type_counts.values,
    names=type_counts.index,
    title='App Type Distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
    width=plot_width,
    height=plot_height
)
fig2.update_traces(textposition='inside', textinfo='percent+label')
fig2.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig2, "type_analysis.html", "Most apps on the Play Store are free, indicating a strategy to attract users first and monetize through ads or in-app purchases.")

# Rating Distribution Plot
fig3 = px.histogram(
    apps_df,
    x='Rating',
    nbins=20,
    title='Rating Distribution',
    color_discrete_sequence=['#636EFA'],
    width=plot_width,
    height=plot_height
)
fig3.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig3, "rating_distribution.html", "Ratings are skewed towards higher values, suggesting that most apps are rated favorably by users.")

sentiment_counts = reviews_df['Sentiment_Score'].value_counts()
fig4 = px.bar(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    labels={'x': 'Sentiment Score', 'y': 'Count'},
    title='Sentiment Distribution',
    color=sentiment_counts.index,
    color_discrete_sequence=px.colors.sequential.RdPu,
    width=plot_width,
    height=plot_height
)
fig4.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig4.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig4, "sentiment_distribution.html", "Sentiments in reviews show a mix of positive and negative feedback, with a slight lean towards positive sentiments.")

# Installs by Category Plot
installs_by_category = apps_df.groupby('Category')['Installs'].sum().nlargest(10)
fig5 = px.bar(
    x=installs_by_category.values,
    y=installs_by_category.index,
    orientation='h',
    labels={'x': 'Installs', 'y': 'Category'},
    title='Installs by Category',
    color=installs_by_category.index,
    color_discrete_sequence=px.colors.sequential.Blues,
    width=plot_width,
    height=plot_height
)
fig5.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig5.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig5, "installs_by_category.html", "The categories with the most installs are social and communication apps, which reflects their broad appeal and daily usage.")

# Updates Per Year Plot
updates_per_year = apps_df['Last Updated'].dt.year.value_counts().sort_index()
fig6 = px.line(
    x=updates_per_year.index,
    y=updates_per_year.values,
    labels={'x': 'Year', 'y': 'Number of Updates'},
    title='Number of Updates Over the Years',
    color_discrete_sequence=['#AB63FA'],
    width=plot_width,
    height=plot_height
)
fig6.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig6, "updates_per_year.html", "Updates have been increasing over the years, showing that developers are actively maintaining and improving their apps.")

# Revenue by Category Plot
revenue_by_category = apps_df.groupby('Category')['Revenue'].sum().nlargest(10)
fig7 = px.bar(
    x=revenue_by_category.index,
    y=revenue_by_category.values,
    labels={'x': 'Category', 'y': 'Revenue'},
    title='Revenue by Category',
    color=revenue_by_category.index,
    color_discrete_sequence=px.colors.sequential.Greens,
    width=plot_width,
    height=plot_height
)
fig7.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig7.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig7, "revenue_by_category.html", "Categories such as Business and Productivity lead in revenue generation, indicating their monetization potential.")

# Genre Count Plot
genre_counts = apps_df['Genres'].str.split(';', expand=True).stack().value_counts().nlargest(10)
fig8 = px.bar(
    x=genre_counts.index,
    y=genre_counts.values,
    labels={'x': 'Genre', 'y': 'Count'},
    title='Top Genres',
    color=genre_counts.index,
    color_discrete_sequence=px.colors.sequential.OrRd,
    width=plot_width,
    height=plot_height
)
fig8.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig8.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig8, "genres_counts.html", "Action and Casual genres are the most common, reflecting users' preference for engaging and easy-to-play games.")

# Impact of Last Update on Rating
fig9 = px.scatter(
    apps_df,
    x='Last Updated',
    y='Rating',
    color='Type',
    title='Impact of Last Update on Rating',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    width=plot_width,
    height=plot_height
)
fig9.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig9, "update_on_rating.html", "The scatter plot shows a weak correlation between the last update date and ratings, suggesting that more frequent updates don't always result in better ratings.")

# Ratings for Paid vs Free Apps
fig10 = px.box(
    apps_df,
    x='Type',
    y='Rating',
    color='Type',
    title='Ratings for Paid vs Free Apps',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    width=plot_width,
    height=plot_height
)
fig10.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig10, "ratings_paid_free.html", "Paid apps generally have higher ratings compared to free apps, suggesting that users expect higher quality from apps they pay for.")

# Split plot_containers to handle the last plot properly
plot_containers_split = plot_containers.split('</div>')
if len(plot_containers_split) > 1:
    final_plot = plot_containers_split[-2] + '</div>'
else:
    final_plot = plot_containers  # Use plot_containers as default if splitting isn't sufficient

# HTML template for the dashboard
dashboard_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Reviews Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0, 0, 0, 0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights {{
            display: block;
        }}
    </style>
    <script>
        function openPlot(filename) {{
            window.open(filename, '_blank');
        }}
    </script>
</head>
<body>
    <div class="header">
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
        <h1>Google Play Store Reviews Analytics</h1>
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
    </div>
    <div class="container">
        {plots}
    </div>
</body>
</html>
"""

# Use these containers to fill in your dashboard HTML
final_html = dashboard_html.format(plots=plot_containers, plot_width=plot_width, plot_height=plot_height)

# Save the final dashboard to an HTML file
dashboard_path = os.path.join(html_files_path, "dashboard.html")
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

# Automatically open the generated HTML file in a web browser
webbrowser.open('file://' + os.path.realpath(dashboard_path))

True

In [33]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import webbrowser
import os

In [34]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aditi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [35]:
apps_df=pd.read_csv('Play Store Data.csv')
reviews_df=pd.read_csv('User Reviews.csv')

In [36]:
apps_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [37]:
reviews_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [38]:
apps_df = apps_df.dropna(subset=['Rating'])

for column in apps_df.columns:
    apps_df[column] = apps_df[column].fillna(apps_df[column].mode()[0])

apps_df.drop_duplicates(inplace=True)

apps_df = apps_df[apps_df['Rating'] <= 5]

reviews_df.dropna(subset=['Translated_Review'], inplace=True)

In [39]:
# Convert 'Installs' (handles NaN and invalid values)
apps_df['Installs'] = (
    apps_df['Installs']
    .astype(str)                          # Convert all values to strings (including NaN)
    .str.replace('[+,]', '', regex=True)  # Remove '+' and ','
    .replace({'': '0', 'nan': '0'})       # Replace empty strings and 'nan' (from NaN) with '0'
    .astype(int)                          # Convert to integer
)

# Convert 'Price' (handles NaN and invalid values)
apps_df['Price'] = (
    apps_df['Price']
    .astype(str)                          # Convert all values to strings
    .str.replace('$', '', regex=False)    # Remove '$'
    .replace({'': '0', 'nan': '0'})       # Replace empty strings and 'nan' with '0'
    .astype(float)                        # Convert to float
)

In [40]:
apps_df.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs            int32
Type               object
Price             float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [41]:
merged_df=pd.merge(apps_df,reviews_df,on='App',how='inner')

In [42]:
def convert_size(size):
    if 'M' in size:
        return float(size.replace('M',''))
    elif 'k' in size:
        return float(size.replace('k',''))/1024
    else:
        return np.nan
apps_df['Size']=apps_df['Size'].apply(convert_size)

In [43]:
#Lograrithmic
apps_df['Log_Installs']=np.log(apps_df['Installs'])
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,9.210340
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,13.122363
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,15.424948
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,17.727534
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,11.512925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up,6.214608
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,"July 25, 2017",1.48,4.1 and up,8.517193
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education,"July 6, 2018",1.0,4.1 and up,4.605170
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device,6.907755


In [44]:
apps_df['Reviews']=apps_df['Reviews'].astype(int)
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,9.210340
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,13.122363
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,15.424948
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,17.727534
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,11.512925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up,6.214608
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,"July 25, 2017",1.48,4.1 and up,8.517193
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education,"July 6, 2018",1.0,4.1 and up,4.605170
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device,6.907755


In [45]:
apps_df['Log_Reviews']=np.log(apps_df['Reviews'])
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,9.210340,5.068904
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,13.122363,6.874198
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,15.424948,11.379508
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,17.727534,12.281384
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,11.512925,6.874198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up,6.214608,1.945910
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,"July 25, 2017",1.48,4.1 and up,8.517193,3.637586
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education,"July 6, 2018",1.0,4.1 and up,4.605170,1.386294
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device,6.907755,4.736198


In [46]:
def rating_group(rating):
    if rating >= 4:
        return 'Top rated app'
    elif rating >=3:
        return 'Above average'
    elif rating >=2:
        return 'Average'
    else:
        return 'Below Average'
apps_df['Rating_Group']=apps_df['Rating'].apply(rating_group)
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,9.210340,5.068904,Top rated app
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,13.122363,6.874198,Above average
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,15.424948,11.379508,Top rated app
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,17.727534,12.281384,Top rated app
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,11.512925,6.874198,Top rated app
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up,6.214608,1.945910,Top rated app
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,"July 25, 2017",1.48,4.1 and up,8.517193,3.637586,Top rated app
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education,"July 6, 2018",1.0,4.1 and up,4.605170,1.386294,Top rated app
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device,6.907755,4.736198,Top rated app


In [47]:
#Revenue column
apps_df['Revenue']=apps_df['Price']*apps_df['Installs']
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,9.210340,5.068904,Top rated app,0.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,13.122363,6.874198,Above average,0.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,15.424948,11.379508,Top rated app,0.0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,17.727534,12.281384,Top rated app,0.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,11.512925,6.874198,Top rated app,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up,6.214608,1.945910,Top rated app,0.0
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,"July 25, 2017",1.48,4.1 and up,8.517193,3.637586,Top rated app,0.0
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education,"July 6, 2018",1.0,4.1 and up,4.605170,1.386294,Top rated app,0.0
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device,6.907755,4.736198,Top rated app,0.0


In [48]:
review = "This app is amazing! I love the new features."
sentiment_score= sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.0, 'neu': 0.42, 'pos': 0.58, 'compound': 0.8516}


In [49]:
review = "This app is very bad! I hate the new features."
sentiment_score= sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.535, 'neu': 0.465, 'pos': 0.0, 'compound': -0.8427}


In [50]:
review = "This app is okay."
sentiment_score= sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'compound': 0.2263}


In [51]:
reviews_df['Sentiment_Score']=reviews_df['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])
reviews_df

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.000000,0.533333,0.9531
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.250000,0.288462,0.6597
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.400000,0.875000,0.6249
4,10 Best Foods for You,Best idea us,Positive,1.000000,0.300000,0.6369
5,10 Best Foods for You,Best way,Positive,1.000000,0.300000,0.6369
...,...,...,...,...,...,...
64222,Housing-Real Estate & Property,Most ads older many agents ..not much owner po...,Positive,0.173333,0.486667,-0.6486
64223,Housing-Real Estate & Property,"If photos posted portal load, fit purpose. I'm...",Positive,0.225000,0.447222,0.7430
64226,Housing-Real Estate & Property,"Dumb app, I wanted post property rent give opt...",Negative,-0.287500,0.250000,-0.7269
64227,Housing-Real Estate & Property,I property business got link SMS happy perform...,Positive,0.800000,1.000000,0.7783


In [52]:
apps_df['Last Updated']=pd.to_datetime(apps_df['Last Updated'],errors='coerce')
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,9.210340,5.068904,Top rated app,0.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122363,6.874198,Above average,0.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,15.424948,11.379508,Top rated app,0.0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,17.727534,12.281384,Top rated app,0.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,11.512925,6.874198,Top rated app,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,2017-06-18,1.0.0,4.1 and up,6.214608,1.945910,Top rated app,0.0
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,2017-07-25,1.48,4.1 and up,8.517193,3.637586,Top rated app,0.0
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education,2018-07-06,1.0,4.1 and up,4.605170,1.386294,Top rated app,0.0
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,2015-01-19,Varies with device,Varies with device,6.907755,4.736198,Top rated app,0.0


In [53]:
apps_df['Year']=apps_df['Last Updated'].dt.year
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,9.210340,5.068904,Top rated app,0.0,2018
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122363,6.874198,Above average,0.0,2018
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,15.424948,11.379508,Top rated app,0.0,2018
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,17.727534,12.281384,Top rated app,0.0,2018
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,11.512925,6.874198,Top rated app,0.0,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6,500,Free,0.0,Everyone,Education,2017-06-18,1.0.0,4.1 and up,6.214608,1.945910,Top rated app,0.0,2017
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53.0,5000,Free,0.0,Everyone,Education,2017-07-25,1.48,4.1 and up,8.517193,3.637586,Top rated app,0.0,2017
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,Free,0.0,Everyone,Education,2018-07-06,1.0,4.1 and up,4.605170,1.386294,Top rated app,0.0,2018
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,,1000,Free,0.0,Mature 17+,Books & Reference,2015-01-19,Varies with device,Varies with device,6.907755,4.736198,Top rated app,0.0,2015


In [54]:

import os

# Then create directory for HTML files
html_files_path = "./visualizations"  # Better to use a specific folder name
if not os.path.exists(html_files_path):
    os.makedirs(html_files_path)
    print(f"Created directory: {html_files_path}")
else:
    print(f"Directory already exists: {html_files_path}")

Directory already exists: ./visualizations


In [55]:
# Save each Plotly figure to an HTML file
def save_plot_as_html(fig, filename, insight):
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    # Append the plot and its insight to plot_containers
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    fig.write_html(filepath, full_html=False, include_plotlyjs='inline')

In [56]:
plot_width=400
plot_height=300
plot_bg_color='black'
text_color='white'
title_font={'size':16}
axis_font={'size':12}

In [57]:
# First ensure you have these imports at the top of your notebook
import pandas as pd
import plotly.express as px
import os

# 1. Load the dataset (add error handling)
try:
    apps_df = pd.read_csv('Play Store Data.csv')
    print("Dataset loaded successfully with", len(apps_df), "rows")
except FileNotFoundError:
    print("Error: 'Play Store Data.csv' not found in current directory")
    print("Current directory:", os.getcwd())
    # Create empty DataFrame to prevent further errors
    apps_df = pd.DataFrame()

# Only proceed if data loaded successfully
if not apps_df.empty:
    # 2. Data cleaning - ensure Category column exists
    if 'Category' in apps_df.columns:
        # Figure 1 - Top Categories
        category_counts = apps_df['Category'].value_counts().nlargest(10)
        
        fig1 = px.bar(
            x=category_counts.index,
            y=category_counts.values,
            labels={'x':'Category','y':'Count'},
            title='Top Categories on Play Store',
            color=category_counts.index,
            color_discrete_sequence=px.colors.sequential.Plasma,
            width=400,
            height=300
        )
        
        fig1.update_layout(
            plot_bgcolor='black',
            paper_bgcolor='black',
            font_color='white',
            title_font={'size':16},
            xaxis=dict(
                title_font={'size':12},
                title='App Category',
                categoryorder='total descending'  # Sort by count
            ),
            yaxis=dict(
                title_font={'size':12},
                title='Number of Apps'
            ),
            margin=dict(l=10,r=10,t=30,b=10),
            showlegend=False  # Colors already shown on axis
        )
        
        # Save the figure
        save_plot_as_html(fig1, "Category_Distribution.html",
                         "Top categories: " + ", ".join(category_counts.index[:3]) + 
                         f" account for {category_counts[:3].sum()/len(apps_df):.1%} of all apps")
    else:
        print("Error: 'Category' column not found in the dataset")
else:
    print("Cannot create visualization - no data loaded")

Dataset loaded successfully with 10841 rows


In [58]:
#Figure 2
type_counts=apps_df['Type'].value_counts()
fig2=px.pie(
    values=type_counts.values,
    names=type_counts.index,
    title='App Type Distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
    width=400,
    height=300
)
fig2.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white',
    title_font={'size':16},
    margin=dict(l=10,r=10,t=30,b=10)
)
#fig1.update_traces(marker=dict(pattern=dict(line=dict(color='white',width=1))))
save_plot_as_html(fig2,"Type Graph 2.html","Most apps on the Playstore are free, indicating a strategy to attract users first and monetize through ads or in app purchases")

In [59]:
#Figure 3
fig3=px.histogram(
    apps_df,
    x='Rating',
    nbins=20,
    title='Rating Distribution',
    color_discrete_sequence=['#636EFA'],
    width=400,
    height=300
)
fig3.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white',
    title_font={'size':16},
    xaxis=dict(title_font={'size':12}),
    yaxis=dict(title_font={'size':12}),
    margin=dict(l=10,r=10,t=30,b=10)
)
#fig1.update_traces(marker=dict(pattern=dict(line=dict(color='white',width=1))))
save_plot_as_html(fig3,"Rating Graph 3.html","Ratings are skewed towards higher values, suggesting that most apps are rated favorably by users")

In [60]:
# Import required libraries
import pandas as pd
import plotly.express as px
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import os

# Download VADER lexicon if not already present
nltk.download('vader_lexicon')

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Load user reviews data
try:
    reviews_df = pd.read_csv('User Reviews.csv')
    print(f"User reviews loaded successfully with {len(reviews_df)} rows")
    
    # Calculate sentiment scores if 'Sentiment_Score' doesn't exist
    if 'Sentiment_Score' not in reviews_df.columns:
        print("Calculating sentiment scores from review text...")
        
        # Ensure we have the text column (adjust name if different)
        text_column = 'Translated_Review' if 'Translated_Review' in reviews_df.columns else 'Review'
        
        if text_column in reviews_df.columns:
            # Calculate sentiment scores
            reviews_df['Sentiment_Score'] = reviews_df[text_column].apply(
                lambda x: sia.polarity_scores(str(x))['compound']
            )
            print("Successfully calculated sentiment scores")
        else:
            print(f"Error: Neither 'Translated_Review' nor 'Review' column found")
            reviews_df['Sentiment_Score'] = 0  # Create dummy column
            
except Exception as e:
    print(f"Error loading reviews: {str(e)}")
    reviews_df = pd.DataFrame()

# Proceed with visualization if we have data
if not reviews_df.empty and 'Sentiment_Score' in reviews_df.columns:
    # Create sentiment categories
    bins = [-1, -0.5, -0.1, 0.1, 0.5, 1]
    labels = ['Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive']
    
    reviews_df['Sentiment_Category'] = pd.cut(
        reviews_df['Sentiment_Score'],
        bins=bins,
        labels=labels
    )
    
    # Get counts for each category
    sentiment_counts = reviews_df['Sentiment_Category'].value_counts().reindex(labels)
    
    # Create visualization
    fig = px.bar(
        x=sentiment_counts.index,
        y=sentiment_counts.values,
        labels={'x': 'Sentiment', 'y': 'Number of Reviews'},
        title='Distribution of Review Sentiments',
        color=sentiment_counts.index,
        color_discrete_sequence=px.colors.sequential.RdPu,
        category_orders={"x": labels}
    )
    
    # Calculate percentages
    total = len(reviews_df)
    percentages = (sentiment_counts / total * 100).round(1)
    
    # Update layout
    fig.update_layout(
        plot_bgcolor='black',
        paper_bgcolor='black',
        font_color='white',
        xaxis_title="Sentiment Category",
        yaxis_title="Number of Reviews",
        showlegend=False
    )
    
    # Create insight message
    insight = (
        f"Sentiment distribution: {percentages['Positive']}% Positive, "
        f"{percentages['Very Positive']}% Very Positive, "
        f"{percentages['Negative']}% Negative, "
        f"{percentages['Very Negative']}% Very Negative"
    )
    
    # Save visualization
    save_plot_as_html(fig, "review_sentiment_analysis.html", insight)
    
    print("Successfully created sentiment visualization")
else:
    print("Could not create visualization - missing required data")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aditi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


User reviews loaded successfully with 64295 rows
Calculating sentiment scores from review text...
Successfully calculated sentiment scores
Successfully created sentiment visualization


In [61]:
# First ensure these imports are at the top of your notebook
import pandas as pd
import plotly.express as px

def clean_installs(value):
    """Convert installs string to numeric value, handling special cases"""
    if pd.isna(value):
        return 0
    if isinstance(value, str):
        if value == 'Free':
            return 0
        # Remove + and commas, then convert to int
        return int(value.replace('+', '').replace(',', ''))
    return int(value)

# Convert Installs column safely
if 'Installs' in apps_df.columns:
    apps_df['Installs'] = apps_df['Installs'].apply(clean_installs)
    
    # Now create the visualization
    installs_by_category = (
        apps_df.groupby('Category')['Installs']
        .sum()
        .nlargest(10)
        .sort_values(ascending=True)  # For better horizontal bar display
    )
    
    # Convert to DataFrame for safer access
    installs_df = installs_by_category.reset_index()
    installs_df.columns = ['Category', 'Installs']
    
    # Format numbers with commas for display
    formatted_installs = installs_df['Installs'].apply(lambda x: f"{x:,}")
    
    fig5 = px.bar(
        x=installs_df['Installs'],
        y=installs_df['Category'],
        orientation='h',
        labels={'x': 'Total Installs', 'y': 'Category'},
        title='Top Categories by Total Installs',
        color=installs_df['Category'],
        color_discrete_sequence=px.colors.sequential.Blues,
        width=400,
        height=300,
        hover_data={'Total Installs': formatted_installs}
    )
    
    fig5.update_layout(
        plot_bgcolor='black',
        paper_bgcolor='black',
        font_color='white',
        title_font={'size': 16},
        xaxis=dict(
            title_font={'size': 12},
            title='Total Installs',
            tickformat=',.0f'  # Format with commas
        ),
        yaxis=dict(
            title_font={'size': 12},
            title='App Category',
            categoryorder='total ascending'  # Sort bars by value
        ),
        margin=dict(l=10, r=10, t=30, b=10),
        showlegend=False
    )
    
    # Calculate percentage of total installs
    total_installs = apps_df['Installs'].sum()
    top_percentage = (installs_df['Installs'].sum() / total_installs * 100).round(1)
    
    # Get top categories using iloc for position-based access
    top1_cat = installs_df.iloc[-1, 0]
    top1_inst = formatted_installs.iloc[-1]
    top2_cat = installs_df.iloc[-2, 0]
    top2_inst = formatted_installs.iloc[-2]
    
    save_plot_as_html(
        fig5, 
        "Installs_By_Category.html",
        f"Top categories account for {top_percentage}% of all installs. "
        f"Leading: {top1_cat} ({top1_inst}), "
        f"{top2_cat} ({top2_inst})"
    )
else:
    print("Error: 'Installs' column not found in dataset")

In [62]:
# First ensure these imports are at the top of your notebook
import pandas as pd
import plotly.express as px

# Convert 'Last Updated' to datetime format
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'], errors='coerce')

# Remove rows with invalid dates
apps_df = apps_df.dropna(subset=['Last Updated'])

# Count updates per year
updates_per_year = apps_df['Last Updated'].dt.year.value_counts().sort_index()

# Create the line plot
fig6 = px.line(
    x=updates_per_year.index,
    y=updates_per_year.values,
    labels={'x': 'Year', 'y': 'Number of Updates'},
    title='App Update Frequency Over Time',
    color_discrete_sequence=['#AB63FA'],
    width=400,
    height=300
)

# Calculate year-over-year change if we have enough data
if len(updates_per_year) > 1:
    latest_year = updates_per_year.index[-1]
    prev_year = updates_per_year.index[-2]
    pct_change = ((updates_per_year[latest_year] - updates_per_year[prev_year]) / 
                 updates_per_year[prev_year] * 100)  # Fixed parentheses here
    insight = (f"Update frequency changed by {pct_change:.1f}% from {prev_year} to {latest_year}. "
              f"Latest year ({latest_year}): {updates_per_year[latest_year]:,} updates")
else:
    insight = f"Total updates tracked: {updates_per_year.sum():,} across {len(updates_per_year)} years"

# Style the plot
fig6.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white',
    title_font={'size': 16},
    xaxis=dict(
        title_font={'size': 12},
        title='Year',
        tickmode='linear'
    ),
    yaxis=dict(
        title_font={'size': 12},
        title='Number of Updates'
    ),
    margin=dict(l=10, r=10, t=30, b=10)
)

# Add markers to the line
fig6.update_traces(
    mode='lines+markers',
    marker=dict(size=8, line=dict(width=1, color='white'))
)

save_plot_as_html(fig6, "App_Updates_Over_Time.html", insight)

In [63]:
# First ensure these imports are at the top of your notebook
import pandas as pd
import plotly.express as px

# Calculate Revenue if not already done (Price * Installs)
if 'Price' in apps_df.columns and 'Installs' in apps_df.columns:
    # Convert Price to numeric (handling $ and commas with raw strings)
    apps_df['Price'] = apps_df['Price'].replace(r'[\$,]', '', regex=True).astype(float)
    
    # Convert Installs to numeric (handling + and commas with raw strings)
    apps_df['Installs'] = apps_df['Installs'].replace(r'[+,]', '', regex=True).astype(int)
    
    # Calculate Revenue
    apps_df['Revenue'] = apps_df['Price'] * apps_df['Installs']
    
    # Now create the visualization
    revenue_by_category = (
        apps_df.groupby('Category')['Revenue']
        .sum()
        .nlargest(10)
        .sort_values(ascending=False)
    )
    
    # Format numbers for display
    formatted_revenue = revenue_by_category.apply(lambda x: f"${x:,.0f}")
    
    fig7 = px.bar(
        x=revenue_by_category.index,
        y=revenue_by_category.values,
        labels={'x':'Category', 'y':'Revenue (USD)'},
        title='Top Revenue Generating Categories',
        color=revenue_by_category.index,
        color_discrete_sequence=px.colors.sequential.Greens,
        width=400,
        height=300,
        hover_data={'Formatted Revenue': formatted_revenue}
    )
    
    fig7.update_layout(
        plot_bgcolor='black',
        paper_bgcolor='black',
        font_color='white',
        title_font={'size':16},
        xaxis=dict(
            title_font={'size':12},
            title='App Category',
            categoryorder='total descending'
        ),
        yaxis=dict(
            title_font={'size':12},
            title='Total Revenue',
            tickprefix='$'
        ),
        margin=dict(l=10, r=10, t=30, b=10),
        showlegend=False
    )
    
    # Format y-axis as currency
    fig7.update_yaxes(tickformat="$,.0f")
    
    # Calculate percentage for insight
    total_revenue = apps_df['Revenue'].sum()
    top3_percentage = revenue_by_category.head(3).sum() / total_revenue * 100
    
    save_plot_as_html(
        fig7, 
        "Revenue_By_Category.html",
        f"Top revenue categories: {revenue_by_category.index[0]} (${revenue_by_category.iloc[0]:,.0f}), "
        f"{revenue_by_category.index[1]} (${revenue_by_category.iloc[1]:,.0f}). "
        f"Top 3 categories account for {top3_percentage:.1f}% of total revenue."
    )
else:
    print("Error: Required columns ('Price' and/or 'Installs') not found in dataset")

In [64]:
#Figure 8
genre_counts=apps_df['Genres'].str.split(';',expand=True).stack().value_counts().nlargest(10)
fig8=px.bar(
    x=genre_counts.index,
    y=genre_counts.values,
    labels={'x':'Genre','y':'Count'},
    title='Top Genres',
    color=installs_by_category.index,
    color_discrete_sequence=px.colors.sequential.OrRd,
    width=400,
    height=300
)
fig8.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white',
    title_font={'size':16},
    xaxis=dict(title_font={'size':12}),
    yaxis=dict(title_font={'size':12}),
    margin=dict(l=10,r=10,t=30,b=10)
)
#fig1.update_traces(marker=dict(pattern=dict(line=dict(color='white',width=1))))
save_plot_as_html(fig8,"Genre Graph 8.html","Action and Casual genres are the most common, reflecting users' preference for engaging and easy-to-play games")

In [65]:
# First ensure these imports are at the top of your notebook
import pandas as pd
import plotly.express as px

# Convert 'Last Updated' to datetime with error handling
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'], errors='coerce')

# Remove rows with invalid dates
apps_df = apps_df.dropna(subset=['Last Updated'])

# Create the scatter plot with improved settings
fig9 = px.scatter(
    apps_df,
    x='Last Updated',
    y='Rating',
    color='Type',
    title='App Ratings Over Time by Type',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    width=400,
    height=300,
    hover_data=['App', 'Category', 'Installs'],  # Show more info on hover
    labels={
        'Last Updated': 'Update Date',
        'Rating': 'User Rating',
        'Type': 'App Type'
    }
)

# Update layout with comprehensive styling
fig9.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white',
    title_font={'size': 16},
    xaxis=dict(
        title_font={'size': 12},
        title='Last Update Date',
        rangeslider=dict(visible=True),
        tickformat='%Y-%m'  # Format dates nicely
    ),
    yaxis=dict(
        title_font={'size': 12},
        title='User Rating',
        range=[0, 5.5]  # Ensure full rating scale is visible
    ),
    margin=dict(l=10, r=10, t=30, b=10),
    legend=dict(
        title_font={'size': 12},
        title='App Type',
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='right',
        x=1
    )
)

# Improve marker visibility with complete parameters
fig9.update_traces(
    marker=dict(
        size=8,
        opacity=0.7,
        line=dict(
            width=1,
            color='white'
        )
    ),  # This comma was missing
    selector=dict(mode='markers')
)

# Calculate some stats for the insight
if 'Type' in apps_df.columns:
    rating_stats = apps_df.groupby('Type')['Rating'].mean().round(1)
    insight = (f"Average ratings: Paid apps {rating_stats.get('Paid', 'N/A')}, "
              f"Free apps {rating_stats.get('Free', 'N/A')}. Points show individual app ratings over time.")
else:
    insight = "Visualization shows app ratings trend over time by app type."

save_plot_as_html(fig9, "Ratings_Over_Time.html", insight)

In [66]:
#Figure 10
fig10=px.box(
    apps_df,
    x='Type',
    y='Rating',
    color='Type',
    title='Rating for Paid vs Free Apps',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    width=400,
    height=300
)
fig10.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white',
    title_font={'size':16},
    xaxis=dict(title_font={'size':12}),
    yaxis=dict(title_font={'size':12}),
    margin=dict(l=10,r=10,t=30,b=10)
)
#fig1.update_traces(marker=dict(pattern=dict(line=dict(color='white',width=1))))
save_plot_as_html(fig10,"Paid Free Graph 10.html","Paid apps generally have higher ratings compared to free apps, suggesting that users expect higher quality from apps they pay for")

In [67]:
plot_containers_split=plot_containers.split('</div>')

In [68]:
if len(plot_containers_split) > 1:
    final_plot=plot_containers_split[-2]+'</div>'
else:
    final_plot=plot_containers

In [69]:
dashboard_html= """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name=viewport" content="width=device-width,initial-scale-1.0">
    <title> Google Play Store Review Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify_content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0,0,0,0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container: hover .insights {{
            display: block;
        }}
        </style>
        <script>
            function openPlot(filename) {{
                window.open(filename, '_blank');
                }}
        </script>
    </head>
    <body>
        <div class= "header">
            <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
            <h1>Google Play Store Reviews Analytics</h1>
            <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
        </div>
        <div class="container">
            {plots}
        </div>
    </body>
    </html>
    """


In [70]:
final_html=dashboard_html.format(plots=plot_containers,plot_width=plot_width,plot_height=plot_height)

In [71]:
dashboard_path=os.path.join(html_files_path,"web page.html")

In [72]:
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

In [73]:
webbrowser.open('file://'+os.path.realpath(dashboard_path))

True

In [74]:
#ASSIGNMENTS/TASKS

In [75]:
import pandas as pd
playstore_df = pd.read_csv('Play Store Data.csv')
reviews_df = pd.read_csv('User Reviews.csv')


In [76]:
# Convert Reviews column to numeric (if it's not already)
playstore_df['Reviews'] = pd.to_numeric(playstore_df['Reviews'], errors='coerce')
playstore_df

# Filter only apps with >1000 reviews
filtered_apps = playstore_df[playstore_df['Reviews'] > 1000]
filtered_apps

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510.0,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644.0,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
7,Infinite Painter,ART_AND_DESIGN,4.1,36815.0,29M,"1,000,000+",Free,0,Everyone,Art & Design,"June 14, 2018",6.1.61.1,4.2 and up
8,Garden Coloring Book,ART_AND_DESIGN,4.4,13791.0,33M,"1,000,000+",Free,0,Everyone,Art & Design,"September 20, 2017",2.9.2,3.0 and up
10,Text on Photo - Fonteee,ART_AND_DESIGN,4.4,13880.0,28M,"1,000,000+",Free,0,Everyone,Art & Design,"October 27, 2017",1.0.4,4.1 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10809,Castle Clash: RPG War and Strategy FR,FAMILY,4.7,376223.0,24M,"1,000,000+",Free,0,Everyone,Strategy,"July 18, 2018",1.4.2,4.1 and up
10815,Golden Dictionary (FR-AR),BOOKS_AND_REFERENCE,4.2,5775.0,4.9M,"500,000+",Free,0,Everyone,Books & Reference,"July 19, 2018",7.0.4.6,4.2 and up
10826,Frim: get new friends on local chat rooms,SOCIAL,4.0,88486.0,Varies with device,"5,000,000+",Free,0,Mature 17+,Social,"March 23, 2018",Varies with device,Varies with device
10832,FR Tides,WEATHER,3.8,1195.0,582k,"100,000+",Free,0,Everyone,Weather,"February 16, 2014",6.0,2.1 and up


In [77]:
# Convert Reviews column to numeric (if it's not already)
playstore_df['Reviews'] = pd.to_numeric(playstore_df['Reviews'], errors='coerce')

# Filter only apps with >1000 reviews
filtered_apps = playstore_df[playstore_df['Reviews'] > 1000]

In [78]:
# Get top 5 categories by count
top_categories = filtered_apps['Category'].value_counts().nlargest(5).index.tolist()

# Filter for top 5 categories only
top_apps = filtered_apps[filtered_apps['Category'].isin(top_categories)]
top_apps

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
1653,ROBLOX,GAME,4.5,4447388.0,67M,"100,000,000+",Free,0,Everyone 10+,Adventure;Action & Adventure,"July 31, 2018",2.347.225742,4.1 and up
1654,Subway Surfers,GAME,4.5,27722264.0,76M,"1,000,000,000+",Free,0,Everyone 10+,Arcade,"July 12, 2018",1.90.0,4.1 and up
1655,Candy Crush Saga,GAME,4.4,22426677.0,74M,"500,000,000+",Free,0,Everyone,Casual,"July 5, 2018",1.129.0.2,4.1 and up
1656,Solitaire,GAME,4.7,254258.0,23M,"10,000,000+",Free,0,Everyone,Card,"August 1, 2018",2.137.0,4.1 and up
1657,Bubble Shooter,GAME,4.5,148897.0,46M,"10,000,000+",Free,0,Everyone,Casual,"July 17, 2018",1.20.1,4.0.3 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10795,Reindeer VPN - Proxy VPN,TOOLS,4.2,7339.0,4.0M,"100,000+",Free,0,Everyone,Tools,"May 10, 2018",1.74,4.1 and up
10796,Inf VPN - Global Proxy & Unlimited Free WIFI VPN,TOOLS,4.7,61445.0,7.8M,"1,000,000+",Free,0,Everyone,Tools,"July 26, 2018",1.9.734,4.1 and up
10803,Fatal Raid - No.1 Mobile FPS,GAME,4.3,56496.0,81M,"1,000,000+",Free,0,Teen,Action,"August 7, 2018",1.5.447,4.0 and up
10804,Poker Pro.Fr,GAME,4.2,5442.0,17M,"100,000+",Free,0,Teen,Card,"May 22, 2018",4.1.3,2.3 and up


In [79]:
merged_df = pd.merge(reviews_df, top_apps[['App', 'Category', 'Rating']], on='App')

In [80]:
# Define buckets for rating groups
def rating_bucket(rating):
    if rating <= 2:
        return '1-2 Stars'
    elif rating <= 4:
        return '3-4 Stars'
    else:
        return '4-5 Stars'

merged_df['Rating Group'] = merged_df['Rating'].apply(rating_bucket)
merged_df

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Category,Rating,Rating Group
0,1LINE – One Line with One Touch,"gets 1* there's ad every single level restart,...",Negative,-0.157143,0.704762,GAME,4.6,4-5 Stars
1,1LINE – One Line with One Touch,"The game interesting challenging. However, oft...",Positive,0.200000,0.633333,GAME,4.6,4-5 Stars
2,1LINE – One Line with One Touch,,,,,GAME,4.6,4-5 Stars
3,1LINE – One Line with One Touch,"Wow ,i play 15 puzzle 1st round I remember puz...",Positive,0.133333,0.716667,GAME,4.6,4-5 Stars
4,1LINE – One Line with One Touch,Love game deleting I've waiting 3 months level...,Positive,0.059091,0.363636,GAME,4.6,4-5 Stars
...,...,...,...,...,...,...,...,...
52527,Hotspot Shield Free VPN Proxy & Wi-Fi Security,really great,Positive,0.800000,0.750000,TOOLS,4.2,4-5 Stars
52528,Hotspot Shield Free VPN Proxy & Wi-Fi Security,Good,Positive,0.700000,0.600000,TOOLS,4.2,4-5 Stars
52529,Hotspot Shield Free VPN Proxy & Wi-Fi Security,Good,Positive,0.700000,0.600000,TOOLS,4.2,4-5 Stars
52530,Hotspot Shield Free VPN Proxy & Wi-Fi Security,Good,Positive,0.700000,0.600000,TOOLS,4.2,4-5 Stars


In [81]:
# Drop rows with missing sentiment or rating group
sentiment_counts = merged_df.dropna(subset=['Sentiment', 'Rating Group'])

# Group by Category, Rating Group and Sentiment
grouped = sentiment_counts.groupby(['Category', 'Rating Group', 'Sentiment']).size().reset_index(name='Count')

# Pivot the table to get Sentiment values as columns
pivot_df = grouped.pivot_table(index=['Category', 'Rating Group'],
                                columns='Sentiment',
                                values='Count',
                                fill_value=0).reset_index()
pivot_df

Sentiment,Category,Rating Group,Negative,Neutral,Positive
0,FAMILY,3-4 Stars,120.0,67.0,196.0
1,FAMILY,4-5 Stars,1425.0,385.0,3634.0
2,GAME,3-4 Stars,78.0,14.0,124.0
3,GAME,4-5 Stars,7174.0,791.0,10860.0
4,PHOTOGRAPHY,3-4 Stars,41.0,17.0,80.0
5,PHOTOGRAPHY,4-5 Stars,444.0,270.0,1629.0
6,PRODUCTIVITY,3-4 Stars,21.0,14.0,40.0
7,PRODUCTIVITY,4-5 Stars,572.0,361.0,2090.0
8,TOOLS,3-4 Stars,58.0,76.0,137.0
9,TOOLS,4-5 Stars,289.0,281.0,953.0


In [82]:
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'DejaVu Sans'  # or 'Arial', 'Calibri', etc.

# Plot for each category
categories = pivot_df['Category'].unique()

for category in categories:
    cat_df = pivot_df[pivot_df['Category'] == category]
    cat_df = cat_df.set_index('Rating Group')[['Positive', 'Neutral', 'Negative']]

    cat_df.plot(kind='bar', stacked=True, figsize=(8, 5), title=f'Sentiment Distribution for {category}')
    plt.xlabel('Rating Group')
    plt.ylabel('Number of Reviews')
    plt.xticks(rotation=0)
    plt.legend(title='Sentiment')
    plt.tight_layout()
    plt.show()


FigureCanvasAgg is non-interactive, and thus cannot be shown


FigureCanvasAgg is non-interactive, and thus cannot be shown


FigureCanvasAgg is non-interactive, and thus cannot be shown


FigureCanvasAgg is non-interactive, and thus cannot be shown


FigureCanvasAgg is non-interactive, and thus cannot be shown



In [83]:
import pandas as pd

# Sample fake dataset with country info for testing
data = {
    "App": ["App A", "App B", "App C", "App D", "App E", "App F", "App G"],
    "Category": ["HEALTH", "TOOLS", "FINANCE", "GAMES", "PRODUCTIVITY", "BUSINESS", "FITNESS"],
    "Installs": [5000000, 1200000, 3000000, 700000, 2000000, 4000000, 100000],
    "Country": ["India", "USA", "Germany", "India", "UK", "France", "Japan"]
}

df = pd.DataFrame(data)

# Save as CSV for use in Streamlit
df.to_csv("sample_apps_with_country.csv", index=False)


In [None]:
!pip install pycountry

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from datetime import datetime
import pytz
import pycountry
import random


In [None]:
df_apps = pd.read_csv("Play Store Data.csv")
df_reviews = pd.read_csv("User Reviews.csv")


In [None]:
# Merge datasets
df_merged = df_apps.merge(df_reviews, on="App")

# Clean and filter apps with more than 1000 reviews
df_merged['Reviews'] = pd.to_numeric(df_merged['Reviews'], errors='coerce')
df_merged = df_merged[df_merged['Reviews'] > 1000]

# Create rating groups
def group_rating(rating):
    try:
        if rating <= 2:
            return '1-2 stars'
        elif rating <= 4:
            return '3-4 stars'
        else:
            return '4-5 stars'
    except:
        return None

df_merged['Rating Group'] = df_merged['Rating'].apply(group_rating)

# Top 5 categories only
top5_categories = df_merged['Category'].value_counts().head(5).index.tolist()
df_top = df_merged[df_merged['Category'].isin(top5_categories)]

# Group by Category, Rating Group, Sentiment
sentiment_counts = df_top.groupby(['Category', 'Rating Group', 'Sentiment']).size().reset_index(name='Count')

# Pivot to get stacked format
pivot_df = sentiment_counts.pivot_table(index=['Category', 'Rating Group'], columns='Sentiment', values='Count', fill_value=0).reset_index()

# Plot
fig_bar = px.bar(
    pivot_df,
    x="Category",
    y=["Positive", "Neutral", "Negative"],
    color_discrete_sequence=["green", "gray", "red"],
    barmode="stack",
    facet_col="Rating Group",
    title="Sentiment Distribution by Rating Group"
)
fig_bar.show()


In [None]:
from datetime import datetime
import pytz
import plotly.express as px
import pycountry
import pandas as pd

# Sample DataFrame for demonstration (replace with your actual df_top5)
df_top5 = pd.DataFrame({
    'Country': ['India', 'United States', 'Brazil', 'Germany', 'Australia'],
    'Installs': [5000, 3000, 2000, 1500, 1000],
    'App': ['App A', 'App B', 'App C', 'App D', 'App E'],
    'Category': ['Social', 'Games', 'Productivity', 'Health', 'Education']
})

# Helper function to convert country name to ISO-3
def get_iso3(country_name):
    try:
        return pycountry.countries.lookup(country_name).alpha_3
    except:
        return None

# Clean DataFrame
df_top5 = df_top5.copy()
df_top5['iso_alpha'] = df_top5['Country'].apply(get_iso3)
df_top5 = df_top5.dropna(subset=['iso_alpha'])

# Show map only between 6 PM and 8 PM IST
ist = pytz.timezone('Asia/Kolkata')
now = datetime.now(ist)

if 18 <= now.hour < 20:
    fig = px.choropleth(
        df_top5,
        locations='iso_alpha',
        color='Installs',
        hover_name='App',
        animation_frame='Category',  # Ensure this column exists
        title='Global Installs by App Category (Visible only 6–8 PM IST)',
        color_continuous_scale='Blues',
    )
    fig.update_layout(geo=dict(showframe=False, showcoastlines=True))
    fig.show()
else:
    print("Choropleth map is only visible between 6 PM and 8 PM IST.")

In [None]:
pip install dash plotly pandas pycountry pytz

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import pytz
%matplotlib inline


In [None]:
# Load the dataset
df = pd.read_csv('Play Store Data.csv')

# Clean column names
df.rename(columns=lambda x: x.strip().lower().replace(' ', '_'), inplace=True)

# Quick check of the dataset structure
df.head()

In [None]:
# Convert reviews to numeric (force errors to NaN)
df['reviews'] = pd.to_numeric(df['reviews'], errors='coerce')

# Drop rows where reviews could not be converted
df = df.dropna(subset=['reviews'])

# Keep only reviews > 500
df = df[df['reviews'] > 500]
df


In [None]:
# Keep categories starting with E, C, or B
df = df[df['category'].str.startswith(('E', 'C', 'B'))]

# Exclude apps starting with x, y, z and apps containing 'S' (case-insensitive)
mask = (
    ~df['app'].str.lower().str.startswith(tuple('xyz')) &
    ~df['app'].str.contains('S', case=False)
)
df = df[mask].copy()
df

In [None]:
# Define translations
translations = {
    'Beauty': 'सुंदरता',       # Hindi
    'Business': 'வர்த்தகம்',    # Tamil
    'Dating': 'Dating'          # German (or you can use "Verabredung")
}

# Apply translation
df['category_trans'] = df['category'].map(translations).fillna(df['category'])
df

In [None]:
# Clean installs: convert to numbers, handle 'free'
def parse_installs(x):
    if isinstance(x, str) and x.strip().lower() == 'free':
        return np.nan
    try:
        return float(x.replace(',', '').replace('+', ''))
    except:
        return np.nan

df['installs_clean'] = df['installs'].apply(parse_installs)

# Drop rows with NaN installs
df = df.dropna(subset=['installs_clean'])
df


In [None]:
# Convert 'last_updated' to datetime
df['last_updated'] = pd.to_datetime(df['last_updated'], errors='coerce')

# Drop rows where date is missing
df = df.dropna(subset=['last_updated'])

# Create 'month' column for monthly grouping
df['month'] = df['last_updated'].dt.to_period('M').dt.to_timestamp()
df

In [None]:
# Group by month and category
grp = (
    df.groupby(['month', 'category_trans'])['installs_clean']
    .sum()
    .reset_index()
)

# Calculate Month-over-Month % change
grp['pct_mom'] = grp.groupby('category_trans')['installs_clean'].pct_change()

# Identify growth > 20%
grp['grow20'] = grp['pct_mom'] > 0.20
grp

In [None]:
def plot_installs(grp_df):
    fig, ax = plt.subplots(figsize=(14, 7))
    
    for cat, subset in grp_df.groupby('category_trans'):
        ax.plot(subset['month'], subset['installs_clean'], label=cat)
        ax.fill_between(
            subset['month'],
            0, subset['installs_clean'],
            where=subset['grow20'],
            alpha=0.3,
            label=f'{cat} Growth > 20%'
        )
    
    ax.set_title('Monthly Total Installs by Category (Filtered & Translated)')
    ax.set_xlabel('Month')
    ax.set_ylabel('Total Installs')
    ax.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Time restriction: show only between 6 PM to 9 PM IST
now_ist = datetime.now(pytz.timezone('Asia/Kolkata'))
if 18 <= now_ist.hour < 21:
    plot_installs(grp)
else:
    print(" Chart visible only between 18:00–21:00 IST")


In [None]:
import os
import webbrowser
from plotly.io import to_html
from datetime import datetime
import pytz

# Your HTML dashboard template
dashboard_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Review Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: auto;
            position: relative;
            cursor: pointer;
            background-color: #222;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0,0,0,0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights {{
            display: block;
        }}
    </style>
    <script>
        function openPlot(filename) {{
            window.open(filename, '_blank');
        }}
    </script>
</head>
<body>
    <div class="header">
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
        <h1>Google Play Store Reviews Analytics</h1>
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
    </div>
    <div class="container">
        {plots}
    </div>
</body>
</html>
"""

# Set time limits
ist = pytz.timezone("Asia/Kolkata")
now = datetime.now(ist)
show_task2 = 18 <= now.hour < 20  # 6–8 PM
show_task3 = 18 <= now.hour < 21  # 6–9 PM

# Convert Plotly figures to HTML
task2_html = to_html(fig2, full_html=False, include_plotlyjs='cdn') if show_task2 else "<p style='color:red;'>Task 2 Choropleth Map is only visible between 6–8 PM IST.</p>"
task3_html = to_html(fig3, full_html=False, include_plotlyjs='cdn') if show_task3 else "<p style='color:red;'>Task 3 Time Series Chart is only visible between 6–9 PM IST.</p>"

# Combine plot containers
plot_containers = f"""
<div class="plot-container">
    <h2>Task 2: Choropleth Map</h2>
    {task2_html}
</div>
<div class="plot-container">
    <h2>Task 3: Time Series Trend Chart</h2>
    {task3_html}
</div>
"""

# Set plot dimensions
plot_width = 800
plot_height = 600

# Final HTML content
final_html = dashboard_html.format(
    plots=plot_containers,
    plot_width=plot_width,
    plot_height=plot_height
)

# Save and open in browser
html_files_path = "visualizations"
os.makedirs(html_files_path, exist_ok=True)
dashboard_path = os.path.join(html_files_path, "web page.html")

with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

webbrowser.open('file://' + os.path.realpath(dashboard_path))
