In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import squarify


In [None]:
# Folder that holds the input CSV files. Set the PLAY_STORE_DATA_DIR environment
# variable to run the notebook on another machine; the original local folder is
# kept as the default so existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get("PLAY_STORE_DATA_DIR", r"C:\Users\monis\OneDrive\Desktop\NULL"))

# Raw Play Store listing data (one row per app listing).
play_store_df = pd.read_csv(DATA_DIR / "Play Store Data.csv")
play_store_df


In [None]:
# Show the dtype pandas inferred for each Play Store column right after loading.
play_store_df.dtypes

In [None]:
# Folder that holds the input CSV files. Set the PLAY_STORE_DATA_DIR environment
# variable to run the notebook on another machine; the original local folder is
# kept as the default so existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get("PLAY_STORE_DATA_DIR", r"C:\Users\monis\OneDrive\Desktop\NULL"))

# Raw user review data.
review_df = pd.read_csv(DATA_DIR / "User Reviews.csv")
review_df


In [None]:
# Show the dtype pandas inferred for each review column right after loading.
review_df.dtypes

In [None]:
# Replace missing values with 0 in every column except 'Rating'.
# A missing rating means "not rated", not "rated 0": zero-filling it would add
# fake 0-star apps to every rating histogram, box plot and average, and would
# turn the later `dropna` calls on 'Rating' into no-ops. Leaving it as NaN lets
# pandas/plotly/seaborn skip those rows automatically.
non_rating_fill = {column: 0 for column in play_store_df.columns if column != 'Rating'}
play_store_df = play_store_df.fillna(value=non_rating_fill)
play_store_df

In [None]:
# Drop reviews that have no text before filling the remaining gaps.
# Zero-filling a missing review turns it into the value 0, which is later scored
# as the text "0" (always neutral) and counted as the most frequent "review" in
# the top-reviews chart. Rows without text carry no sentiment information.
if 'Translated_Review' in review_df.columns:
    review_df = review_df.dropna(subset=['Translated_Review']).reset_index(drop=True)

# Any other missing value keeps the original 0 placeholder.
review_df = review_df.fillna(0)
review_df

In [None]:
# Normalise 'Installs' (e.g. "1,000,000+") to plain integers.
# - astype(str) makes the cell safe to re-run after the column is already numeric
#   (the .str accessor raises on an int column).
# - regex=False treats ',' and '+' as literal characters; '+' is a regex
#   metacharacter and must not be interpreted as a pattern.
# - 'Free' appears in this column in at least one row and is mapped to 0.
installs_text = (
    play_store_df['Installs']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
    .str.replace('Free', '0', regex=False)
)

# astype(int) still fails loudly on any other unexpected value.
play_store_df['Installs'] = installs_text.astype(int)
play_store_df

In [None]:

# Re-check dtypes after cleaning; 'Installs' was cast to int in the previous cell.
play_store_df.dtypes

In [None]:

# Attach every review to its app's listing data; apps that appear in only one
# of the two tables are dropped by the inner join.
merged_df = play_store_df.merge(review_df, how='inner', on='App')
merged_df

In [None]:
# Inspect the column dtypes of the merged listing/review table.
merged_df.dtypes

In [None]:
def convert_size(size):
    """Convert a Play Store size label to megabytes.

    Args:
        size: Size label such as '19M' or '512k', the text
            'Varies with device', or an already-numeric value.

    Returns:
        float: Size in megabytes for 'M'/'k' labels ('k' values are divided
        by 1024). ``np.nan`` for any string that cannot be parsed, including
        'Varies with device'. Non-string inputs are returned unchanged, so
        the function is safe to apply to an already-converted column.
    """
    if not isinstance(size, str):
        return size

    label = size.strip()
    try:
        if label.endswith('M'):
            return float(label[:-1])
        if label.endswith('k'):
            return float(label[:-1]) / 1024
    except ValueError:
        # Suffix looked right but the numeric part is malformed.
        pass

    # Unknown labels become NaN instead of leaking raw strings into what is
    # meant to be a numeric column.
    return np.nan
# Convert the 'Size' labels in both tables using convert_size.
for size_frame in (play_store_df, merged_df):
    size_frame['Size'] = size_frame['Size'].apply(convert_size)

In [None]:
def rating_group(rating):
    """
    Map a numeric app rating to a descriptive rating group.

    Args:
        rating (float): The rating of the app.

    Returns:
        str: 'Top rated app' for ratings of at least 4, 'Above average' for at
        least 3, 'Average' for at least 2, otherwise 'Below Average'.
    """
    # Thresholds are checked from highest to lowest; the first match wins.
    tiers = (
        (4, 'Top rated app'),
        (3, 'Above average'),
        (2, 'Average'),
    )
    for lower_bound, label in tiers:
        if rating >= lower_bound:
            return label
    return 'Below Average'

# Label every app with its rating group.
play_store_df['Rating_Group'] = play_store_df['Rating'].map(rating_group)

In [None]:
# Normalise 'Price' (e.g. "$4.99") to a float number of dollars.
# - astype(str) makes the cell safe to re-run after the column is already numeric.
# - regex=False treats '$' as a literal character; as a regex it is the
#   end-of-string anchor and would not remove the symbol.
# - 'Everyone' appears in this column in at least one row and is mapped to 0.
price_text = (
    play_store_df['Price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace('Everyone', '0', regex=False)
)

# Keep the cents: truncating to int turned every $0.99 app into a $0 app and
# made its computed revenue zero.
play_store_df['Price'] = price_text.astype(float)

In [None]:
# Revenue estimate per app: install count multiplied by list price.
play_store_df['Revenue'] = play_store_df['Installs'] * play_store_df['Price']

In [None]:

# Parse 'Last Updated' into datetimes; errors='coerce' turns values that cannot
# be parsed into NaT instead of raising.
play_store_df['Last Updated'] = pd.to_datetime(play_store_df['Last Updated'], errors='coerce')

In [None]:
import ssl

import nltk

# Fetch the VADER lexicon only when it is not already installed locally.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    # SECURITY: the download below runs with TLS certificate verification
    # disabled (kept from the original notebook as a workaround). The override
    # is limited to this one download and always restored afterwards, so later
    # HTTPS requests in this kernel are verified again.
    _default_https_context = ssl._create_default_https_context
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        nltk.download('vader_lexicon')
    finally:
        ssl._create_default_https_context = _default_https_context

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()


def _compound_score(review):
    """Return VADER's compound polarity score for one review, cast to text first."""
    return sia.polarity_scores(str(review))['compound']


# One compound sentiment score per review row.
merged_df['Sentiment_Score'] = merged_df['Translated_Review'].apply(_compound_score)

In [None]:
def sentiment_category(score):
    """Bucket a compound sentiment score into 'Positive', 'Negative' or 'Neutral'.

    Scores of at least 0.05 are 'Positive', scores of at most -0.05 are
    'Negative', and everything in between is 'Neutral'.
    """
    if score >= 0.05:
        return 'Positive'
    return 'Negative' if score <= -0.05 else 'Neutral'

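# NOTE: sentiment_category is not applied in this cell; a later cell redefines it with different thresholds (score > 0 / score < 0) before applying it.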

In [None]:
import plotly.express as px

# Installs aggregated per category (px.histogram aggregates y within each x bin).
fig = px.histogram(
    play_store_df,
    x='Category',
    y='Installs',
    color_discrete_sequence=['black'],
)
fig.show()

In [None]:

# Pie chart of how many apps fall under each value of 'Type'.
types_fig = px.pie(
    play_store_df,
    names='Type',
    title='Distribution of App Types',
)
types_fig.show()

In [None]:
# Histogram of app ratings, with a small gap between bars for readability.
rating_hist_options = dict(
    x="Rating",
    nbins=10,
    color_discrete_sequence=['black'],
    title='Distribution of App Ratings',
)
rating_fig = px.histogram(play_store_df, **rating_hist_options)
rating_fig.update_layout(bargap=0.1)
rating_fig.show()

In [None]:
import plotly.express as px



# The 20 most frequent review texts and how often each occurs.
top_reviews = merged_df['Translated_Review'].value_counts().head(20)

review_fig = px.bar(
    x=top_reviews.index,
    y=top_reviews.values,
    labels={'x': 'Translated_Review', 'y': 'Count'},
    title='Distribution of Top 20 Reviews',
    color_discrete_sequence=['black'],
)

# White plot area on a black page, with a fixed size and tilted x tick labels.
review_layout = {
    'plot_bgcolor': 'white',
    'paper_bgcolor': 'black',
    'font_color': 'white',
    'title_font': {'size': 16},
    'xaxis': {'title_font': {'size': 12}, 'tickangle': 45},
    'yaxis': {'title_font': {'size': 12}},
    'margin': {'l': 10, 'r': 10, 't': 30, 'b': 10},
    'width': 1000,
    'height': 600,
}
review_fig.update_layout(**review_layout)

review_fig.show()

In [None]:
# Total installs per category, keeping the ten largest.
total_installs_per_category = play_store_df.groupby('Category')['Installs'].sum()
installs_by_category = total_installs_per_category.nlargest(10)


In [None]:
# Bar chart of the ten categories with the most installs.
installs_bar_options = {
    'x': installs_by_category.index,
    'y': installs_by_category.values,
    'labels': {'x': 'Category', 'y': 'Total Installs'},
    'title': 'Total Installs by Category (Top 10)',
    'color_discrete_sequence': ['black'],
}
installs_by_category_fig = px.bar(**installs_bar_options)

installs_by_category_fig.show()

In [None]:
import plotly.express as px


# Number of apps whose last update falls in each calendar year, in year order.
update_years = play_store_df['Last Updated'].dt.year
updates_per_year = update_years.value_counts().sort_index()

updates_bar_options = {
    'x': updates_per_year.index,
    'y': updates_per_year.values,
    'labels': {'x': 'Year', 'y': 'Number of Updates'},
    'title': 'App Updates per Year',
    'color_discrete_sequence': ['black'],
}
updates_per_year_fig = px.bar(**updates_bar_options)

updates_per_year_fig.show()

In [None]:
import plotly.express as px


# Mean revenue per app within each category, keeping the ten highest.
# The variable name, axis label and chart title all describe an average, so the
# aggregation is mean() rather than the sum() that was charted before.
avg_revenue_by_category = (
    play_store_df.groupby('Category')['Revenue'].mean().nlargest(10)
)

avg_revenue_by_category_fig = px.bar(
    x=avg_revenue_by_category.index,
    y=avg_revenue_by_category.values,
    labels={'x': 'Category', 'y': 'Average Revenue'},
    title='Average App Revenue by Category',
    color_discrete_sequence=['black']
)

avg_revenue_by_category_fig.show()

In [None]:
import plotly.express as px


# The ten most common genres and their app counts.
all_genre_counts = play_store_df['Genres'].value_counts()
genre_counts = all_genre_counts.nlargest(10)

genre_bar_options = {
    'x': genre_counts.index,
    'y': genre_counts.values,
    'labels': {'x': 'Genre', 'y': 'Count'},
    'title': 'Genre Count in Play Store Data',
    'color_discrete_sequence': ['black'],
}
genre_counts_fig = px.bar(**genre_bar_options)

genre_counts_fig.show()

In [None]:
import plotly.express as px


# Scatter of each app's rating against the date it was last updated.
last_updated_scatter_options = {
    'x': 'Last Updated',
    'y': 'Rating',
    'title': 'Scatter Plot: Last Updated vs. Rating',
    'color_discrete_sequence': ['black'],
}
lu_fig = px.scatter(play_store_df, **last_updated_scatter_options)

lu_fig.show()

In [None]:
import plotly.express as px


# Box plot comparing the rating distribution across app types.
type_rating_box_options = {
    'x': 'Type',
    'y': 'Rating',
    'title': 'Rating for Paid vs Free Apps',
    'color_discrete_sequence': ['black'],
}
tr_fig = px.box(play_store_df, **type_rating_box_options)

tr_fig.show()

In [None]:
# Static 4x2 overview dashboard drawn with the pyplot state machine: each
# plt.subplot(...) call selects the panel that the following calls draw into,
# so the statement order below is significant.
plt.figure(figsize=(18, 24))


# Panel 1: donut chart of the five categories with the most app rows.
plt.subplot(4, 2, 1)  
top_cats = play_store_df['Category'].value_counts().head(5)
wedges, texts, autotexts = plt.pie(
    top_cats, labels=top_cats.index, autopct='%1.1f%%', startangle=140
)
# A white circle drawn over the pie centre produces the donut hole.
plt.gca().add_artist(plt.Circle((0, 0), 0.70, color='white'))
plt.title('Top 5 Categories (Donut)')


# Panel 2: five apps with the highest summed Sentiment_Score.
# NOTE(review): the title says "by Reviews" but the ranking uses the sum of
# sentiment scores, not a review count — confirm which was intended.
plt.subplot(4, 2, 2)  
top_apps = (
    merged_df.groupby('App')['Sentiment_Score']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)
sns.barplot(x=top_apps.values, y=top_apps.index, palette='magma')
plt.title('Top 5 Apps by Reviews (Bar)')


# Panel 3: pie chart of the number of apps per 'Type' value.
plt.subplot(4, 2, 3) 
type_counts = play_store_df['Type'].value_counts()
plt.pie(type_counts, labels=type_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('App Type (Pie)')

# Panel 4: treemap of the five categories with the most total installs.
plt.subplot(4, 2, 4)  
top_installs = (
    play_store_df.groupby('Category')['Installs']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)

# One colour per treemap tile (five tiles).
colors = ['red', 'blue', 'green', 'orange', 'purple']
squarify.plot(sizes=top_installs.values, label=top_installs.index, alpha=0.8, color=colors)
plt.axis('off')
plt.title('Top 5 Categories by Installs (Treemap)')


# Panel 5: histogram of ratings with a KDE overlay; missing ratings are dropped.
plt.subplot(4, 2, 5) 
sns.histplot(play_store_df['Rating'].dropna(), bins=10, kde=True)
plt.title('App Ratings (Histogram)')


# Panel 6: rating distribution for each content rating.
plt.subplot(4, 2, 6)  
sns.boxplot(x='Content Rating', y='Rating', data=play_store_df)
plt.xticks(rotation=45, ha='right')  
plt.title('Rating Distribution by Content Rating (Box Plot)')


# Panel 7: rating against install count, one point per app row.
plt.subplot(4, 2, 7)
sns.scatterplot(x='Installs', y='Rating', data=play_store_df)
plt.title('Installs vs. Rating (Scatter Plot)')


# Panel 8: mean rating of the apps last updated in each month.
plt.subplot(4, 2, 8) 

# pd.Grouper buckets rows by month of 'Last Updated' (which must already be datetime).
avg_rating_over_time = (
    play_store_df.groupby(pd.Grouper(key='Last Updated', freq='M'))['Rating']
    .mean()
    .reset_index()
)
sns.lineplot(x='Last Updated', y='Rating', data=avg_rating_over_time)
plt.xticks(rotation=45, ha='right') 
plt.title('Average Rating Over Time (Line Chart)')

plt.tight_layout()
plt.show()

In [None]:

# 1. Visualize the sentiment distribution (positive, neutral, negative) of user reviews using a stacked bar chart, 
# segmented by rating groups (e.g., 1-2 stars, 3-4 stars, 4-5 stars). Include only apps with more than 1,000 reviews and group by the top 5 categories.


# Review counts must be numeric before they can be compared with the threshold.
merged_df['Reviews'] = merged_df['Reviews'].astype(int)

# Keep rows of apps with more than 1,000 reviews, then drop rows whose rating
# is missing or cannot be parsed as a number.
has_many_reviews = merged_df['Reviews'] > 1000
filtered_df = merged_df[has_many_reviews].copy()
filtered_df['Rating'] = pd.to_numeric(filtered_df['Rating'], errors='coerce')
filtered_df = filtered_df.dropna(subset=['Rating'])

# Rating group labels used on the x axis of the stacked sentiment chart.
def categorize_rating_group(rating):
    """Return the star-band label for a numeric rating.

    Ratings of at least 4 map to '4-5 stars', at least 3 to '3-4 stars',
    at least 1 to '1-2 stars', and anything else to 'Below 1 star'.
    """
    star_bands = (
        (4, '4-5 stars'),
        (3, '3-4 stars'),
        (1, '1-2 stars'),
    )
    for lower_bound, label in star_bands:
        if rating >= lower_bound:
            return label
    return 'Below 1 star'

# Tag each row with its star band.
filtered_df['Rating_Group'] = filtered_df['Rating'].apply(categorize_rating_group)

# Use the five categories with the most rows; with fewer than five distinct
# categories, use all of them in order of first appearance.
distinct_categories = filtered_df['Category'].unique()
if len(distinct_categories) < 5:
    top_categories = distinct_categories.tolist()
else:
    top_categories = filtered_df['Category'].value_counts().nlargest(5).index.tolist()

# Restrict the data to the selected categories.
in_top_categories = filtered_df['Category'].isin(top_categories)
filtered_df_top_cats = filtered_df[in_top_categories].copy()


def sentiment_category(score):
    """Classify a sentiment score by its sign.

    Missing scores and a score of exactly 0 are 'Neutral'; positive scores are
    'Positive' and negative scores are 'Negative'. This definition replaces the
    earlier `sentiment_category` in this notebook, which used +/-0.05 thresholds.
    """
    if pd.isna(score) or score == 0:
        return 'Neutral'
    return 'Positive' if score > 0 else 'Negative'

# Label every review row with its sentiment category.
filtered_df_top_cats['Sentiment_Score'] = pd.to_numeric(filtered_df_top_cats['Sentiment_Score'], errors='coerce')
filtered_df_top_cats['Sentiment_Category'] = filtered_df_top_cats['Sentiment_Score'].apply(sentiment_category)

# Stacked bar chart with a dropdown that switches between categories.
sentiment_fig = go.Figure()

rating_order = ['Below 1 star', '1-2 stars', '3-4 stars', '4-5 stars']
sentiments = ['Positive', 'Neutral', 'Negative']
colors = {'Positive': 'darkgreen', 'Neutral': 'gray', 'Negative': 'darkred'}

# Add exactly len(sentiments) traces per category, in a fixed order.
# The dropdown visibility masks below rely on that layout. Adding a trace only
# when a sentiment happened to occur in the data shifted every later trace and
# made the buttons toggle the wrong bars.
for i, category in enumerate(top_categories):
    data_cat = filtered_df_top_cats[filtered_df_top_cats['Category'] == category]
    grouped = (
        data_cat.groupby(['Rating_Group', 'Sentiment_Category'])
        .size()
        .unstack(fill_value=0)
        # Missing rating groups or sentiments become explicit zero counts.
        .reindex(index=rating_order, columns=sentiments, fill_value=0)
    )

    for sentiment in sentiments:
        sentiment_fig.add_trace(go.Bar(
            x=grouped.index,
            y=grouped[sentiment],
            name=sentiment,
            marker_color=colors[sentiment],
            visible=(i == 0),          # only the first category is shown initially
            offsetgroup=category,
            legendgroup=sentiment,
            showlegend=(i == 0)        # one legend entry per sentiment
        ))

# One dropdown button per category; each shows only that category's traces.
buttons = []
for i, category in enumerate(top_categories):
    visibility = []
    for j in range(len(top_categories)):
        visibility.extend([j == i] * len(sentiments))

    buttons.append(dict(
        label=category,
        method='update',
        args=[
            {'visible': visibility},
            {'title': f'Sentiment Distribution by Rating Group — {category}'}
        ]
    ))

# NOTE(review): each counted row is one row of the merged review table, so the
# y axis looks like a review count rather than an app count — confirm the label.
sentiment_fig.update_layout(
    title=f"Sentiment Distribution by Rating Group — {top_categories[0] if top_categories else 'No Data'}",
    xaxis_title="Rating Group",
    yaxis_title="App Count",
    barmode="stack",
    width=1000,
    height=600,
    updatemenus=[dict(
        active=0,
        buttons=buttons,
        x=0.5,
        xanchor="center",
        y=1.15,
        yanchor="top"
    )],
    legend_title="Sentiment",
    plot_bgcolor='white',
    paper_bgcolor='white',
    font_color='black'
)

sentiment_fig.show()


In [None]:
# 2. Create an interactive Choropleth map using Plotly to visualize global installs by Category.
# Apply filters to show data for only the top 5 app categories and highlight category where the number of installs exceeds 1 million.
# The app category should not start with the characters “A,” “C,” “G,” or “S.” 
# This graph should work only between 6 PM IST and 8 PM IST; apart from that time, we should not show it in the dashboard itself
import plotly.express as px
from datetime import datetime
import pytz


# The map is shown only between 18:00 and 20:00 India Standard Time.
ist = pytz.timezone('Asia/Kolkata')
current_time_ist = datetime.now(ist)
within_display_window = 18 <= current_time_ist.hour < 20

if not within_display_window:
    print("Choropleth map is only available between 6 PM and 8 PM IST.")
else:
    # Work only with rows that have a positive, numeric install count.
    play_store_df['Installs'] = pd.to_numeric(play_store_df['Installs'], errors='coerce')
    has_installs = play_store_df['Installs'] > 0
    filtered_df = play_store_df[has_installs].copy()

    # Five categories with the highest total installs.
    installs_by_category = (
        filtered_df.groupby('Category')['Installs']
        .sum()
        .reset_index()
    )
    top_5_categories = installs_by_category.nlargest(5, 'Installs')['Category'].tolist()

    in_top_5 = filtered_df['Category'].isin(top_5_categories)
    filtered_top_5 = filtered_df[in_top_5].copy()

    # Remove categories whose name starts with A, C, G or S.
    excluded_initials = ('A', 'C', 'G', 'S')
    has_excluded_initial = filtered_top_5['Category'].str.startswith(excluded_initials, na=False)
    filtered_top_5 = filtered_top_5[~has_excluded_initial].copy()

    # When the data has no country column, every row is placed on one country.
    if 'Country_Code' not in filtered_top_5.columns:
        filtered_top_5['Country_Code'] = 'United States'

    # Facet label separating rows above and below one million installs.
    highlight_labels = {
        True: 'Over 1 Million Installs',
        False: '1 Million Installs or Less',
    }
    filtered_top_5['Highlight'] = (filtered_top_5['Installs'] > 1_000_000).map(highlight_labels)

    choropleth_options = dict(
        locations="Country_Code",
        locationmode="country names",  # or 'ISO-3' if using ISO codes
        color="Installs",
        hover_name="Category",
        title="Global Installs by Top 5 App Categories (Excluding A, C, G, S)",
        color_continuous_scale="Viridis",
        range_color=(0, filtered_top_5['Installs'].max()),
        facet_col="Highlight",
        facet_col_wrap=2,
        projection="natural earth",
    )
    choropleth_fig = px.choropleth(filtered_top_5, **choropleth_options)

    choropleth_fig.show()


In [None]:
# 3. Plot a time series line chart to show the trend of total installs over time, segmented by app category.
# Highlight periods of significant growth by shading the areas under the curve where the increase in installs exceeds 20% month-over-month 
# and app name should not starts with x, y ,z and app category should start with letter " E " or " C " or " B " and 
# We have to translate the Beauty category in Hindi and Business category in Tamil and Dating category in German while showing it on Graph. 
# reviews should be more than 500 the app name should not contain letter "S" as well as this graph should work only 
# between 6 PM IST to 9 PM IST apart from that time we should not show this graph in dashboard itself
import pandas as pd
import plotly.graph_objects as go
from datetime import datetime
import pytz


# The chart is shown only between 18:00 and 21:00 India Standard Time.
ist = pytz.timezone('Asia/Kolkata')
current_time_ist = datetime.now(ist)

start_time = current_time_ist.replace(hour=18, minute=0, second=0, microsecond=0)
end_time = current_time_ist.replace(hour=21, minute=0, second=0, microsecond=0)

if start_time <= current_time_ist <= end_time:

    print("Generating time series plot as it's within the allowed time range (6 PM - 9 PM IST).")

    # Make sure the columns used below have usable dtypes.
    play_store_df['Last Updated'] = pd.to_datetime(play_store_df['Last Updated'], errors='coerce')
    play_store_df['Installs'] = pd.to_numeric(play_store_df['Installs'], errors='coerce')
    merged_df['Reviews'] = pd.to_numeric(merged_df['Reviews'], errors='coerce')

    # Build a table with exactly one row per Play Store listing.
    # Joining on the per-review table repeated every app once per review, which
    # inflated the install totals, and produced suffixed 'Reviews_x'/'Reviews_y'
    # columns whenever both tables already had a 'Reviews' column.
    if 'Reviews' in play_store_df.columns:
        combined_df = play_store_df.copy()
    else:
        reviews_per_app = merged_df[['App', 'Reviews']].drop_duplicates(subset='App')
        combined_df = pd.merge(play_store_df, reviews_per_app, on='App', how='left')
    combined_df['Reviews'] = pd.to_numeric(combined_df['Reviews'], errors='coerce')

    combined_df = combined_df.dropna(subset=['Last Updated', 'Installs', 'Reviews'])

    # Case-insensitive filters. Series.str.startswith has no `case` argument,
    # so the text is normalised first instead.
    app_names = combined_df['App'].astype(str).str.lower()
    category_names = combined_df['Category'].astype(str).str.upper()
    keep_rows = (
        (combined_df['Reviews'] > 500)
        & ~app_names.str.startswith(('x', 'y', 'z'))
        & ~app_names.str.contains('s', regex=False)
        & category_names.str.startswith(('E', 'C', 'B'))
    )
    filtered_df = combined_df[keep_rows].copy()

    if filtered_df.empty:
        print("No data available after applying filters. Cannot generate plot.")
    else:
        # Total installs per category and calendar month.
        filtered_df = filtered_df.set_index('Last Updated')
        monthly_installs = filtered_df.groupby('Category').resample('M')['Installs'].sum().reset_index()

        # Month-over-month growth in percent, plus the previous month's point
        # for each row. Taking the previous point from the data itself avoids
        # date arithmetic that misses month-end dates (e.g. Feb 28 minus one
        # month is Jan 28, not Jan 31).
        per_category = monthly_installs.groupby('Category')
        monthly_installs['MoM_Installs'] = per_category['Installs'].pct_change() * 100
        monthly_installs['Prev_Date'] = per_category['Last Updated'].shift(1)
        monthly_installs['Prev_Installs'] = per_category['Installs'].shift(1)

        category_translations = {
            'BEAUTY': 'सुंदरता',  # Beauty in Hindi
            'BUSINESS': 'வணிகம்', # Business in Tamil
            'DATING': 'Dating'    # Dating in German (same as English)
        }

        monthly_installs['Category_Translated'] = monthly_installs['Category'].apply(
            lambda x: category_translations.get(x.upper(), x)  # case-insensitive lookup
        )

        time_fig = go.Figure()

        # The growth legend entry is attached to the first shaded area drawn,
        # whichever category it belongs to.
        growth_legend_shown = False

        for category in monthly_installs['Category'].unique():
            data_cat = monthly_installs[monthly_installs['Category'] == category]

            # One line per category, labelled with the translated name.
            time_fig.add_trace(go.Scatter(
                x=data_cat['Last Updated'],
                y=data_cat['Installs'],
                mode='lines',
                name=data_cat['Category_Translated'].iloc[0],
                line=dict(width=2),
                # Plotly exposes the trace name as fullData.name.
                hovertemplate='Category: %{fullData.name}<br>Date: %{x|%Y-%m-%d}<br>Installs: %{y}<extra></extra>'
            ))

            # Months whose installs grew by more than 20% over the previous month.
            growth_periods = data_cat[
                (data_cat['MoM_Installs'] > 20) & data_cat['Prev_Date'].notna()
            ]

            for previous_date, previous_installs, current_date, current_installs in zip(
                growth_periods['Prev_Date'],
                growth_periods['Prev_Installs'],
                growth_periods['Last Updated'],
                growth_periods['Installs'],
            ):
                segment_start = previous_date.strftime('%Y-%m-%d')
                segment_end = current_date.strftime('%Y-%m-%d')

                # Shade the area under the line segment between the two months.
                time_fig.add_shape(
                    type="path",
                    fillcolor="rgba(255, 0, 0, 0.3)",  # semi-transparent red
                    line=dict(color="rgba(0,0,0,0)"),  # no border line
                    path=f"M {segment_start} 0 L {segment_start} {previous_installs} L {segment_end} {current_installs} L {segment_end} 0 Z",
                    layer="below",
                    name='Significant Growth (>20% MoM)',
                    showlegend=not growth_legend_shown
                )
                growth_legend_shown = True

        time_fig.update_layout(
            title='Trend of Total Installs Over Time by Category (Filtered)',
            xaxis_title='Date',
            yaxis_title='Total Installs',
            hovermode='x unified',
            xaxis=dict(
                rangeselector=dict(
                    buttons=list([
                        dict(count=1, label="1m", step="month", stepmode="backward"),
                        dict(count=6, label="6m", step="month", stepmode="backward"),
                        dict(count=1, label="YTD", step="year", stepmode="todate"),
                        dict(count=1, label="1y", step="year", stepmode="backward"),
                        dict(step="all")
                    ])
                ),
                rangeslider=dict(visible=True),
                type="date"
            ),
            plot_bgcolor='rgba(0,0,0,0.8)',
            paper_bgcolor='rgba(0,0,0,0.9)',
            font_color='white'
        )

        time_fig.show()

else:

    print(f"Current time in IST ({current_time_ist.strftime('%H:%M')}) is outside the allowed range (6 PM - 9 PM IST). Plot will not be generated.")
