In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


## Load the **Data**

In [None]:
# Read the two source CSV files into DataFrames
apps_df = pd.read_csv('apps.csv')
user_reviews_df = pd.read_csv('user_reviews.csv')

# Preview the first 5 rows of each frame under its own heading
for heading, frame in (("Apps DataFrame:", apps_df),
                       ("\nUser Reviews DataFrame:", user_reviews_df)):
    print(heading)
    display(frame.head())

Apps DataFrame:


Unnamed: 0.1,Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up



User Reviews DataFrame:


Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


## Data Preparation and **Cleaning**

In [None]:
# Clean the apps dataset
def clean_apps_data(df):
    """Return a cleaned copy of the apps dataset.

    The input frame is left untouched; all work happens on a copy, so the
    calling cell can be re-run safely and yields the same result each time.

    Steps performed:
      * drop rows whose 'App' value duplicates an earlier row;
      * fill missing 'Rating' values with the column mean;
      * strip '+' and ',' from 'Installs' and convert it to numeric;
      * convert 'Size' to a number of megabytes and fill gaps with the mean;
      * strip '$' from 'Price' and convert it to numeric;
      * convert 'Reviews' to numeric and 'Last Updated' to datetime.

    Values that cannot be parsed become NaN / NaT (errors='coerce').

    Parameters
    ----------
    df : pandas.DataFrame
        Raw apps table with the columns 'App', 'Rating', 'Installs', 'Size',
        'Price', 'Reviews' and 'Last Updated'.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    # drop_duplicates returns a new frame; the explicit copy makes the
    # column assignments below unambiguous (no chained-assignment warnings).
    cleaned = df.drop_duplicates(subset='App').copy()

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection, which pandas warns about and which stops
    # working in pandas 3.0.
    cleaned['Rating'] = cleaned['Rating'].fillna(cleaned['Rating'].mean())

    # 'Installs' looks like "10,000+": remove the decoration, then convert.
    installs_text = cleaned['Installs'].astype(str).str.replace(r'[+,]', '', regex=True)
    cleaned['Installs'] = pd.to_numeric(installs_text, errors='coerce')

    # 'Size' is expressed in megabytes. A trailing 'M' means megabytes and a
    # trailing 'k' means kilobytes (1 MB = 1000 k, as in the original e3/e6
    # scaling). Bare numbers are kept as-is: the data preview in this
    # notebook shows 'Size' already stored as plain floats such as 19.0 and
    # 8.7, which the previous unconditional division by 1e6 shrank a
    # million-fold. Text such as 'Varies with device' becomes NaN.
    size_text = cleaned['Size'].astype(str).str.strip()
    is_kilobytes = size_text.str.endswith('k', na=False)
    size_number = pd.to_numeric(
        size_text.str.replace(r'[Mk]$', '', regex=True), errors='coerce'
    )
    size_mb = size_number.where(~is_kilobytes, size_number / 1e3)
    cleaned['Size'] = size_mb.fillna(size_mb.mean())

    # 'Price' may carry a leading '$'.
    price_text = cleaned['Price'].astype(str).str.replace('$', '', regex=False)
    cleaned['Price'] = pd.to_numeric(price_text, errors='coerce')

    cleaned['Reviews'] = pd.to_numeric(cleaned['Reviews'], errors='coerce')
    cleaned['Last Updated'] = pd.to_datetime(cleaned['Last Updated'], errors='coerce')

    return cleaned

# Run the apps cleaning routine and keep its result
apps_df = clean_apps_data(apps_df)

# Keep only the reviews that actually have translated text
has_review_text = user_reviews_df['Translated_Review'].notna()
user_reviews_df = user_reviews_df[has_review_text]


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





## Category **Exploration**

In [None]:
# Count the apps in each category (most frequent first)
category_dist = (
    apps_df['Category']
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='Count')
)

# Interactive bar chart: one bar per category, coloured by category
fig_category = px.bar(
    data_frame=category_dist,
    x='Category',
    y='Count',
    color='Category',
    labels={'Count': 'Number of Apps'},
    title='App Distribution Across Categories',
)
fig_category.show()

## Metrics **Analysis**

In [None]:
# Scatter of rating against size; marker area tracks installs, colour tracks category
fig_metrics = px.scatter(
    data_frame=apps_df,
    x='Size',
    y='Rating',
    size='Installs',
    size_max=60,
    color='Category',
    hover_name='App',
    labels={'Size': 'Size (MB)', 'Rating': 'App Rating'},
    title='Rating vs. Size, Colored by Category',
)
fig_metrics.show()

# Price histogram restricted to apps that cost more than zero
is_paid = apps_df['Price'].gt(0)
paid_apps = apps_df.loc[is_paid]
fig_price = px.histogram(
    data_frame=paid_apps,
    x='Price',
    nbins=50,
    labels={'Price': 'Price (USD)'},
    title='Distribution of App Prices',
)
fig_price.show()

## Sentiment **Analysis**

In [None]:
# Count the reviews per sentiment label
sentiment_dist = (
    user_reviews_df['Sentiment']
    .value_counts()
    .rename_axis('Sentiment')
    .reset_index(name='Count')
)

# Pie chart of the overall sentiment split
fig_sentiment = px.pie(
    data_frame=sentiment_dist,
    names='Sentiment',
    values='Count',
    color_discrete_sequence=px.colors.sequential.RdBu,
    title='Distribution of User Sentiments',
)
fig_sentiment.show()

# Attach each review to its app row so sentiment can be grouped by category
merged_df = apps_df.merge(user_reviews_df, on='App')

# Number of reviews for every (category, sentiment) pair
category_sentiment_sizes = merged_df.groupby(['Category', 'Sentiment']).size()
sentiment_by_category = category_sentiment_sizes.reset_index(name='Count')

# Stacked bars: one bar per category, segmented by sentiment
fig_sentiment_category = px.bar(
    data_frame=sentiment_by_category,
    x='Category',
    y='Count',
    color='Sentiment',
    barmode='stack',
    labels={'Count': 'Number of Reviews'},
    title='Sentiment Distribution by App Category',
)
fig_sentiment_category.show()

## Enhanced Interactive **Visualization**

In [None]:
# 2x2 grid: bar and pie on the top row, one scatter spanning the whole bottom row
dashboard_specs = [
    [{'type': 'bar'}, {'type': 'pie'}],
    [{'type': 'scatter', 'colspan': 2}, None],
]
dashboard_titles = (
    'Top 10 App Categories',
    'Sentiment Distribution',
    'Rating vs. Reviews for Top Categories',
)
fig_dashboard = make_subplots(rows=2, cols=2, specs=dashboard_specs,
                              subplot_titles=dashboard_titles)

# Panel 1: the ten categories with the most apps
top_10_categories = category_dist.nlargest(10, 'Count')
top_categories_bar = go.Bar(
    x=top_10_categories['Category'],
    y=top_10_categories['Count'],
    name='Top Categories',
)
fig_dashboard.add_trace(top_categories_bar, row=1, col=1)

# Panel 2: overall sentiment split
sentiment_pie = go.Pie(
    labels=sentiment_dist['Sentiment'],
    values=sentiment_dist['Count'],
    name='Sentiments',
)
fig_dashboard.add_trace(sentiment_pie, row=1, col=2)

# Panel 3: rating against review count for apps in those top categories,
# with marker colour driven by the install count
top_categories_list = top_10_categories['Category'].tolist()
in_top_category = apps_df['Category'].isin(top_categories_list)
filtered_apps = apps_df[in_top_category]
installs_marker = dict(
    color=filtered_apps['Installs'],
    colorscale='Viridis',
    showscale=True,
    size=10,
)
rating_vs_reviews = go.Scatter(
    x=filtered_apps['Reviews'],
    y=filtered_apps['Rating'],
    mode='markers',
    marker=installs_marker,
    text=filtered_apps['App'],
)
fig_dashboard.add_trace(rating_vs_reviews, row=2, col=1)

# Overall figure size and title; the legend is hidden
fig_dashboard.update_layout(
    title_text="Google Play Store Market Analysis",
    height=800,
    showlegend=False,
)
fig_dashboard.show()

print("\nAnalysis Complete!")


Analysis Complete!
