In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import squarify
import plotly.express as px
import seaborn as sns


In [None]:
# Load the Play Store app catalogue from a local CSV file and display it.
# NOTE(review): this is an absolute, machine-specific path; the notebook cannot be
# re-run on another machine without editing it. Consider a configurable data directory.
play_store_df = pd.read_csv(r"C:\Users\monis\OneDrive\Desktop\NULL\Play Store Data.csv")
play_store_df


In [None]:
# Show the dtype pandas inferred for each column of the freshly loaded app data.
play_store_df.dtypes

In [None]:
# Load the user-review data from a local CSV file and display it.
# NOTE(review): absolute, machine-specific path; see the same concern on the
# Play Store CSV load.
review_df = pd.read_csv(r"C:\Users\monis\OneDrive\Desktop\NULL\User Reviews.csv")
review_df


In [None]:
# Show the dtype pandas inferred for each column of the review data.
review_df.dtypes

In [None]:
# Replace missing values with 0 in every column except 'Rating'.
# A missing rating is left as NaN on purpose: filling it with 0 would make unrated
# apps look like apps rated 0, which drags down every rating histogram, box plot and
# mean computed later. pandas and the plotting libraries skip NaN on their own.
play_store_df = play_store_df.fillna(
    {column: 0 for column in play_store_df.columns if column != 'Rating'}
)
play_store_df

In [None]:
# Replace missing values with 0 in every column except 'Translated_Review'.
# Missing review text is left as NaN on purpose: filling it with 0 would turn
# "no review" into a review whose text is 0, which value_counts() would then
# count as a frequent review. value_counts() ignores NaN by default.
review_df = review_df.fillna(
    {column: 0 for column in review_df.columns if column != 'Translated_Review'}
)
review_df

In [None]:
# Turn 'Installs' text such as '1,000,000+' into a 64-bit integer column.
# - astype(str) first, so the cell can be re-run after the column is already numeric
#   (the .str accessor only works on string data) and so non-string placeholders
#   such as 0 are converted instead of becoming NaN.
# - regex=False treats ',' and '+' as literal text on every pandas version
#   ('+' is a regular-expression metacharacter).
# - A stray 'Free' value in this column is mapped to 0 before the numeric cast.
# - 'int64' is spelled out because plain `int` can mean 32-bit on some platforms,
#   which is too narrow for sums of install counts.
play_store_df['Installs'] = (
    play_store_df['Installs']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
    .str.replace('Free', '0', regex=False)
    .astype('int64')
)
play_store_df

In [None]:
# Column dtypes after 'Installs' has been converted to a numeric column.
play_store_df.dtypes

In [None]:
# Inner-join app metadata with user reviews on the shared 'App' column, so only
# apps that appear in both frames are kept.
# NOTE(review): this runs before 'Price', 'Last Updated' and 'Revenue' are processed
# on play_store_df, so merged_df carries the unprocessed versions of those columns
# (and no 'Revenue'); 'Size' is converted on merged_df separately.
# NOTE(review): if 'App' is not unique in play_store_df, every review row is repeated
# once per matching app row; confirm whether de-duplication is needed.
merged_df = pd.merge(play_store_df, review_df, on='App', how='inner')
merged_df

In [None]:
# Show the dtype of each column in the merged app + review frame.
merged_df.dtypes

In [None]:
def convert_size(size):
    """
    Converts app size strings to numerical values in megabytes (MB).

    Args:
        size (str): The app size string (e.g., '10M', '2.5k'). Non-string
            values are assumed to be numeric already and are returned unchanged,
            which also makes re-applying the function to a converted column safe.

    Returns:
        float: The app size in megabytes (MB), or np.nan when the string is
        'Varies with device' or anything else that is not '<number>M' or
        '<number>k' (for example '1,000+').
    """
    if not isinstance(size, str):
        # Already numeric (or a numeric placeholder): nothing to convert.
        return size

    text = size.strip()
    # Match on the unit suffix rather than "letter appears anywhere in the text",
    # so unrelated strings that merely contain 'M' or 'k' are not mis-parsed.
    if text.endswith('M'):
        number, divisor = text[:-1], 1
    elif text.endswith('k'):
        number, divisor = text[:-1], 1024  # kilobytes -> megabytes
    else:
        # 'Varies with device' and every other unrecognised string.
        return np.nan

    try:
        return float(number) / divisor
    except ValueError:
        # Right suffix but the remainder is not a number (e.g. 'M' or '1,000M').
        return np.nan

# Convert 'Size' to megabytes in both frames. merged_df needs its own pass because
# it was built from play_store_df before this conversion ran.
play_store_df['Size'] = play_store_df['Size'].apply(convert_size)
merged_df['Size'] = merged_df['Size'].apply(convert_size)

In [None]:
def rating_group(rating):
    """
    Map a numeric app rating onto a descriptive quality bucket.

    Args:
        rating (float): The rating of the app.

    Returns:
        str: 'Top rated app' for ratings of 4 and above, 'Above average' for
        ratings from 3 up to (not including) 4, 'Average' for ratings from 2 up
        to (not including) 3, and 'Below Average' for everything else.
    """
    # Buckets ordered from the highest floor to the lowest; the first floor met wins.
    buckets = (
        (4, 'Top rated app'),
        (3, 'Above average'),
        (2, 'Average'),
    )
    for floor, label in buckets:
        if rating >= floor:
            return label

    # No floor was met. This includes values for which every comparison is False,
    # such as NaN.
    return 'Below Average'


# Label every app with its rating bucket, stored in a new 'Rating_Group' column.
play_store_df['Rating_Group'] = play_store_df['Rating'].apply(rating_group)

In [None]:
# Turn 'Price' text such as '$4.99' into a float column.
# - astype(str) first, so the cell can be re-run after the column is already numeric
#   and so non-string placeholders such as 0 are converted instead of becoming NaN.
# - regex=False treats '$' as literal text on every pandas version
#   ('$' is a regular-expression metacharacter).
# - A stray 'Everyone' value in this column is mapped to 0 before the numeric cast.
# - The column stays float: casting to int would truncate the cents ($0.99 -> 0),
#   making sub-dollar apps look free and understating any price-based revenue.
play_store_df['Price'] = (
    play_store_df['Price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace('Everyone', '0', regex=False)
    .astype(float)
)

In [None]:
# Rough revenue estimate per app: list price multiplied by install count.
# NOTE(review): presumably treats every install as a purchase at the current price;
# confirm this is the intended proxy.
play_store_df['Revenue'] = play_store_df['Price'] * play_store_df['Installs']

In [None]:
# Parse 'Last Updated' into datetime values so the column can be grouped and
# plotted by date. errors='coerce' turns values that cannot be parsed into
# NaT (Not a Time) instead of raising.
play_store_df['Last Updated'] = pd.to_datetime(play_store_df['Last Updated'], errors='coerce')

In [None]:
import ssl

import nltk

# Fetch the VADER lexicon only when it is not already installed locally, so
# re-running the notebook does not hit the network every time.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    # SECURITY: certificate verification is switched off for this one download only,
    # presumably as a workaround for a machine whose Python cannot validate the
    # download host's certificate. The default HTTPS context is restored afterwards
    # so that no other request made from this kernel runs unverified.
    _verified_https_context = ssl._create_default_https_context
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        nltk.download('vader_lexicon')
    finally:
        ssl._create_default_https_context = _verified_https_context

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# One analyzer instance, reused for every review.
sia = SentimentIntensityAnalyzer()

# Score each review with the analyzer's 'compound' value. str(x) lets non-string
# cells (for example numeric placeholders) be scored without raising.
merged_df['Sentiment_Score'] = merged_df['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

In [None]:
def sentiment_category(score):
    """
    Turn a compound sentiment score into a three-way label.

    Args:
        score (float): The sentiment score (compound score from SentimentIntensityAnalyzer).

    Returns:
        str: 'Positive' when score >= 0.05, 'Negative' when score <= -0.05,
        otherwise 'Neutral'.
    """
    # The two thresholds cannot both be satisfied, so guard clauses are enough.
    if score <= -0.05:
        return 'Negative'
    if score >= 0.05:
        return 'Positive'
    # Strictly between the thresholds, or a value that fails both comparisons.
    return 'Neutral'

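# NOTE(review): sentiment_category is defined above but is never applied in the cells shown here; apply it to merged_df['Sentiment_Score'] if a sentiment label column is wanted.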

In [None]:
import plotly.express as px

# Bar-style histogram with one bar per 'Category'. Because a y column is supplied,
# each bar aggregates 'Installs' for that category (plotly's default aggregation
# when y is given is a sum) rather than counting rows.
fig = px.histogram(play_store_df, x='Category', y='Installs',color_discrete_sequence=['black'])
fig.show()

In [None]:
# Pie chart of how the rows of play_store_df split across the values of 'Type'.
types_fig = px.pie(play_store_df, names='Type', title='Distribution of App Types')
types_fig.show()

In [None]:
# Histogram of the 'Rating' column, drawn in black.
rating_fig = px.histogram(play_store_df, x="Rating",
                   nbins=10,  # Upper bound on the number of bins; adjust as needed
                   color_discrete_sequence=['black'], # Set color to black
                   title='Distribution of App Ratings')
rating_fig.update_layout(bargap=0.1)  # Small gap between bars so neighbours are distinguishable
rating_fig.show()

In [None]:
# The 20 review texts that occur most often in merged_df, with their row counts.
top_reviews = merged_df['Translated_Review'].value_counts().head(20)

# One black bar per distinct review text; bar height is that text's frequency.
review_fig = px.bar(
    x=top_reviews.index,
    y=top_reviews.values,
    title='Distribution of Top 20 Reviews',
    labels={'x': 'Translated_Review', 'y': 'Count'},
    color_discrete_sequence=['black'],
)

# Black page background with white text, white plotting area, x tick labels
# slanted by 45 degrees, and a fixed 1000x600 canvas with tight margins.
review_fig.update_layout(
    width=1000,
    height=600,
    margin={'l': 10, 'r': 10, 't': 30, 'b': 10},
    paper_bgcolor='black',
    plot_bgcolor='white',
    font_color='white',
    title_font={'size': 16},
    xaxis={'title_font': {'size': 12}, 'tickangle': 45},
    yaxis={'title_font': {'size': 12}},
)

review_fig.show()

In [None]:
# Total installs per category, keeping only the ten categories with the largest totals.
installs_by_category = play_store_df.groupby('Category')['Installs'].sum().nlargest(10)


In [None]:
# Bar chart of the ten per-category install totals, one black bar per category.
installs_by_category_fig = px.bar(
    x=installs_by_category.index,
    y=installs_by_category.values,
    labels={'x': 'Category', 'y': 'Total Installs'},
    title='Total Installs by Category (Top 10)',
    color_discrete_sequence=['black']

)

installs_by_category_fig.show()

In [None]:
import plotly.express as px

# Count rows by the calendar year of their 'Last Updated' date; sort_index() puts
# the years in chronological order. Rows whose date is NaT are not counted.
updates_per_year = play_store_df['Last Updated'].dt.year.value_counts().sort_index()

# Bar chart with one black bar per year.
updates_per_year_fig = px.bar(
    x=updates_per_year.index,
    y=updates_per_year.values,
    labels={'x': 'Year', 'y': 'Number of Updates'},
    title='App Updates per Year',
    color_discrete_sequence=['black']
)

updates_per_year_fig.show()

In [None]:
# Total estimated revenue per category, keeping the ten largest totals.
# NOTE: the aggregation is a sum, not a mean. The `avg_` prefix on the variable
# names is kept only so that any other cell referring to them keeps working; the
# axis label and title describe what is actually plotted.
avg_revenue_by_category = play_store_df.groupby('Category')['Revenue'].sum().nlargest(10)

# Bar chart with one black bar per category.
avg_revenue_by_category_fig = px.bar(
    x=avg_revenue_by_category.index,
    y=avg_revenue_by_category.values,
    labels={'x': 'Category', 'y': 'Total Revenue'},
    title='Total App Revenue by Category (Top 10)',
    color_discrete_sequence=['black']
)

# Show the graph
avg_revenue_by_category_fig.show()

In [None]:
import plotly.express as px

# Number of rows per 'Genres' value, keeping the ten most common values.
genre_counts = play_store_df['Genres'].value_counts().nlargest(10)

# Bar chart with one black bar per genre.
genre_counts_fig = px.bar(
    x=genre_counts.index,
    y=genre_counts.values,
    labels={'x': 'Genre', 'y': 'Count'},
    title='Genre Count in Play Store Data',
    color_discrete_sequence=['black']
)

# Show the graph
genre_counts_fig.show()

In [None]:
import plotly.express as px

# Scatter plot with one black point per app: last-update date on the x axis,
# rating on the y axis.
lu_fig = px.scatter(
    play_store_df,
    x='Last Updated',
    y='Rating',
    title='Scatter Plot: Last Updated vs. Rating',
    color_discrete_sequence=['black']
)

# Show the plot
lu_fig.show()

In [None]:
import plotly.express as px

# Box plot of 'Rating', with one box per value of 'Type'.
tr_fig = px.box(
    play_store_df,
    x='Type',
    y='Rating',
    title='Rating for Paid vs Free Apps',
    color_discrete_sequence=['black']
)

# Show the plot
tr_fig.show()

In [None]:
# Eight-panel static overview (4 rows x 2 columns). Every panel is drawn through its
# own Axes object, so each call states which panel it targets instead of relying
# on pyplot's "current axes".
overview_fig, overview_axes = plt.subplots(4, 2, figsize=(18, 24))
(
    (ax_donut, ax_apps),
    (ax_type, ax_tree),
    (ax_hist, ax_box),
    (ax_scatter, ax_line),
) = overview_axes

# 1. Donut Chart: Top 5 Categories (by number of rows)
top_cats = play_store_df['Category'].value_counts().head(5)
wedges, texts, autotexts = ax_donut.pie(
    top_cats, labels=top_cats.index, autopct='%1.1f%%', startangle=140
)
# A white disc over the centre of the pie produces the donut hole.
ax_donut.add_artist(plt.Circle((0, 0), 0.70, color='white'))
ax_donut.set_title('Top 5 Categories (Donut)')

# 2. Bar Chart: Top 5 Apps by summed sentiment score
# The ranking uses the sum of 'Sentiment_Score' per app, not a review count, and the
# title says so.
top_apps = (
    merged_df.groupby('App')['Sentiment_Score']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)
sns.barplot(x=top_apps.values, y=top_apps.index, palette='magma', ax=ax_apps)
ax_apps.set_title('Top 5 Apps by Total Sentiment Score (Bar)')

# 3. Pie Chart: Type Distribution
type_counts = play_store_df['Type'].value_counts()
ax_type.pie(type_counts, labels=type_counts.index, autopct='%1.1f%%', startangle=140)
ax_type.set_title('App Type (Pie)')

# 4. Treemap: Top 5 Categories by Installs
top_installs = (
    play_store_df.groupby('Category')['Installs']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)
# One colour per rectangle (customise as desired)
colors = ['red', 'blue', 'green', 'orange', 'purple']
squarify.plot(
    sizes=top_installs.values,
    label=top_installs.index,
    alpha=0.8,
    color=colors,
    ax=ax_tree,
)
ax_tree.axis('off')
ax_tree.set_title('Top 5 Categories by Installs (Treemap)')

# 5. Histogram: App Ratings (missing ratings are dropped before plotting)
sns.histplot(play_store_df['Rating'].dropna(), bins=10, kde=True, ax=ax_hist)
ax_hist.set_title('App Ratings (Histogram)')

# 6. Box Plot: Rating Distribution by Content Rating
sns.boxplot(x='Content Rating', y='Rating', data=play_store_df, ax=ax_box)
# Rotate x-axis labels for better readability
plt.setp(ax_box.get_xticklabels(), rotation=45, ha='right')
ax_box.set_title('Rating Distribution by Content Rating (Box Plot)')

# 7. Scatter Plot: Installs vs. Rating
sns.scatterplot(x='Installs', y='Rating', data=play_store_df, ax=ax_scatter)
ax_scatter.set_title('Installs vs. Rating (Scatter Plot)')

# 8. Line Chart: Average Rating Over Time
# Requires 'Last Updated' to already be a datetime column; ratings are averaged
# per calendar month of the last-update date.
avg_rating_over_time = (
    play_store_df.groupby(pd.Grouper(key='Last Updated', freq='M'))['Rating']
    .mean()
    .reset_index()
)
sns.lineplot(x='Last Updated', y='Rating', data=avg_rating_over_time, ax=ax_line)
# Rotate x-axis labels for better readability
plt.setp(ax_line.get_xticklabels(), rotation=45, ha='right')
ax_line.set_title('Average Rating Over Time (Line Chart)')

overview_fig.tight_layout()
plt.show()