In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('google_play_store_dataset.csv')

In [3]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [4]:
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [5]:
df.shape

(10841, 13)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [7]:
df.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


In [8]:
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [15]:
df['Category'].value_counts()

Category
FAMILY                 1972
GAME                   1144
TOOLS                   843
MEDICAL                 463
BUSINESS                460
PRODUCTIVITY            424
PERSONALIZATION         392
COMMUNICATION           387
SPORTS                  384
LIFESTYLE               382
FINANCE                 366
HEALTH_AND_FITNESS      341
PHOTOGRAPHY             335
SOCIAL                  295
NEWS_AND_MAGAZINES      283
SHOPPING                260
TRAVEL_AND_LOCAL        258
DATING                  234
BOOKS_AND_REFERENCE     231
VIDEO_PLAYERS           175
EDUCATION               156
ENTERTAINMENT           149
MAPS_AND_NAVIGATION     137
FOOD_AND_DRINK          127
HOUSE_AND_HOME           88
LIBRARIES_AND_DEMO       85
AUTO_AND_VEHICLES        85
WEATHER                  82
ART_AND_DESIGN           65
EVENTS                   64
PARENTING                60
COMICS                   60
BEAUTY                   53
1.9                       1
Name: count, dtype: int64

In [14]:
len(df['Category'].value_counts())

34

In [19]:
import re

# Define a regular expression pattern for valid numbers (e.g., "$4.50" or "4.50")
pattern = r'^\$?\d+(\.\d+)?$'

# Use the pattern to filter out truly non-numeric rows
non_numeric_prices = df[~df['Price'].str.match(pattern)]

# Display the rows with non-numeric 'Price' values
print(non_numeric_prices)


                                           App Category  Rating Reviews  \
10472  Life Made WI-Fi Touchscreen Photo Frame      1.9    19.0    3.0M   

         Size Installs Type     Price Content Rating             Genres  \
10472  1,000+     Free    0  Everyone            NaN  February 11, 2018   

      Last Updated Current Ver Android Ver  
10472       1.0.19  4.0 and up         NaN  


In [None]:
# Replace specific non-numeric values like "Everyone" with 0
df['Price'] = df['Price'].replace("Everyone", 0)

# Now you can convert the 'Price' column to numeric, if needed
df['Price'] = df['Price'].str.replace('$', '').astype(float)

In [23]:
df['Price'].dtype

dtype('float64')

In [None]:
df['Size'] = df['Size'].replace("Varies with device", 0)

In [24]:
df['Size'] = df['Size'].str.replace('M', '').str.replace('k', '').astype(float)

ValueError: could not convert string to float: 'Varies with device'

In [9]:
import pandas as pd
import numpy as np
from datetime import datetime

def analyze_playstore_data(df):
    """
    Comprehensive analysis of Google Play Store apps data
    
    Parameters:
    df (pandas.DataFrame): Play Store apps dataset
    
    Returns:
    dict: Dictionary containing various analysis results
    """
    # Clean and prepare data
    df['Price'] = df['Price'].str.replace('$', '').astype(float)
    df['Size'] = df['Size'].str.replace('M', '').str.replace('k', '').astype(float)
    df['Installs'] = df['Installs'].str.replace('+', '').str.replace(',', '').astype(int)
    
    # Initialize results dictionary
    insights = {}
    
    # 1. Category Analysis
    category_stats = df.groupby('Category').agg({
        'Rating': ['mean', 'count'],
        'Reviews': 'sum',
        'Installs': 'sum',
        'Price': 'mean'
    }).round(2)
    
    insights['top_categories'] = {
        'by_apps': category_stats['Rating']['count'].nlargest(5).to_dict(),
        'by_installs': category_stats['Installs'].nlargest(5).to_dict(),
        'by_rating': category_stats['Rating']['mean'].nlargest(5).to_dict()
    }
    
    # 2. Monetization Analysis
    insights['monetization'] = {
        'paid_apps_percentage': (df['Price'] > 0).mean() * 100,
        'avg_price_paid_apps': df[df['Price'] > 0]['Price'].mean(),
        'price_rating_correlation': df['Price'].corr(df['Rating']),
        'top_paid_categories': df[df['Price'] > 0].groupby('Category')['Price'].mean().nlargest(5).to_dict()
    }
    
    # 3. Performance Metrics
    insights['performance'] = {
        'avg_rating': df['Rating'].mean(),
        'rating_distribution': df['Rating'].value_counts(bins=5).to_dict(),
        'size_installs_correlation': df['Size'].corr(df['Installs']),
        'top_rated_genres': df.groupby('Genres')['Rating'].mean().nlargest(5).to_dict()
    }
    
    # 4. Content Rating Analysis
    content_rating_stats = df.groupby('Content Rating').agg({
        'Rating': 'mean',
        'Installs': 'sum',
        'App': 'count'
    }).round(2)
    
    insights['content_rating'] = {
        'distribution': content_rating_stats['App'].to_dict(),
        'avg_ratings': content_rating_stats['Rating'].to_dict(),
        'total_installs': content_rating_stats['Installs'].to_dict()
    }
    
    return insights

def generate_insights_summary(insights):
    """
    Generate a human-readable summary of the insights
    """
    summary = []
    
    # Category insights
    summary.append("📱 Category Insights:")
    summary.append(f"- Most populated categories: {', '.join(f'{k} ({v:,} apps)' for k, v in insights['top_categories']['by_apps'].items())}")
    summary.append(f"- Highest rated categories: {', '.join(f'{k} ({v:.2f}★)' for k, v in insights['top_categories']['by_rating'].items())}")
    
    # Monetization insights
    summary.append("\n💰 Monetization Insights:")
    summary.append(f"- Paid apps: {insights['monetization']['paid_apps_percentage']:.1f}% of all apps")
    summary.append(f"- Average price of paid apps: ${insights['monetization']['avg_price_paid_apps']:.2f}")
    summary.append(f"- Price-Rating correlation: {insights['monetization']['price_rating_correlation']:.3f}")
    
    # Performance insights
    summary.append("\n⭐ Performance Insights:")
    summary.append(f"- Average rating across all apps: {insights['performance']['avg_rating']:.2f}★")
    summary.append(f"- Size-Installs correlation: {insights['performance']['size_installs_correlation']:.3f}")
    
    # Content rating insights
    summary.append("\n🔒 Content Rating Insights:")
    for rating, count in insights['content_rating']['distribution'].items():
        avg_rating = insights['content_rating']['avg_ratings'][rating]
        summary.append(f"- {rating}: {count:,} apps (avg rating: {avg_rating:.2f}★)")
    
    return "\n".join(summary)