In [4]:
#import

import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import pytz
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download the 'vader_lexicon' NLTK resource up front (quiet=True suppresses
# the downloader's console output). Presumably this is the lexicon the
# SentimentIntensityAnalyzer imported above relies on.
nltk.download('vader_lexicon', quiet=True)

#data loading
def load_data(apps_file: str, reviews_file: str) -> tuple:
    """
    Read the two input CSV files with pandas.

    Args:
        apps_file: Location of the CSV describing the apps
        reviews_file: Location of the CSV holding the reviews

    Returns:
        A 2-tuple ``(apps_df, reviews_df)``, in the same order as the arguments
    """
    # The apps file is read first, then the reviews file.
    return pd.read_csv(apps_file), pd.read_csv(reviews_file)
# Load both datasets (paths are relative to the current working directory)
apps_df, reviews_df = load_data("Play Store Data.csv", "Review.csv")

# For each dataset print a banner, the first rows and the column dtypes.
# The second banner is preceded by a blank line.
for _banner_lead, _banner_title, _preview_df in (
    ("", "APPS DATA OVERVIEW", apps_df),
    ("\n", "REVIEWS DATA OVERVIEW", reviews_df),
):
    print(_banner_lead + "=" * 60)
    print(_banner_title)
    print("=" * 60)
    print(_preview_df.head())
    print("\nData Types:\n", _preview_df.dtypes)
#data cleaning
def clean_apps_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of the apps table; the input frame is left untouched.

    Steps, in order:
        1. Drop rows whose 'Rating' is missing.
        2. Fill every remaining missing value with its column's most
           frequent value (the column's first value when no mode exists).
        3. Keep only rows with 'Rating' <= 5.0.
        4. Strip '+' and ',' from 'Installs' and convert it to numbers.
        5. Strip '$' from 'Price' and convert it to numbers.

    Entries that still cannot be parsed in steps 4-5 become NaN.

    Args:
        df: Raw apps DataFrame

    Returns:
        The cleaned DataFrame
    """
    cleaned = df.copy().dropna(subset=['Rating'])

    # Mode-based imputation, column by column
    for name in cleaned.columns:
        column = cleaned[name]
        if not column.isnull().any():
            continue
        modes = column.mode()
        replacement = column.iloc[0] if modes.empty else modes[0]
        cleaned[name] = column.fillna(replacement)

    # Discard ratings above the 5.0 ceiling
    cleaned = cleaned[cleaned['Rating'] <= 5.0]

    # 'Installs': remove '+' and ',' characters, then parse
    installs_text = cleaned['Installs'].astype(str).str.replace('[+,]', '', regex=True)
    cleaned['Installs'] = pd.to_numeric(installs_text, errors='coerce')

    # 'Price': remove the literal '$' sign, then parse
    price_text = cleaned['Price'].astype(str).str.replace('$', '', regex=False)
    cleaned['Price'] = pd.to_numeric(price_text, errors='coerce')

    return cleaned


def clean_reviews_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of the reviews table without rows lacking a 'Reviews' value.

    Args:
        df: Raw reviews DataFrame

    Returns:
        A new DataFrame containing only rows whose 'Reviews' is not missing
    """
    has_review = df['Reviews'].notna()
    return df.loc[has_review].copy()


# Replace the raw frames with their cleaned versions
apps_df = clean_apps_data(apps_df)
reviews_df = clean_reviews_data(reviews_df)
print("\n✓ Data cleaning completed successfully")

# Data merging: inner join on the shared 'App' column.
# NOTE: the figure printed below is the number of merged rows.
merged_df = apps_df.merge(reviews_df, on='App', how='inner')
print(f"\n✓ Datasets merged: {len(merged_df)} matching apps found")
print(merged_df.head())

# feature engineering


def convert_size_to_mb(size: str) -> float:
    """
    Convert an app size string to megabytes.

    The unit is taken from the final character and matched
    case-insensitively, so both '512K' and '512k' are treated as kilobytes.
    Surrounding whitespace is ignored.

    Args:
        size: Size string (e.g., '15M', '8.7M', '512K', '201k')

    Returns:
        Size in MB as float, or NaN when the value is missing, carries no
        'M'/'K' unit suffix (e.g. 'Varies with device'), or has a
        non-numeric magnitude
    """
    if pd.isna(size):
        return np.nan

    text = str(size).strip().upper()

    # Pick the divisor that turns the magnitude into MB; anything without a
    # recognised unit suffix cannot be converted.
    if text.endswith('M'):
        divisor = 1.0
    elif text.endswith('K'):
        divisor = 1024.0
    else:
        return np.nan

    try:
        return float(text[:-1]) / divisor
    except ValueError:
        # Magnitude part is not a number (e.g. a bare 'M')
        return np.nan


def categorize_rating(rating: float) -> str:
    """
    Map a numeric rating onto one of four named bands.

    Bands (lower bound inclusive):
        >= 4.0 -> 'Top Rated'
        >= 3.0 -> 'Above Average'
        >= 2.0 -> 'Average'
        otherwise -> 'Below Average'

    Args:
        rating: Numerical rating value

    Returns:
        Name of the band the rating falls into
    """
    # Checked from the highest floor downwards; the first match wins.
    bands = (
        (4.0, 'Top Rated'),
        (3.0, 'Above Average'),
        (2.0, 'Average'),
    )
    for floor, label in bands:
        if rating >= floor:
            return label
    return 'Below Average'


# Derive the engineered columns on apps_df

# App size expressed in MB
apps_df['Size_MB'] = apps_df['Size'].apply(convert_size_to_mb)

# Review counts: drop thousands separators, then parse (unparseable -> NaN)
_reviews_text = apps_df['Reviews'].astype(str).str.replace(',', '')
apps_df['Reviews'] = pd.to_numeric(_reviews_text, errors='coerce')

# log(1 + x) versions of the skewed count columns
for _count_col in ('Installs', 'Reviews'):
    apps_df[f'Log_{_count_col}'] = np.log1p(apps_df[_count_col])

# Named rating band for every app
apps_df['Rating_Group'] = apps_df['Rating'].apply(categorize_rating)

# Revenue = price multiplied by install count
apps_df['Revenue'] = apps_df['Price'].mul(apps_df['Installs'])

print("✓ Feature engineering completed")
print("\nEngineered Features Summary:")
_engineered_cols = ['Size_MB', 'Log_Installs', 'Log_Reviews', 'Rating_Group', 'Revenue']
print(apps_df[_engineered_cols].head())
# Shared VADER analyzer instance, created once and reused for every text
sentiment_analyzer = SentimentIntensityAnalyzer()


def analyze_sentiment(text: str) -> float:
    """
    Score a piece of text with the module-level VADER analyzer.

    The input is converted with ``str()`` before scoring, so non-string
    values are accepted.

    Args:
        text: Text to score

    Returns:
        The 'compound' entry of VADER's polarity scores (-1 to 1)
    """
    scores = sentiment_analyzer.polarity_scores(str(text))
    return scores['compound']


# Score every entry of the 'Reviews' column and store the result
reviews_df['Sentiment_Score'] = reviews_df['Reviews'].apply(analyze_sentiment)
_sentiment_summary = reviews_df['Sentiment_Score'].describe()

print("✓ Sentiment analysis completed")
print("\nSentiment Score Statistics:")
print(_sentiment_summary)

# Visualization CATEGORY ANALYSIS (TIME-CONTROLLED)
# Parse the update date; values that cannot be parsed become NaT
apps_df['Last_Updated'] = pd.to_datetime(apps_df['Last Updated'], errors='coerce')

# Row filters: Rating >= 4.0, Size >= 10MB, updated in January (any year)
_is_top_rated = apps_df['Rating'] >= 4.0
_is_large = apps_df['Size_MB'] >= 10
_updated_in_january = apps_df['Last_Updated'].dt.month == 1
filtered_apps = apps_df[_is_top_rated & _is_large & _updated_in_january]

# Per category: mean rating and total reviews; keep the 10 categories
# with the most reviews, highest first
category_analysis = (
    filtered_apps
    .groupby('Category')
    .agg(Rating=('Rating', 'mean'), Reviews=('Reviews', 'sum'))
    .sort_values(by='Reviews', ascending=False)
    .head(10)
)

# IST time for time-controlled display
ist_timezone = pytz.timezone('Asia/Kolkata')
current_time_ist = datetime.now(ist_timezone)

# Display bar chart only during 6 AM - 10 PM IST.
# The upper bound is exclusive at hour 22, so the window closes at 22:00
# (10 PM) exactly as the else-branch message states; the earlier `< 23`
# bound kept the chart available until 11 PM.
if 6 <= current_time_ist.hour < 22:
    # Reshape to long form: one row per (Category, metric) pair so both
    # metrics can be drawn as grouped bars
    chart_data = category_analysis.reset_index().melt(
        id_vars='Category',
        value_vars=['Rating', 'Reviews']
    )

    fig_bar = px.bar(
        chart_data,
        x='Category',
        y='value',
        color='variable',
        barmode='group',
        title='Top 10 Categories: Rating & Reviews (January Updates)',
        labels={'value': 'Value', 'variable': 'Metric'},
        width=750,
        height=420
    )

    # Dark theme with white serif text
    fig_bar.update_layout(
        plot_bgcolor='#0a0a0a',
        paper_bgcolor='#0a0a0a',
        font=dict(family='Times New Roman', color='white', size=12),
        title_font=dict(size=16, color='white'),
        hovermode='x unified'
    )

    fig_bar.show()
    print("✓ Category analysis chart displayed")
else:
    # Outside the display window: report the current IST time instead
    print(f"ℹ Bar chart available 6 AM - 10 PM IST (Current: {current_time_ist.strftime('%H:%M IST')})")
# Pie chart: number of apps per 'Type' value
app_type_counts = apps_df['Type'].value_counts()

fig_pie = px.pie(
    names=app_type_counts.index,
    values=app_type_counts.values,
    hole=0,
    title='App Type Distribution',
    width=500,
    height=400,
)

# Dark theme first, then place percentage + label text inside the slices
fig_pie.update_layout(
    font={'family': 'Times New Roman', 'color': 'white', 'size': 12},
    title_font={'size': 14, 'color': 'white'},
    paper_bgcolor='#0a0a0a',
    plot_bgcolor='#0a0a0a',
).update_traces(textinfo='percent+label', textposition='inside')

fig_pie.show()
print("✓ App type distribution pie chart displayed")
# APP RATING DISTRIBUTION (HISTOGRAM)
# Histogram of the 'Rating' column (nbins=20)
fig_histogram = px.histogram(
    apps_df,
    x='Rating',
    nbins=20,
    labels={'Rating': 'App Rating', 'count': 'Number of Apps'},
    title='App Rating Distribution Analysis',
    height=450,
    width=600,
)

# Both axes share the same title/tick font sizes
_axis_fonts = {'title_font': {'size': 13}, 'tickfont': {'size': 11}}

fig_histogram.update_layout(
    paper_bgcolor="#1c1b1b",
    plot_bgcolor="#090909",
    font={'family': 'Times New Roman', 'color': 'white', 'size': 12},
    title_font={'size': 16, 'color': 'white'},
    hovermode='x unified',
    showlegend=False,
    xaxis=dict(_axis_fonts),
    yaxis=dict(_axis_fonts),
).update_traces(
    marker_color="#1f77b4",
    marker_line_color="#ffffff",
    marker_line_width=0.5,
)

fig_histogram.show()
print("✓ Rating distribution histogram displayed")

# ANALYSIS SUMMARY
# Headline statistics printed from apps_df and reviews_df
_summary_rule = "=" * 70

print("\n" + _summary_rule)
print("COMPREHENSIVE ANALYSIS SUMMARY")
print(_summary_rule)

_ratings = apps_df['Rating']
print("\n DATASET STATISTICS:")
print(f"   • Total Apps Analyzed: {len(apps_df):,}")
print(f"   • Average Rating: {_ratings.mean():.2f}/5.0")
print(f"   • Median Rating: {_ratings.median():.2f}/5.0")
print(f"   • Rating Std Dev: {_ratings.std():.2f}")

_installs = apps_df['Installs']
print("\n INSTALLATION METRICS:")
print(f"   • Average Installs: {_installs.mean():,.0f}")
print(f"   • Total Installs: {_installs.sum():,.0f}")
print(f"   • Median Installs: {_installs.median():,.0f}")

_review_counts = apps_df['Reviews']
print("\n REVIEW METRICS:")
print(f"   • Total Reviews: {_review_counts.sum():,.0f}")
print(f"   • Average Reviews per App: {_review_counts.mean():,.0f}")
print(f"   • Median Reviews: {_review_counts.median():,.0f}")

_sentiment = reviews_df['Sentiment_Score']
print("\n SENTIMENT ANALYSIS:")
print(f"   • Average Sentiment Score: {_sentiment.mean():.3f}")
print(f"   • Sentiment Score Range: [{_sentiment.min():.3f}, {_sentiment.max():.3f}]")

_revenue = apps_df['Revenue']
print("\n REVENUE INSIGHTS:")
print(f"   • Total Revenue Potential: ${_revenue.sum():,.2f}")
print(f"   • Average Revenue per App: ${_revenue.mean():,.2f}")

print("\n" + _summary_rule)
print("✓ Analysis Complete!")
print(_summary_rule)


APPS DATA OVERVIEW
                                                 App        Category  Rating  \
0     Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN     4.1   
1                                Coloring book moana  ART_AND_DESIGN     3.9   
2  U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN     4.7   
3                              Sketch - Draw & Paint  ART_AND_DESIGN     4.5   
4              Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN     4.3   

  Reviews  Size     Installs  Type Price Content Rating  \
0     159   19M      10,000+  Free     0       Everyone   
1     967   14M     500,000+  Free     0       Everyone   
2   87510  8.7M   5,000,000+  Free     0       Everyone   
3  215644   25M  50,000,000+  Free     0           Teen   
4     967  2.8M     100,000+  Free     0       Everyone   

                      Genres      Last Updated         Current Ver  \
0               Art & Design   January 7, 2018               1.0.0   
1  Ar

✓ Category analysis chart displayed


✓ App type distribution pie chart displayed


✓ Rating distribution histogram displayed

COMPREHENSIVE ANALYSIS SUMMARY

 DATASET STATISTICS:
   • Total Apps Analyzed: 9,366
   • Average Rating: 4.19/5.0
   • Median Rating: 4.30/5.0
   • Rating Std Dev: 0.52

 INSTALLATION METRICS:
   • Average Installs: 17,897,444
   • Total Installs: 167,627,457,938
   • Median Installs: 500,000

 REVIEW METRICS:
   • Total Reviews: 4,814,590,769
   • Average Reviews per App: 514,050
   • Median Reviews: 5,930

 SENTIMENT ANALYSIS:
   • Average Sentiment Score: -0.001
   • Sentiment Score Range: [-0.625, 0.637]

 REVENUE INSIGHTS:
   • Total Revenue Potential: $388,505,986.68
   • Average Revenue per App: $41,480.46

✓ Analysis Complete!
