# About Data 
Google_play_Store_Data

# Objectives
- To clean and understand the basic structure of the Google Play Store dataset.
- To explore and visualize trends in app categories, ratings, installs, and pricing.
- To identify top apps and developers and analyze relationships between key features.


# Kernel Used
Python 3.13.5

 # Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


Data Loading, Exploration, and Cleaning

In [None]:
# Load the Google Play Store CSV into a DataFrame.
# NOTE(review): this is a hard-coded, machine-specific path with no drive letter;
# anyone else running the notebook must point it at their own copy of googleplaystore.csv.
data=pd.read_csv(r"\Users\Assasin\OneDrive\Desktop\Project1\venv\googleplaystore.csv")

Top Five Rows of the Data

In [None]:
data.head(5)

In [None]:
# Remove the display limits so every column and every row is shown
# when a DataFrame is rendered (None means "no limit").
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

Suppressing Warnings

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
print(f"The name of the column in Dataset are:{data.columns}")

In [None]:
# data.shape is (rows, columns); unpack it once and report both counts.
row_count, column_count = data.shape
print(f"The total numbers of rows are:{row_count}")
print(f"The total numbers of Columns are:{column_count}")

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data.head(5)

How To Make Size a Numeric Column

In [None]:
data['Size'].unique()

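Observations

The Size column contains three kinds of values:
1. "Varies with device" (no numeric size)
2. Values ending in "M" (megabytes)
3. Values ending in "k" (kilobytes)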

In [None]:
data['Size'].isnull().sum()

No Missing values in Size 

In [None]:
# Count the Size entries that contain a literal "M".
# The options are passed by keyword: a second positional argument to
# str.contains is the `case` flag, not an alternative pattern to match.
# regex=False does a plain substring test; na=False counts missing values as no match.
data['Size'].str.contains('M', case=True, regex=False, na=False).sum()

In [None]:
# Count the Size entries that contain the literal text "Varies with device".
# The options are passed by keyword: a second positional argument to
# str.contains is the `case` flag, not an alternative pattern to match.
# regex=False does a plain substring test; na=False counts missing values as no match.
data['Size'].str.contains('Varies with device', case=True, regex=False, na=False).sum()

In [None]:
# Count the Size entries that contain a literal lowercase "k".
# The options are passed by keyword: a second positional argument to
# str.contains is the `case` flag, not an alternative pattern to match.
# regex=False does a plain substring test; na=False counts missing values as no match.
data['Size'].str.contains('k', case=True, regex=False, na=False).sum()

In [None]:
data.shape

In [None]:
data['Size'].isnull().sum()

Converting the Whole Size Column to Numeric

In [None]:
# Helper that turns one Size label into a number of bytes
def convert_size(Size):
    """Convert a size label such as "19M" or "201k" to a number of bytes.

    Parameters
    ----------
    Size : object
        A label like "19M", "201k" or "Varies with device". Values that
        are not strings are returned unchanged.

    Returns
    -------
    float or object
        The size in bytes ("M" multiplies by 1024 * 1024, "k" by 1024).
        ``np.nan`` is returned for "Varies with device" and for any other
        string that is not a number followed by "M" or "k", so the column
        never ends up holding leftover text. Non-string input is returned
        as it was passed in.
    """
    if not isinstance(Size, str):
        return Size
    label = Size.strip()
    # "M" is checked before "k", matching the order of the original branches.
    for suffix, multiplier in (("M", 1024 * 1024), ("k", 1024)):
        if suffix in label:
            try:
                return float(label.replace(suffix, "")) * multiplier
            except ValueError:
                # The suffix is present but the rest is not a number.
                return np.nan
    # "Varies with device" and every other unrecognised label.
    return np.nan

In [None]:
# Run convert_size on every entry and overwrite the Size column with the results.
data["Size"] = data["Size"].apply(convert_size)

In [None]:
data.head(3)

In [None]:
data["Size"]

In [None]:
# Rename the "Size" column to "Size_in_Bytes", modifying the DataFrame in place.
data.rename(columns={"Size":"Size_in_Bytes"},inplace=True)

In [None]:
data.head(2)

Now let's Take care Of Installs

In [None]:
data['Installs'].unique()

In [None]:
data['Installs'].isnull().sum()

Installs Column:
1.Remove + Sign 
2.Remove ,
3.Convert it into integer


In [None]:
# Drop "+" from entries that contain one; every other value passes through unchanged.
data['Installs'] = data['Installs'].map(
    lambda value: value.replace('+', '') if '+' in str(value) else value
)

In [None]:
# Drop "," from entries that contain one; every other value passes through unchanged.
data['Installs'] = data['Installs'].map(
    lambda value: value.replace(',', '') if ',' in str(value) else value
)

In [None]:
# The column holds the non-numeric entry "Free"; swap it for "0" first
# so that the integer cast below can succeed.
data['Installs'] = data['Installs'].replace(to_replace="Free", value="0")
data['Installs'] = data['Installs'].astype(dtype=int)

In [None]:
# Convert every Installs entry with int().
# NOTE(review): the previous cell already cast this column with astype(int),
# so this per-element conversion looks redundant.
data['Installs']=data['Installs'].apply(lambda x:int(x))

In [None]:
data['Installs'].value_counts()

In [None]:
data.describe()

Price Column

In [None]:
data['Price'].value_counts()

In [None]:
# Count the Price entries that contain a literal "$".
# regex=False is required here: read as a regular expression, "$" is the
# end-of-string anchor and matches every value, not just those with a dollar sign.
# na=False counts missing values as no match.
data["Price"].str.contains("$", regex=False, na=False).sum()

In [None]:
# Strip the literal "$". A plain (regex=False) replacement avoids writing a
# backslash escape in a normal string, which is an invalid escape sequence
# and triggers a SyntaxWarning on Python 3.12+.
data['Price'] = data['Price'].str.replace('$', '', regex=False)
# Replace the non-numeric entry "Everyone" with "0" so the float cast succeeds.
data['Price'] = data['Price'].replace("Everyone", "0")
data['Price'] = data['Price'].astype(float)


In [None]:
data['Price'].value_counts()

In [None]:
# Convert every Price entry with float().
# NOTE(review): an earlier cell already cast this column with astype(float),
# so this per-element conversion looks redundant.
data['Price']=data['Price'].apply(lambda x:float(x))

In [None]:
data["Price"].value_counts()

In [None]:
# Parse Price as numbers; errors='coerce' turns anything unparseable into NaN.
# NOTE(review): the column was already converted to float in earlier cells — confirm this is still needed.
data['Price'] = pd.to_numeric(data['Price'], errors='coerce').astype(float)

In [None]:
data["Price"].value_counts()

In [None]:
data.describe()

Reviews Column

In [None]:
data["Reviews"].value_counts()

In [None]:
data["Reviews"].isnull().sum()

Making Reviews Numeric

In [None]:
# Parse Reviews as numbers; errors='coerce' turns anything unparseable into NaN
# instead of raising.
data['Reviews'] = pd.to_numeric(data['Reviews'], errors='coerce')

In [None]:
data.describe()

In [None]:
# Report the minimum, maximum and mean of the Price column.
# Selecting the column once also avoids nesting double quotes inside the
# f-strings, which only parses on Python 3.12+.
price = data['Price']
print(f"The Min Price is:{price.min()}")
print(f"The Max Price is:{price.max()}")
print(f"The avg Price is:{price.mean()}")

Missing Values 

In [None]:
data.isnull().sum()

In [None]:
# Force Size_in_Bytes to a numeric dtype; errors='coerce' turns any value that
# cannot be parsed as a number into NaN.
data['Size_in_Bytes'] = pd.to_numeric(data['Size_in_Bytes'], errors='coerce')

Filling null values in the Size column with the median

In [None]:
# Fill the missing sizes with the column median.
median_size = data['Size_in_Bytes'].median()
data['Size_in_Bytes'] = data['Size_in_Bytes'].fillna(median_size)

In [None]:
data['Size_in_Bytes'].isnull().sum()

Now we will handle the Rating column:

In [None]:
# Fill the missing ratings with the column median.
median_rating = data['Rating'].median()
data['Rating'] = data['Rating'].fillna(median_rating)

In [None]:
data["Rating"].isnull().sum()

Dropping rows with missing values in several columns, because very few values are missing:

In [None]:
# Drop every row that has a missing value in any of these columns.
required_columns = ['Type', 'Content Rating', 'Current Ver', 'Android Ver']
data = data.dropna(subset=required_columns)

In [None]:
data.isnull().sum()

In [None]:
data['Installs'].value_counts().head(5)

Age-wise Content Rating

In [None]:
data['Content Rating'].value_counts()

Free Vs Paid Apps

In [None]:
data['Type'].value_counts()

Reviews Analysis

In [None]:
# Largest and mean value of the Reviews column.
reviews = data['Reviews']
print("Max Reviews:", reviews.max())
print("Average Reviews:", reviews.mean())

Apps Size in Bytes

In [None]:
# Mean, largest and smallest value of the Size_in_Bytes column.
size_in_bytes = data['Size_in_Bytes']
print("Average Size (bytes):", size_in_bytes.mean())
print("Max Size:", size_in_bytes.max())
print("Min Size:", size_in_bytes.min())

Correlation between Installs and Reviews 

In [None]:
# Correlation between the Reviews and Installs columns
# (Series.corr uses the Pearson method by default).
reviews_column = data['Reviews']
installs_column = data['Installs']
correlation = reviews_column.corr(installs_column)
print("Correlation between Reviews and Installs:", correlation)

In [None]:
# Correlation between the Price and Installs columns
# (Series.corr uses the Pearson method by default).
price_column = data['Price']
installs_column = data['Installs']
correlation = price_column.corr(installs_column)
print("Correlation between Price and Installs:", correlation)

In [None]:
# Correlation between the Rating and Installs columns
# (Series.corr uses the Pearson method by default).
rating_column = data['Rating']
installs_column = data['Installs']
correlation = rating_column.corr(installs_column)
print("Correlation between Rating and Installs:", correlation)

# Data visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the ten most frequent values in the Category column.
category_counts = data['Category'].value_counts()
top_categories = category_counts.head(10)
ax = top_categories.plot(kind='bar', figsize=(10,5))
ax.set_title("Top 10 App Categories on Google Play Store")
ax.set_xlabel("Category")
ax.set_ylabel("Number of Apps")
plt.show()


# Free vs Paid Apps

In [None]:
# Count of rows per value of the Type column, drawn as bars.
ax = sns.countplot(data=data, x='Type')
ax.set_title("Free vs Paid Apps")
plt.show()


# Rating vs Category (Boxplot)

In [None]:
# One box of Rating values per Category; the category labels are rotated
# vertically so they do not overlap.
fig, ax = plt.subplots(figsize=(12,6))
sns.boxplot(data=data, x='Category', y='Rating', ax=ax)
plt.xticks(rotation=90)
ax.set_title("App Ratings by Category")
plt.show()


# Top 10 Most Expensive Paid Apps

In [None]:
# Keep only rows whose Type is "Paid", order them by Price from highest to
# lowest, and plot the first ten as horizontal bars.
paid_apps = data[data['Type']=='Paid']
top_expensive = paid_apps.sort_values(by='Price', ascending=False).head(10)
ax = sns.barplot(data=top_expensive, x='Price', y='App')
ax.set_title("Top 10 Most Expensive Paid Apps")
ax.set_xlabel("Price ($)")
ax.set_ylabel("App Name")
plt.show()
