#### Comprehensive Data Analysis on the pre-processed data

###### 1. Count of the Apps for each category 

In [None]:
# Wide canvas so that every category label fits along the x-axis
plt.figure(figsize=(20, 6))
# Figure-level heading
plt.suptitle('Count of Apps on App Categories', fontsize=20)
# One bar per category (row count), coloured with the reversed "terrain" palette
ax = sns.countplot(x='Category', data=google_apps, palette="terrain_r")
# Tilt and right-align the tick labels so neighbouring labels do not collide
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=40, ha="right")
# Render the figure
plt.show()

In [None]:
# Per-category totals of the numeric columns, with Category turned back into
# an ordinary column by reset_index.
# numeric_only=True keeps text columns out of the aggregation (summing them
# would concatenate strings).
google_apps_sum = google_apps.groupby('Category').sum(numeric_only=True).reset_index()
# Per-category means of the numeric columns. Without numeric_only=True,
# pandas >= 2.0 raises TypeError when a text column is present.
google_apps_mean = google_apps.groupby('Category').mean(numeric_only=True).reset_index()
# Statistical summary (count, mean, std, quartiles, ...) of the numeric attributes
google_apps.describe()

###### 2. Analysis on Average App size for categories

In [None]:
# Wide canvas so that every category label fits along the x-axis
plt.figure(figsize=(20, 6))
# Figure-level heading
plt.suptitle('Average App Size on App Categories', fontsize=20)
# Bar height = mean Size of the category (taken from the pre-aggregated frame)
ax = sns.barplot(x='Category', y='Size', data=google_apps_mean, palette="terrain_r")
# Tilt and right-align the tick labels so neighbouring labels do not collide
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=40, ha="right")
# Render the figure
plt.show()

###### 3. Count for ContentRating types

In [None]:
# Canvas for the plot
plt.figure(figsize=(20, 6))
# Axes in the second cell of a 1-row x 2-column grid (right half of the figure)
ax = plt.subplot(1, 2, 2)
# Histogram of how many apps fall under each content rating
sns.histplot(x='Content Rating', data=google_apps, palette="terrain_r", ax=ax)
# Title of that axes
ax.set_title('Content Distribution')
# Render the figure
plt.show()

###### 4. Rating vs Size

The code block below draws a scatterplot of Rating vs Size. From the graph,
we can answer questions like:
1. Are Rating and Size correlated?
2. Which range of ratings occurs most often in this data?
3. Based on size, which range tends to get a high rating?
Questions like these can be framed and then interpreted from the plot.

In [None]:
# Canvas for the plot
plt.figure(figsize=(20, 6))
# Axes in the second cell of a 1-row x 2-column grid (right half of the figure).
# The former plt.ylim(0, 3000000000) call ran BEFORE this axes existed, so it
# configured a different, implicitly created axes and never applied to the
# scatter below; it has been removed.
plt.subplot(122)
# One point per app: Rating on x, Size on y
sns.scatterplot(data=google_apps, x='Rating', y='Size', palette="terrain_r")
# Render the figure
plt.show()

###### 5. Reviews vs Installs

The code block below draws a scatterplot of Reviews vs Installs. From the graph, we can answer questions like:
Are Reviews and Installs correlated?
Does the install count affect the review count?
Questions of this type can be interpreted from the plot.

In [None]:
# Canvas for the plot
plt.figure(figsize=(20, 6))
# Axes in the first cell of a 1-row x 2-column grid (left half of the figure)
ax = plt.subplot(1, 2, 1)
# One point per app: Reviews on x, Installs on y
sns.scatterplot(x='Reviews', y='Installs', data=google_apps, ax=ax)
# Fix the visible x range
ax.set_xlim(0, 30000000)
# Fix the visible y range
ax.set_ylim(0, 3000000000)
# Render the figure
plt.show()

###### 6. Average Rating, Total Installs count on App categories

From the code block below, we can interpret results such as:
If a category has the highest average rating, does that mean an app in it will perform well?
If a category has the highest install count, does that mean an app in it will perform well?
In what range does the average rating lie for most of the categories, and
which categories get the highest and lowest average rating?

In [None]:
# Wide canvas so that every category label fits along the x-axis
plt.figure(figsize=(20, 6))
# Title of the (first) axes
plt.title('Total Install Count and Average Rating on App Categories', fontsize=20)
# Bars: total installs per category (left y-axis)
ax = sns.barplot(x='Category', y='Installs', data=google_apps_sum, palette="terrain_r")
# Tilt and right-align the x tick labels so they stay readable
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=40, ha="right")
# Second y-axis sharing the same x-axis, so the line is drawn over the bars
plt.twinx()
# Line: average rating per category (right y-axis)
sns.lineplot(x='Category', y='Rating', data=google_apps_mean)
# Legend entry for the line
plt.legend(labels=['Average Rating'], fontsize=20)

###### 7. Effects of the price on Ratings

From the jointplot code below, we can interpret whether price affects ratings or vice versa.

In [None]:
# Joint plot: scatter of Price against Rating with the marginal distribution
# of each variable along the top and right edges
g = sns.jointplot(x='Rating', y='Price', data=google_apps, color='g')
# A joint plot is a grid of several axes; plt.title would attach the title to
# one of those inner axes instead of the figure. Use a figure-level title.
joint_figure = g.ax_joint.figure
joint_figure.suptitle("Price Vs Rating")
# Leave headroom so the title does not overlap the top marginal plot
joint_figure.subplots_adjust(top=0.93)
# Render the figure
plt.show()

###### 8. Count of Rating for each Content Rating

From the plot we can answer a few questions, such as: Which content rating has the most ratings? Which content rating has the most apps in the dataset? Are Rating and Content Rating related?

In [None]:
# Median of the numeric columns per content rating. numeric_only=True keeps
# text columns out, which would otherwise raise TypeError on pandas >= 2.0.
newcontent = google_apps.groupby(['Content Rating']).median(numeric_only=True)
newcontent['Content Rating'] = newcontent.index
# Number of non-null values of every column per content rating
newcontent_count = google_apps.groupby(['Content Rating']).count()
# Expose the group label as a regular column so it can be used as the x variable
newcontent_count['Content Rating'] = newcontent_count.index
# Colour the content rating with the most apps blue and all others green
# (the maximum is computed once instead of once per element)
most_apps = newcontent_count['App'].max()
clrs = ['green' if n < most_apps else 'blue' for n in newcontent_count['App']]
# Bar height = number of non-null ratings per content rating.
# x/y must be passed by keyword: seaborn >= 0.12 rejects positional data arguments.
sns.barplot(x='Content Rating', y='Rating', data=newcontent_count, palette=clrs)
# Render the figure
plt.show()

###### 9. Size Distribution Plot

In [None]:
# Distribution of app sizes: 5-bin density histogram with a KDE curve on top.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is the
# replacement named by seaborn's migration guide.
# NOTE(review): the KDE curve may differ slightly from the old distplot output.
a = sns.histplot(google_apps['Size'], bins=5, color='g', kde=True, stat='density')
# Render the figure (plt.plot() with no arguments draws nothing and shows nothing)
plt.show()

###### 10. Which Category of Apps is having more Installs

In [None]:
# From the output of the code, we can conclude which category has the highest number of installs.

In [None]:
# Total installs per category, largest first
topInstalls = google_apps.groupby('Category')[['Installs']].sum().sort_values(by='Installs', ascending=False)
# Display the top 20 rows. topInstalls is a DataFrame, so calling it as
# topInstalls() raised "TypeError: 'DataFrame' object is not callable".
topInstalls.head(20)

###### 11. Which apps are the top ten most installed in each category

So, we know which category has the most installs. Now we find which apps within a category have the most installs.

In [None]:
# Bar chart of the ten most-installed apps of one category
def findtop10incategory(str):
    """Plot the 10 apps with the most installs in the given category.

    The category name is upper-cased before it is compared with the
    'Category' column of the global ``google_apps`` frame. Nothing is returned.
    """
    category = str.upper()
    in_category = google_apps[google_apps['Category'] == category]
    # Highest install counts first, keep the first ten rows
    top10apps = in_category.sort_values(by='Installs', ascending=False).head(10)
    # Canvas and title
    plt.figure(figsize=(13, 10))
    plt.title('Top 10 Installed Apps', size=15)
    # One bar per app, height = number of installs
    plot = sns.barplot(x=top10apps.App, y=top10apps.Installs, palette="terrain_r")
    # Tilt and right-align the app names so they stay readable
    tick_labels = plot.get_xticklabels()
    plot.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right')


# Example call: the category name is the only argument
findtop10incategory('FAMILY')

###### 12. Which apps have the most reviews

From the output of the code, we can conclude which apps get the most reviews, and from that we can see which types of apps users show more interest in.

In [None]:
# Mean review count per app (App becomes the index), most-reviewed apps first
highestreviews_app = (
    google_apps
    .pivot_table(values='Reviews', index=['App'], aggfunc='mean')
    .sort_values(by='Reviews', ascending=False)
)
highestreviews_app

###### 13. How do the app ratings differ between paid and free apps in general?

From this output, we can conclude: Do ratings differ between free and paid apps? Which type of app is rated better? Does the type of app affect the rating?

In [None]:
# function to generate a bar plot of a target attribute averaged per group
def plot_target_by_group(google_apps, target_col, group_col, figsize=(6,4), title=""):
    """Plot the mean of ``target_col`` for every value of ``group_col``.

    Parameters
    ----------
    google_apps : DataFrame containing both columns.
    target_col  : name of the numeric column to average (y-axis).
    group_col   : name of the column to group by (x-axis).
    figsize     : size of the created figure.
    title       : title of the plot.

    Returns
    -------
    Series indexed by the group values holding the mean of ``target_col``.
    """
    # Bars are drawn in sorted order of the distinct group values
    order = sorted(set(google_apps[group_col]))
    # Select the target column BEFORE aggregating: averaging every column and
    # indexing afterwards raises TypeError on pandas >= 2.0 when the frame
    # contains text columns, and does needless work otherwise.
    stats = google_apps.groupby(group_col)[target_col].mean()
    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(x=group_col, y=target_col, data=google_apps, ax=ax, order=order, palette="terrain_r").set_title(title)
    # NOTE(review): the y-range is hard-coded; presumably chosen for average
    # ratings - confirm before using the helper with another target column.
    ax.set(ylim=(3.8, 4.5))
    plt.xticks(rotation=90)
    return stats


In [None]:
# Bar plot of the average rating for paid vs free apps; the helper also
# returns the per-type means
stats = plot_target_by_group(
    google_apps_rating, 'Rating', 'Type',
    title="Average Rating Groupped by App Type",
)
# One line per app type with its average rating
for i, s in stats.items():
    print("{} app has average {} {}".format(i, 'Rating', s))
# Overall mean rating across all apps
mean_rating = google_apps_rating.Rating.mean()
print("Mean rating: {}".format(mean_rating))

###### 14. Rating Distribution

In [None]:
# function to display a histogram of one column
def plot_hist(google_apps, col, bins=10):
    """Draw a sky-blue histogram of ``google_apps[col]`` with ``bins`` bins.

    The x-axis is labelled with the column name, the y-axis with 'counts'.
    """
    values = google_apps[col]
    plt.hist(values, bins=bins, color="skyblue")
    plt.title('Distribution of {}'.format(col))
    plt.xlabel(col)
    plt.ylabel('counts')
    
# function to count the free and the paid apps
def compute_app_types(google_apps):
    """Return ``(n_free, n_paid)``: how many rows have Type 'Free' and 'Paid'."""
    app_type = google_apps.Type
    free_total = sum(app_type == "Free")
    paid_total = sum(app_type == 'Paid')
    return free_total, paid_total
# function to generate plots of the app types with respect to category
def plot_app_types(google_apps):
    """Draw two stacked bar charts: free apps per category, paid apps per category.

    Categories are ordered by their total number of apps (most frequent first)
    and are placed at x positions 1..n; the bars carry no category labels.
    """
    vc_rating = google_apps.Category.value_counts()
    cat_free_apps = []
    cat_paid_apps = []
    for cat in vc_rating.index:
        # Boolean mask instead of a formatted query string: a query built with
        # str.format breaks on category names that contain a quote character.
        n_free, n_paid = compute_app_types(google_apps[google_apps.Category == cat])
        cat_free_apps.append(n_free)
        cat_paid_apps.append(n_paid)

    positions = range(1, len(cat_free_apps) + 1)
    f, ax = plt.subplots(2, 1)
    # Upper chart: free apps, lower chart: paid apps
    ax[0].bar(positions, cat_free_apps)
    ax[1].bar(positions, cat_paid_apps)

# function to drop categories with fewer than 10 free or fewer than 10 paid apps
def drop_categories(google_apps):
    """Remove every category that has < 10 free apps or < 10 paid apps.

    The rows are dropped from ``google_apps`` IN PLACE and the same frame is
    returned. The names of the removed categories are printed.
    """
    is_free = google_apps.Type == "Free"
    is_paid = google_apps.Type == "Paid"
    cats_to_drop = []
    # Same iteration order as before: most frequent category first
    for cat in google_apps.Category.value_counts().index:
        # Boolean masks instead of formatted query strings: a query built with
        # str.format breaks on category names that contain a quote character.
        in_cat = google_apps.Category == cat
        n_free = int((in_cat & is_free).sum())
        n_paid = int((in_cat & is_paid).sum())
        if n_free < 10 or n_paid < 10:
            cats_to_drop.append(cat)
    if cats_to_drop:
        doomed = google_apps.index[google_apps.Category.isin(cats_to_drop)]
        google_apps.drop(doomed, axis=0, inplace=True)
    print("Deleted categories: {}".format(cats_to_drop))
    return google_apps

In [None]:
# Histogram of the rating distribution, drawn with the plot_hist function built above
plot_hist(google_apps_rating, 'Rating')

###### 15. How are the differences distributed across different app categories?

From the output of the code, we can interpret, for each category, by how much the ratings of free apps and paid apps differ, and whether users in that category prefer the Paid or the Free type.

In [None]:
# Canvas and axes for the plot
fig, ax = plt.subplots(figsize=(25, 10))
# Categories in alphabetical order
sorted_idx = sorted(paid_apps.index)
# Paid value minus free value for every category
rating_diff = paid_apps[sorted_idx] - free_apps[sorted_idx]
# One bar per category, height = the difference computed above
sns.barplot(x=sorted_idx, y=rating_diff, palette="terrain_r", ax=ax)
ax.set_title("Difference of Ratings between Paid and Free Apps Across App Categories")
# Vertical category labels for better visibility
plt.xticks(rotation=90)
# x-axis label
plt.xlabel('Category')
rating_diff

###### 16. What are the top 25 Apps on the play store that have been downloaded the most times?

From the output or visualization, we can interpret which kinds of apps have a chance of reaching an average installation count, and which 25 apps are downloaded the most in the Google Play Store.

In [None]:
# How often every app name occurs in the data (name -> number of rows)
apps = dict(google_apps.App.value_counts())

# Column-oriented dictionary built from the counts above
apps_dataframe = {'App': list(apps), 'Count': list(apps.values())}

# Two-column data frame: app name and its number of occurrences
top_apps = pd.DataFrame(apps_dataframe)
top_apps

In [None]:
# Attach the total installs to every app (joined on the shared 'App' column),
# order by installs, largest first, and keep the first 25 rows
highest_25_apps_installed = (
    top_apps
    .merge(apps_total_installs, how='left', on='App')
    .sort_values('Installs', ascending=False)
    .head(25)
)
highest_25_apps_installed

For better understanding, the code below visualizes the output shown above.

In [None]:
# Very wide canvas: 25 app names have to fit on the x-axis
plt.figure(figsize=(30, 8))
# Rotate the x tick labels for better visibility
plt.xticks(rotation=100)
# One bar per app, height = its total installs
sns.barplot(x='App', y='Installs', data=highest_25_apps_installed, palette="terrain_r")
# Title and axis labels
plt.title("Top Apps and their Total Installations", size=30)
plt.xlabel("Top 25 Apps Downloaded", size=30)
plt.ylabel("Total Downloads in  Billions", size=30)
# Render the figure
plt.show()

###### 17. Which Genre have the most Priced Apps

We know the top genres based on their installations, so from the output plot we can interpret whether the most-installed genre also has the highest-priced apps. Which genre has the most priced apps?

In [None]:
# Order the genres by installs, largest first, and keep the first 25 rows.
# NOTE(review): highest_25_genres_installed must already exist when this cell
# runs - it is not created anywhere in this section.
highest_25_genres_installed = (
    highest_25_genres_installed
    .sort_values('Installs', ascending=False)
    .head(25)
)
highest_25_genres_installed

For a better understanding of the output, we use a visualization to show which genre has the most priced apps.

In [None]:
# Very wide canvas: 25 genre names have to fit on the x-axis
plt.figure(figsize=(30, 8))
# Rotate the x tick labels for better visibility
plt.xticks(rotation=100)
# One bar per genre, height = its total installs
sns.barplot(x='Genres', y='Installs', data=highest_25_genres_installed, palette="terrain_r")
# Title and axis labels
plt.title('Top 25 Genres besed on their total Installations', size=25)
plt.xlabel('Top 25 Genres', size=25)
plt.ylabel('Total Installations Count in Billions', size=25);

###### 18. which category have the most priced apps

This output is like the one above, but here we draw the conclusions per app category. From the output plot we can interpret whether the most-installed category also has the highest-priced apps. Which category of app has the most priced apps?

In [None]:
# Number of apps per category (category -> count), most frequent first
dict1 = dict(google_apps.Category.value_counts())

# Only the category names are kept, in that order
dataframe = {'Category': list(dict1)}

# Single-column data frame of the category names
top_Category = pd.DataFrame(dataframe)
top_Category

In [None]:
# Sum of the app prices per category (Category becomes the index)
top_Category_accordingto_price = (
    google_apps[['Category', 'Price']]
    .groupby(['Category'])[['Price']]
    .sum()
)

In [None]:
# Attach the price sums to the category list, order by price sum (largest
# first) and keep at most 25 rows
final_dataframe = (
    top_Category
    .merge(top_Category_accordingto_price, how='left', on='Category')
    .sort_values('Price', ascending=False)
    .head(25)
)
final_dataframe

In [None]:
# Very wide canvas: up to 25 category names have to fit on the x-axis
plt.figure(figsize=(30, 8))
# Rotate the x tick labels for better visibility
plt.xticks(rotation=100)
# One bar per category, height = sum of the prices of its apps
sns.barplot(x='Category', y='Price', data=final_dataframe, palette="terrain_r")
# Axis labels and title
plt.xlabel('Top 25 Category', size=25)
plt.ylabel('Sum of Prices of Apps in USD', size=25)
plt.title('Top 25 Category with maximum sum of price for the apps', size=25);
# Update matplotlib's global default font sizes (all set to 20)
plt.rcParams.update({
    'font.size': 20,          # default text size
    'axes.titlesize': 20,     # axes title
    'axes.labelsize': 20,     # x and y labels
    'xtick.labelsize': 20,    # x tick labels
    'ytick.labelsize': 20,    # y tick labels
    'legend.fontsize': 20,    # legend
    'figure.titlesize': 20,   # figure title
})
# Render the figure
plt.show()

###### 19. Which Genre having the Most and Least Ratings?

From the output, we can interpret which genres are most loved by all.

In [None]:
# Mean rating per genre, best-rated genres first (Genres is the index here)
mean_ratings_accto_genres = (
    google_apps.groupby(['Genres'])[['Rating']]
    .mean()
    .sort_values('Rating', ascending=False)
)

# Distinct genre names as a one-column data frame
cols = google_apps.Genres.unique().tolist()
dict2 = {'Genres': cols}
df = pd.DataFrame(dict2)
# Left-join the two frames on the genre name
mean_ratings_accto_genres = mean_ratings_accto_genres.merge(df, how='left', on='Genres')
mean_ratings_accto_genres

In [None]:
# First ten rows = the ten genres with the highest mean rating
highest_Ratings_top_10 = mean_ratings_accto_genres.iloc[:10]
highest_Ratings_top_10

In [None]:
# Very wide canvas for the genre names
plt.figure(figsize=(30, 8))
# Rotate the x tick labels
plt.xticks(rotation=100)
# One bar per genre, height = its mean rating
sns.barplot(x='Genres', y='Rating', data=highest_Ratings_top_10, palette="terrain_r")
# Axis labels and title
plt.xlabel('Top 10 Genres with highest ratings', size=25)
plt.ylabel('Average Ratings (out of 5)', size=25)
plt.title('Genres v/s Average Ratings', size=25);

In [None]:
# Last ten rows = the ten genres with the lowest mean rating
lowest_Ratings_top_10 = mean_ratings_accto_genres.iloc[-10:]
lowest_Ratings_top_10

In [None]:
# Very wide canvas for the genre names
plt.figure(figsize=(30, 8))
# Rotate the x tick labels
plt.xticks(rotation=100)
# One bar per genre, height = its mean rating
sns.barplot(x='Genres', y='Rating', data=lowest_Ratings_top_10, palette="terrain_r")
# Axis labels and title
plt.xlabel('Top 10 Genres with lowest average ratings', size=25)
plt.ylabel('Average Ratings (out of 5)', size=25)
plt.title(' Top 10 Genres v/s Average Ratings', size=25);

###### 20. Which Category having the Most and Least Ratings?

From the output, we can interpret which categories are most loved by all.

In [None]:
# Mean rating per category, best-rated categories first (Category is the index here)
mean_ratings_accto_Category = (
    google_apps.groupby(['Category'])[['Rating']]
    .mean()
    .sort_values('Rating', ascending=False)
)

# Distinct category names as a one-column data frame
cols = google_apps.Category.unique().tolist()
dict3 = {'Category': cols}
df1 = pd.DataFrame(dict3)
# Left-join the two frames on the category name
mean_ratings_accto_Category = mean_ratings_accto_Category.merge(df1, how='left', on='Category')
mean_ratings_accto_Category

In [None]:
# First ten rows = the ten categories with the highest mean rating
highest_Ratings_top_10_Category = mean_ratings_accto_Category.iloc[:10]
highest_Ratings_top_10_Category

In [None]:
# Very wide canvas for the category names
plt.figure(figsize=(30, 8))
# Rotate the x tick labels
plt.xticks(rotation=100)
# One bar per category, height = its mean rating
sns.barplot(x='Category', y='Rating', data=highest_Ratings_top_10_Category, palette="terrain_r")
# Axis labels and title
plt.xlabel('Top 10 Category with highest ratings', size=25)
plt.ylabel('Average Ratings (out of 5)', size=25)
plt.title('Category v/s Average Ratings', size=25);

In [None]:
# Last ten rows = the ten categories with the lowest mean rating
lowest_Ratings_top_10_Category = mean_ratings_accto_Category.iloc[-10:]
lowest_Ratings_top_10_Category

In [None]:
# Very wide canvas for the category names
plt.figure(figsize=(30, 8))
# Rotate the x tick labels
plt.xticks(rotation=100)
# One bar per category, height = its mean rating
sns.barplot(x='Category', y='Rating', data=lowest_Ratings_top_10_Category, palette="terrain_r")
# Axis labels and title
plt.xlabel('Top 10 Category with lowest ratings', size=25)
plt.ylabel('Average lowest Ratings (out of 5)', size=25)
plt.title('Category v/s Average lowest Ratings', size=25);

###### 21. What are the number of apps in different categories with respect to their type(Free/paid) ?

From the output we can conclude whether free apps or paid apps are more numerous in each category.

In [None]:
# Number of apps for every (Category, Type) pair, with both keys as ordinary columns
x = google_apps.groupby(['Category', 'Type'])[['App']].count().reset_index()

# Free apps only: the count column is renamed to 'Free', the Type column removed
x1 = x.loc[x['Type'] == 'Free'].drop(columns=['Type']).rename(columns={'App': 'Free'})

# Paid apps only: the count column is renamed to 'Paid', the Type column removed
x2 = x.loc[x['Type'] == 'Paid'].drop(columns=['Type']).rename(columns={'App': 'Paid'})

Here, reset_index() moves the group keys (Category and Type) out of the index and back into ordinary columns, giving a flat dataframe.

In [None]:
# One row per category with its 'Free' and 'Paid' counts; a category without
# paid apps gets 0 instead of a missing value
total_apps_count_accto_category_df = pd.merge(x1, x2, how='left', on='Category').fillna(0)

In [None]:
# Grouped (non-stacked) bars: free and paid counts side by side for every category
counts_by_category = total_apps_count_accto_category_df.set_index('Category')
counts_by_category.plot.bar(stacked=False, figsize=(30, 7))
# Axis labels
plt.xlabel("Category")
plt.ylabel('Counts of apps w.r.t their type')
# Title
plt.title("Categories of apps with respect to their Type vs count");

###### 22. In what proportion do App Categories get installed

In [None]:
# Pie chart of each category's share of the total installs, labelled with
# whole-number percentages. Only the Installs column is summed: summing every
# column would also "add up" the text columns for no benefit.
google_apps.groupby(['Category'])[['Installs']].sum().plot(
    kind='pie', y='Installs', autopct='%1.0f%%')
# Put the legend outside the pie, to its right
plt.legend(loc='center right', bbox_to_anchor=(2, 0.5))

###### 23. Does a larger number of apps in a category have any impact on installs?

We can conclude from the graph whether the number of apps in a category has an impact on installs.

In [None]:
# Per category: number of apps and total number of installs
Installs_impact = google_apps.groupby('Category').agg(App_count=('App', 'count'), Install_count=('Installs', 'sum'))
# Category labels in the order of the grouped rows. They are read from the
# group index instead of a separately sorted unique() array, which was attached
# by position and only matched when both had the same length and order
# (a missing Category breaks that).
sort_category = Installs_impact.index.to_numpy()
Installs_impact['Category'] = sort_category
# display the first 5 rows
Installs_impact.head()

###### 24. Percentage of Free and Paid apps

The output concludes which type of apps are mostly used in the Google play store dataset

In [None]:
# Percentage of free and of paid apps among all apps, rounded to 2 decimals
totalapp_free_df = (google_apps[google_apps.Type == 'Free'].count().iloc[0] / google_apps['App'].count() * 100).round(2)
totalapp_paid_df = (google_apps[google_apps.Type == 'Paid'].count().iloc[0] / google_apps['App'].count() * 100).round(2)
plt.figure(figsize=(6, 6))
# labels of the two slices
labels = 'Free Applications', 'Paid Applications'
# Use the percentages computed above; they were previously ignored in favour
# of the hard-coded values 93.11 / 6.89, which go stale when the data changes.
sizes = [totalapp_free_df, totalapp_paid_df]
# pull the first slice (free apps) out of the pie
explode = (0.2, 0)
colors = ['pink', 'darkorange']
# generate pie plot
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
# equal aspect ratio so the pie is drawn as a circle
plt.axis('equal')
# display pie plot
plt.show()

In [None]:
# New figure; the left y-axis carries the app count
fig, ax_left = plt.subplots()
# x label of THIS figure. The code used `ax` here, a leftover variable from
# another cell that points to a different figure (or is undefined).
ax_left.set_xlabel(r'Category')
# second y-axis sharing the same x-axis (right side)
ax_right = ax_left.twinx()

# left axis: number of apps per category
ax_left.plot(Installs_impact['Category'], Installs_impact['App_count'], color='red', label='App Count')
ax_left.set_ylabel('Apps count')
ax_left.legend(loc=2)

# right axis: total installs per category
ax_right.plot(Installs_impact['Category'], Installs_impact['Install_count'], color='green', label='Installs Count')
ax_right.set_ylabel('Installs count')
ax_right.legend()
# vertical category labels
fig.autofmt_xdate(rotation=90)

# remove the grid from the left axes (same stale `ax` fix as above) ...
ax_left.grid(False)
# ... and from the current axes
plt.grid(False)
# display the plot
plt.show()

###### 25. Number of Reviews as percent of Installs

From the graph we can interpret whether the number of reviews affects the installs of the apps.

In [None]:
# Per category: number of apps, total number of reviews, total number of installs
RI = google_apps.groupby('Category').agg(
    App_count=('App', 'count'),
    Reviews_count=('Reviews', 'sum'),
    Install_count=('Installs', 'sum'),
)
print(RI)

In [None]:
# Category labels in the order of the rows of RI. They are read from RI's own
# group index instead of a separately sorted unique() array, which was attached
# by position and only matched when both had the same length and order.
x = RI.index.to_numpy()
RI['Category'] = x
RI

In [None]:
# One figure with three y-axes over a shared Category x-axis:
# `host` carries App_count, `par1` Reviews_count and `par2` Install_count.
fig, host = plt.subplots(figsize=(8,5))

par1 = host.twinx()
par2 = host.twinx()
# Fixed axis limits, set before anything is plotted.
# NOTE(review): the x range 0-5 and the y maxima are hard-coded - presumably
# taken from one run of the data (or from a template); confirm that they still
# fit RI and that restricting x to 0-5 is intended.
host.set_xlim(0, 5)
host.set_ylim(0, 2000)
par1.set_ylim(0, 523423792)
par2.set_ylim(2, 11532464253)
# axis labels
host.set_xlabel("Category")
host.set_ylabel("App_count")
par1.set_ylabel("Reviews_count")
par2.set_ylabel("Install_count")

# three colours sampled from the viridis colormap (positions 0, 0.5 and 0.9)
color1 = plt.cm.viridis(0)
color2 = plt.cm.viridis(0.5)
color3 = plt.cm.viridis(.9)

# one line per metric, each on its own y-axis
p1, = host.plot(RI['Category'], RI['App_count'],    color=color1, label="App count")
p2, = par1.plot(RI['Category'], RI['Reviews_count'],    color=color2, label="Reviews count")
p3, = par2.plot(RI['Category'], RI['Install_count'], color=color3, label="Install count")


# single legend that lists all three lines
lns = [p1, p2, p3]
host.legend(handles=lns, loc='best')

# shift the right spine of par2 outward by 60 so it does not sit on top of par1's
par2.spines['right'].set_position(('outward', 60))

# place an x tick at every category label
par2.xaxis.set_ticks(RI['Category'])
# rotate labels
fig.autofmt_xdate(rotation=90)
# remove grid
plt.grid(False)

###### 26. Do Reviews and Ratings contradict each other?

From the output we can interpret whether reviews and ratings contradict each other. Users are sometimes unsure whether to choose an app because of its rating or because of its larger number of good reviews, so this analysis compares the two.

In [None]:
# Per category: mean rating, mean number of reviews, mean number of installs
Reviews_Rating_Contradict = google_apps.groupby('Category').agg(Avg_Rating=('Rating', 'mean'), Reviews_count=('Reviews', 'mean'), Install_count=('Installs', 'mean'))
# Category labels in the order of the grouped rows. They are read from the
# group index instead of a separately sorted unique() array, which was attached
# by position and only matched when both had the same length and order.
categorycol = Reviews_Rating_Contradict.index.to_numpy()
Reviews_Rating_Contradict['Category'] = categorycol
Reviews_Rating_Contradict

In [None]:
# Three bar series drawn over each other on the same axes, in this order:
# mean installs, mean reviews, mean rating (colours come from the viridis
# samples color1..color3 defined in an earlier cell)
categories = Reviews_Rating_Contradict['Category']
bar_series = (
    ('Install_count', color1, 'Install_count'),
    ('Reviews_count', color2, 'Reviews_count'),
    ('Avg_Rating', color3, 'Average Rating'),
)
for column, colour, legend_label in bar_series:
    plt.bar(categories, Reviews_Rating_Contradict[column], color=colour, label=legend_label)
plt.legend()
# x label
plt.xlabel('Category')
# vertical category labels
plt.xticks(rotation=90)
# remove grid
plt.grid(False)
# display plot
plt.show()

###### 27. Examining how many installs an app receives, in relation to its size and version?

We can interpret at what range of size and version the largest number of apps get installed. The first visualization shows for which Android versions the installation count is highest. The second visualization shows whether this installation count is related to the size and, if so, which range of size is
suitable for an app to get installed.

In [None]:
# Joint plot of Size against Installs with a fitted regression line
# (kind="reg") and no gap between the main plot and the marginal plots
sns.jointplot(x='Installs', y='Size', data=google_apps, kind="reg", space=0, color="g")
# Render the figure
plt.show()

###### 28. Heatmap of all numerical variables to get an overview of relationships between them

In [None]:
f, ax = plt.subplots(figsize=(10, 8))
# Pairwise correlation of the numeric (and boolean) columns only:
# DataFrame.corr() raises on text columns in pandas >= 2.0, so they are
# filtered out explicitly, which works on every pandas version.
numeric_apps = google_apps.select_dtypes(include=['number', 'bool'])
# heat map of the correlation matrix, every cell annotated with one decimal
sns.heatmap(numeric_apps.corr(), annot=True, linewidths=.5, fmt='.1f', ax=ax)
# title of the plot and size
plt.title("The correlation of all variables", size=20)
# display the plot
plt.show()

###### 29. Rating Vs Reviews

In [None]:
# Square canvas
plt.figure(figsize=(10, 10))
# Scatter plus regression line of Rating against Reviews, restricted to apps
# with fewer than 1,000,000 reviews
sns.regplot(
    x="Reviews",
    y="Rating",
    color='Blue',
    data=google_apps[google_apps['Reviews'] < 1000000],
)
# Title of the plot
plt.title('Rating VS Reveiws', size=20)

###### 30. Number of reviews for each category

The output shows which category has the largest number of reviews.

In [None]:
# Mean number of reviews per category, ascending - used only for the bar order.
# .mean() replaces aggregate(np.mean), which newer pandas flags with a FutureWarning.
result1 = google_apps.groupby(["Category"])['Reviews'].mean().reset_index().sort_values('Reviews')

plt.subplots(figsize=(18, 8))
# vertical category labels
plt.xticks(rotation=90)
# Bar height = mean number of reviews of the category, without error bars.
# x/y must be passed by keyword: seaborn >= 0.12 rejects positional data arguments.
sns.barplot(x='Category', y='Reviews', ci=None, data=google_apps, order=result1['Category'], palette="terrain_r")
# title of the plot
plt.title('Number of Reviews for Each Category', size=25)
# x label with size
plt.xlabel('Category', size=25)
# display plot
plt.show()

###### 31. Average Reviews by Application Installations

In [None]:
# Mean number of reviews per install count, ascending - used only for the bar order
result3 = google_apps.groupby(["Installs"])['Reviews'].mean().reset_index().sort_values('Reviews')
# Canvas for the plot
plt.figure(figsize=(20, 10))
# Bar height = mean number of reviews of the apps with that install count, without error bars
sns.barplot(x='Installs', y='Reviews', data=google_apps, ci=None, order=result3['Installs'], palette="terrain_r")
# Tilt the x tick labels
plt.xticks(rotation=45)
# Axis labels
plt.xlabel('Application Installs')
plt.ylabel('Reviews')
# Title of the plot
plt.title('Average Reviews by Application Installations', size=20)
# Render the figure
plt.show()

###### 32. Rating Vs Size

We can conclude in which size brackets the rating tends to be high.

In [None]:
# Joint plot of Rating against Size with a fitted regression line.
# - jointplot creates its own figure, so the earlier plt.figure(...) call only
#   produced an extra empty figure and has been removed.
# - the figure size is controlled by `height`; the old `size` keyword was
#   renamed and is no longer accepted by current seaborn.
size = sns.jointplot(x="Size", y="Rating", color='darkorange', data=google_apps, height=8, kind="reg")

###### 33. Pairwise plot of Numeric Features based on Free or Paid apps

From the pairplot, we can compare free and paid apps in all aspects, such as which type of app tends to get more ratings or reviews.

In [None]:
# Build the plot data from ONE filtered frame so that every row still belongs
# to a single app. The previous code filtered each column on its own and then
# zipped the results, which pairs up values of different apps as soon as one
# column loses a row. It also overwrote the builtin `type` for the whole notebook.
pair_columns = ['Rating', 'Installs', 'Size', 'Reviews', 'Type']
# log10 is undefined for 0, so apps with 0 installs or 0 reviews are left out
has_activity = (google_apps.Installs != 0) & (google_apps.Reviews != 0)
pair_data = google_apps.loc[has_activity, pair_columns].dropna()
# installs and reviews are shown on a log10 scale
pair_data = pair_data.assign(
    Installs=np.log10(pair_data['Installs']),
    Reviews=np.log10(pair_data['Reviews']),
)
# pairwise scatter plots of the numeric features, coloured by app type (Free / Paid)
pairplot = sns.pairplot(pair_data, hue='Type', palette="Set2")

##### 34. Boxplot of Price VS Rating based on PriceBand

From the boxplot we can interpret at which price bands the ratings are low and at which price bands they are high.

In [None]:
# Assign every app to a price band (new column 'PriceBand')
price_col = google_apps['Price']
# free apps
google_apps.loc[price_col == 0, 'PriceBand'] = '0 Free'
# bands with a lower bound (exclusive) and an upper bound (inclusive)
bounded_bands = [
    (0, 0.99, '1 Cheap'),
    (0.99, 2.99, '2 Not Too Cheap'),
    (2.99, 4.99, '3 Normal'),
    (4.99, 14.99, '4 Not Too Expensive'),
    (14.99, 29.99, '5 Expensive'),
]
for lower, upper, band in bounded_bands:
    google_apps.loc[(price_col > lower) & (price_col <= upper), 'PriceBand'] = band
# everything above the last bound
google_apps.loc[price_col > 29.99, 'PriceBand'] = '6 Too Expensive'

In [None]:
# Mean rating per price band, with PriceBand kept as an ordinary column
google_apps.groupby('PriceBand', as_index=False)['Rating'].mean()

In [None]:
# Box plot of the ratings for every price band, bands ordered from free to most expensive
price = sns.catplot(x="PriceBand", y="Rating", data=google_apps, kind="box", height=10, palette="Pastel1",
                    order=["0 Free", "1 Cheap", "2 Not Too Cheap", "3 Normal", "4 Not Too Expensive", "5 Expensive", "6 Too Expensive"])
# remove the left spine
price.despine(left=True)
# vertical x labels
price.set_xticklabels(rotation=90)
# y label
price.set_ylabels("Rating")
# title of the plot
plt.title('Boxplot Rating VS Price', size=20)
# Display the plot. plt.plot() with no arguments draws nothing and does not
# show the figure; plt.show() is what was intended.
plt.show()

##### 35. Boxenplot of Rating VS Content Rating based on ContentRatingBand

Based on the boxenplot, we can see which content-rating category of applications received the highest ratings.

In [None]:
# create the content rating groups (levels): each known content rating gets a band of the same name
content_rating_bands = ('Everyone', 'Teen', 'Everyone 10+', 'Mature 17+', 'Adults only 18+', 'Unrated')
for band in content_rating_bands:
    google_apps.loc[google_apps['Content Rating'] == band, 'ContentRatingBand'] = band

In [None]:
# average rating inside each content rating band, with the band kept as a regular column
google_apps.groupby('ContentRatingBand', as_index=False)['Rating'].mean()

In [None]:
# boxen plot of Rating for every content rating, in the explicit order given below
price = sns.catplot(x="Content Rating",y="Rating",data=google_apps, kind="boxen", height = 10 ,palette = "Pastel1",
                    order=["Adults only 18+", "Everyone", "Teen", "Everyone 10+", "Mature 17+","Unrated"])
# remove the left spine of the plot
price.despine(left=True)
# rotate x labels
price.set_xticklabels(rotation=90)
# y label
price.set_ylabels("Rating")
# title of the plot
plt.title('Boxenplot Rating VS Content Rating',size = 20)
# display plot: plt.show() renders the figure, whereas plt.plot() without
# arguments draws nothing and only returns an empty list
plt.show()

#### 36. Pie plot of the percentage of reviews for each content rating

From the pie plot, we can see which content rating category accounts for the largest share of reviews.

In [None]:
# one wedge per distinct content rating present in the data
labels = google_apps['Content Rating'].unique()
# pull only the first wedge out of the pie; explode is derived from the number
# of labels because plt.pie requires exactly one explode entry per wedge
explode = tuple(0.2 if position == 0 else 0 for position in range(len(labels)))
# wedge sizes: mean number of reviews of the apps in each content rating
# NOTE(review): the wedges are proportional to the mean review count, not the
# total - confirm this is the intended meaning of "percentage of reviews"
size = [google_apps.loc[google_apps['Content Rating'] == content, 'Reviews'].mean()
        for content in labels]
# set figure size
plt.figure(figsize=(10,10))
# set colors
colors = ['pink','red','lightgreen','orange', 'lightblue','yellow']
# generate pie plot with the percentage printed on every wedge
plt.pie(size, explode=explode, labels=labels, colors=colors,autopct='%1.1f%%',
        shadow=True, startangle=90)
# equal aspect ratio so the pie is drawn as a circle
plt.axis('equal')
# title of the plot
plt.title('Percentage of Reviews for each Content Rating',size = 20)
# display plot
plt.show()

##### 37. Boxenplot of rating vs genres

From the boxenplot we cannot tell whether genres have a strong relationship with ratings.

In [None]:
# average rating per genre, listed from the lowest-rated genre to the highest
google_apps.groupby('Genres', as_index=False)['Rating'].mean().sort_values(by='Rating')

In [None]:
# statistical summary (describe) of the per-genre average ratings
google_apps.groupby('Genres', as_index=False)['Rating'].mean().describe()

In [None]:
# boxen plot of Rating per genre; every grid method below returns the grid, so the calls are chained
genre = (
    sns.catplot(x="Genres",y="Rating",data=google_apps, kind="boxen", height = 18 ,palette = "Paired")
    # remove the left spine
    .despine(left=True)
    # rotate the x labels
    .set_xticklabels(rotation=90)
    # y label
    .set_ylabels("Rating")
)
# title of the boxenplot
plt.title('Boxenplot of Rating VS Genres',size = 20)

##### 38. Barplot of average Rating grouped by Android Version.

From this plot, we can see whether there is a relationship between the rating and the Android version.

In [None]:
# mean rating per Android version, sorted from lowest to highest; this fixes the
# left-to-right order of the bars. The built-in mean() replaces aggregate(np.mean).
result5 = google_apps.groupby("Android Ver")['Rating'].mean().reset_index().sort_values('Rating')
# set size of the plot figure
plt.figure(figsize=(20,10))
# generate bar plot (bar height is the mean rating; error bars are switched off)
sns.barplot(x=google_apps['Android Ver'], y=google_apps.Rating, ci=None, order=result5['Android Ver'],palette="terrain_r")
# rotating x labels
plt.xticks(rotation = 45)
# y limit: narrow the axis to the 3.5-4.6 range
plt.ylim(3.5,4.6)
# x label
plt.xlabel('Android Version of Applications',size=20)
# y label
plt.ylabel('Rating',size=20)
# title of plot
plt.title('Average Ratings by Android Version',size=20)
# show the plot
plt.show()

##### 39. Which Category has maximum number of reviews submitted on Google Play Store

From this we can say which category receives the most reviews.

In [None]:
# some values carry a leading negative sign ('-'); this helper removes it
def positive(column):
    """Return the magnitude of the first value in ``column``.

    Only the first element is inspected, so the helper is meant for
    one-value inputs such as a single row of a one-column frame.

    Args:
        column: iterable of numbers.

    Returns:
        The absolute value of the first element, or ``None`` when
        ``column`` is empty.
    """
    missing = object()
    first = next(iter(column), missing)
    if first is missing:
        return None
    # abs() keeps a float zero as 0.0, whereas multiplying by -1 yields -0.0
    return abs(first)
# make every review count non-negative with one vectorised abs() over the
# column instead of a Python-level function call per row
google_apps['Reviews'] = google_apps['Reviews'].abs()

In [None]:
# total number of reviews per category, largest total first
category_reviews = google_apps.groupby('Category')['Reviews'].sum().sort_values(ascending=False)
# bar chart of the totals, one colour per category, with a centred title and explicit axis titles
px.bar(
    google_apps,
    x=category_reviews.index,
    y=category_reviews.values,
    color=category_reviews.index,
    title="Maximum number of Reviews for Categories",
).update_layout(xaxis_title="Category", title_x=0.5, yaxis_title="Reviews count")