# Exploring Data

In [5]:
from pathlib import Path

import pandas as pd

In [6]:
# Load the raw CSV (path is relative to the notebook's working directory)
# into the DataFrame that every later cell operates on.
df = pd.read_csv(filepath_or_buffer="dataset.csv")

In [7]:
# Preview the first five rows of the DataFrame (positional slice, which is
# what `head()` does with its default n=5).
print(df.iloc[:5])

                                                Name  \
0                      10-Day Green Smoothie Cleanse   
1                                  11/22/63: A Novel   
2            12 Rules for Life: An Antidote to Chaos   
3                             1984 (Signet Classics)   
4  5,000 Awesome Facts (About Everything!) (Natio...   

                     Author  User Rating  Reviews  Price  Year        Genre  
0                  JJ Smith          4.7    17350      8  2016  Non Fiction  
1              Stephen King          4.6     2052     22  2011      Fiction  
2        Jordan B. Peterson          4.7    18979     15  2018  Non Fiction  
3             George Orwell          4.7    21424      6  2017      Fiction  
4  National Geographic Kids          4.8     7665     12  2019  Non Fiction  


In [8]:
# Report the DataFrame dimensions as a (row count, column count) tuple.
print((len(df.index), len(df.columns)))

(550, 7)


In [None]:
# List the column labels; axes[1] is the DataFrame's column Index.
print(df.axes[1])

Index(['Name', 'Author', 'User Rating', 'Reviews', 'Price', 'Year', 'Genre'], dtype='object')


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the quartiles are spelled out explicitly and match the defaults.
print(df.describe(percentiles=[0.25, 0.5, 0.75]))

       User Rating       Reviews       Price         Year
count   550.000000    550.000000  550.000000   550.000000
mean      4.618364  11953.281818   13.100000  2014.000000
std       0.226980  11731.132017   10.842262     3.165156
min       3.300000     37.000000    0.000000  2009.000000
25%       4.500000   4058.000000    7.000000  2011.000000
50%       4.700000   8580.000000   11.000000  2014.000000
75%       4.800000  17253.250000   16.000000  2017.000000
max       4.900000  87841.000000  105.000000  2019.000000


# Cleaning Data

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# Author's note: this dataset was observed to contain no duplicates, so the
# call is expected to leave the frame unchanged.
df = df.drop_duplicates(keep="first")

In [14]:
# Give three columns clearer names for the analysis below:
#   Name -> Title, Year -> Publication Year, User Rating -> Rating
df = df.rename(
    {"Name": "Title", "Year": "Publication Year", "User Rating": "Rating"},
    axis="columns",
)
print(df.columns)

Index(['Title', 'Author', 'Rating', 'Reviews', 'Price', 'Publication Year',
       'Genre'],
      dtype='object')


In [None]:
# Store "Price" as 64-bit floats (it appears as whole numbers in the preview
# above) so later arithmetic on prices is done in floating point.
df["Price"] = df["Price"].astype("float64")

# Running Analysis

In [16]:
# Author popularity: number of rows (list appearances) per author, with the
# most frequent authors first.
author_counts = df["Author"].value_counts(sort=True, ascending=False)
print(author_counts)

Author
Jeff Kinney                           12
Gary Chapman                          11
Rick Riordan                          11
Suzanne Collins                       11
American Psychological Association    10
                                      ..
Keith Richards                         1
Chris Cleave                           1
Alice Schertle                         1
Celeste Ng                             1
Adam Gasiewski                         1
Name: count, Length: 248, dtype: int64


In [17]:
# Mean "Rating" within each "Genre" group.
avg_rating_by_genre = df.groupby("Genre")["Rating"].agg("mean")
print(avg_rating_by_genre)

Genre
Fiction        4.648333
Non Fiction    4.595161
Name: Rating, dtype: float64


# Exporting Results

In [None]:
# Make sure the output directory exists before writing: `to_csv` does not
# create missing parent directories and raises OSError when "results/" is
# absent (e.g. on a fresh checkout).
Path("results").mkdir(parents=True, exist_ok=True)

# export all top selling authors to a CSV file in descending order
# (`author_counts` comes from `value_counts`, which sorts by count, highest first)
author_counts.to_csv("results/top_authors.csv")

# export average rating by genre to a CSV file
avg_rating_by_genre.to_csv("results/avg_rating_by_genre.csv")