In [1]:
# importing the modules or libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime as dt

In [2]:
# reading the csv file
df = pd.read_csv('../Downloads/audible/audible_uncleaned.csv')

In [3]:
# checking the first 10 rows
df.head(10)

Unnamed: 0,name,author,narrator,time,releasedate,language,stars,price
0,Geronimo Stilton #11 & #12,Writtenby:GeronimoStilton,Narratedby:BillLobely,2 hrs and 20 mins,04-08-08,English,5 out of 5 stars34 ratings,468.0
1,The Burning Maze,Writtenby:RickRiordan,Narratedby:RobbieDaymond,13 hrs and 8 mins,01-05-18,English,4.5 out of 5 stars41 ratings,820.0
2,The Deep End,Writtenby:JeffKinney,Narratedby:DanRussell,2 hrs and 3 mins,06-11-20,English,4.5 out of 5 stars38 ratings,410.0
3,Daughter of the Deep,Writtenby:RickRiordan,Narratedby:SoneelaNankani,11 hrs and 16 mins,05-10-21,English,4.5 out of 5 stars12 ratings,615.0
4,"The Lightning Thief: Percy Jackson, Book 1",Writtenby:RickRiordan,Narratedby:JesseBernstein,10 hrs,13-01-10,English,4.5 out of 5 stars181 ratings,820.0
5,The Hunger Games: Special Edition,Writtenby:SuzanneCollins,Narratedby:TatianaMaslany,10 hrs and 35 mins,30-10-18,English,5 out of 5 stars72 ratings,656.0
6,Quest for the Diamond Sword,Writtenby:WinterMorgan,Narratedby:LukeDaniels,2 hrs and 23 mins,25-11-14,English,5 out of 5 stars11 ratings,233.0
7,The Dark Prophecy,Writtenby:RickRiordan,Narratedby:RobbieDaymond,12 hrs and 32 mins,02-05-17,English,5 out of 5 stars50 ratings,820.0
8,Merlin Mission Collection,Writtenby:MaryPopeOsborne,Narratedby:MaryPopeOsborne,10 hrs and 56 mins,02-05-17,English,5 out of 5 stars5 ratings,1256.0
9,The Tyrant’s Tomb,Writtenby:RickRiordan,Narratedby:RobbieDaymond,13 hrs and 22 mins,24-09-19,English,5 out of 5 stars58 ratings,820.0


In [4]:
# checking the shape of our dataset
df.shape
print('Number of columns', df.shape[0])
print('Number of rows', df.shape[1])

Number of columns 87489
Number of rows 8


In [5]:
# checking the information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87489 entries, 0 to 87488
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         87489 non-null  object
 1   author       87489 non-null  object
 2   narrator     87489 non-null  object
 3   time         87489 non-null  object
 4   releasedate  87489 non-null  object
 5   language     87489 non-null  object
 6   stars        87489 non-null  object
 7   price        87489 non-null  object
dtypes: object(8)
memory usage: 5.3+ MB


## Data Cleaning steps

- Changing of columns headers to proper case
- Changing the datatypes (time, releasedate and price)
- Clean author and narrator columns
- Spliting the stars column into Stars and Ratings
- Change the new column's datatype

In [6]:
# Changing the column headers to proper case
df.columns = df.columns.str.title()

In [7]:
# Renamed the time column to time_mins
df.rename({"Time":"Time_mins"},axis=1, inplace=True)

In [8]:
# Function to convert time strings to minutes
def convert_to_minutes(time_str):
    parts = time_str.split('and')
    hours_minutes = [part.strip().split() for part in parts]
    total_minutes = 0
    for part in hours_minutes:
        if 'hrs' in part[1]:
            total_minutes += int(part[0]) * 60
        elif 'mins' in part[1]:
            total_minutes += int(part[0])
    return total_minutes

# Convert 'Time_mins' column to minutes
df['Time_mins'] = df['Time_mins'].apply(convert_to_minutes)
 

In [9]:
# Checking the effectiveness of the conversion
df.head(5)

Unnamed: 0,Name,Author,Narrator,Time_mins,Releasedate,Language,Stars,Price
0,Geronimo Stilton #11 & #12,Writtenby:GeronimoStilton,Narratedby:BillLobely,140,04-08-08,English,5 out of 5 stars34 ratings,468.0
1,The Burning Maze,Writtenby:RickRiordan,Narratedby:RobbieDaymond,788,01-05-18,English,4.5 out of 5 stars41 ratings,820.0
2,The Deep End,Writtenby:JeffKinney,Narratedby:DanRussell,123,06-11-20,English,4.5 out of 5 stars38 ratings,410.0
3,Daughter of the Deep,Writtenby:RickRiordan,Narratedby:SoneelaNankani,676,05-10-21,English,4.5 out of 5 stars12 ratings,615.0
4,"The Lightning Thief: Percy Jackson, Book 1",Writtenby:RickRiordan,Narratedby:JesseBernstein,600,13-01-10,English,4.5 out of 5 stars181 ratings,820.0


In [10]:
# Convert 'Releasedate' column to datetime
df['Releasedate'] = pd.to_datetime(df['Releasedate'], format='%d-%m-%y')

In [11]:
# checking the datatype of release date
df.dtypes

Name                   object
Author                 object
Narrator               object
Time_mins               int64
Releasedate    datetime64[ns]
Language               object
Stars                  object
Price                  object
dtype: object

In [12]:
# Convert 'Price' column to float
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')

In [13]:
# checking the datatype
df.dtypes

Name                   object
Author                 object
Narrator               object
Time_mins               int64
Releasedate    datetime64[ns]
Language               object
Stars                  object
Price                 float64
dtype: object

In [14]:
# printing out the author column
df["Author"]

0        Writtenby:GeronimoStilton
1            Writtenby:RickRiordan
2             Writtenby:JeffKinney
3            Writtenby:RickRiordan
4            Writtenby:RickRiordan
                   ...            
87484       Writtenby:ChrisStewart
87485      Writtenby:StephenO'Shea
87486          Writtenby:MarkTwain
87487     Writtenby:LaurenceSterne
87488      Writtenby:MarkKurlansky
Name: Author, Length: 87489, dtype: object

In [15]:
# Stripping the written by from the Authors column
df["Author"] =df["Author"].str.strip("Writtenby: ")
df["Author"]

0        GeronimoStilto
1            RickRiorda
2                 JeffK
3            RickRiorda
4            RickRiorda
              ...      
87484        ChrisStewa
87485     StephenO'Shea
87486           MarkTwa
87487         LaurenceS
87488      MarkKurlansk
Name: Author, Length: 87489, dtype: object

In [16]:
# printing out the narrator column
df["Narrator"]

0            Narratedby:BillLobely
1         Narratedby:RobbieDaymond
2            Narratedby:DanRussell
3        Narratedby:SoneelaNankani
4        Narratedby:JesseBernstein
                   ...            
87484      Narratedby:ChrisStewart
87485        Narratedby:RobertFass
87486         Narratedby:FloGibson
87487       Narratedby:AntonLesser
87488       Narratedby:FleetCooper
Name: Narrator, Length: 87489, dtype: object

In [17]:
# Stripping the narrated by from the Narrators column
df["Narrator"] =df["Narrator"].str.strip("Narratedby: ")
df["Narrator"]

0             BillLobel
1          RobbieDaymon
2            DanRussell
3        SoneelaNankani
4        JesseBernstein
              ...      
87484         ChrisStew
87485        RobertFass
87486         FloGibson
87487         AntonLess
87488         FleetCoop
Name: Narrator, Length: 87489, dtype: object

In [18]:
#cheking the dataset to see the effect of our stripping
df.head(5)

Unnamed: 0,Name,Author,Narrator,Time_mins,Releasedate,Language,Stars,Price
0,Geronimo Stilton #11 & #12,GeronimoStilto,BillLobel,140,2008-08-04,English,5 out of 5 stars34 ratings,468.0
1,The Burning Maze,RickRiorda,RobbieDaymon,788,2018-05-01,English,4.5 out of 5 stars41 ratings,820.0
2,The Deep End,JeffK,DanRussell,123,2020-11-06,English,4.5 out of 5 stars38 ratings,410.0
3,Daughter of the Deep,RickRiorda,SoneelaNankani,676,2021-10-05,English,4.5 out of 5 stars12 ratings,615.0
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiorda,JesseBernstein,600,2010-01-13,English,4.5 out of 5 stars181 ratings,820.0


In [19]:
# Spillting stars column into stars and ratings 
df[['Stars', 'Ratings']] = df['Stars'].str.split(' stars', expand=True)
df

Unnamed: 0,Name,Author,Narrator,Time_mins,Releasedate,Language,Stars,Price,Ratings
0,Geronimo Stilton #11 & #12,GeronimoStilto,BillLobel,140,2008-08-04,English,5 out of 5,468.0,34 ratings
1,The Burning Maze,RickRiorda,RobbieDaymon,788,2018-05-01,English,4.5 out of 5,820.0,41 ratings
2,The Deep End,JeffK,DanRussell,123,2020-11-06,English,4.5 out of 5,410.0,38 ratings
3,Daughter of the Deep,RickRiorda,SoneelaNankani,676,2021-10-05,English,4.5 out of 5,615.0,12 ratings
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiorda,JesseBernstein,600,2010-01-13,English,4.5 out of 5,820.0,181 ratings
...,...,...,...,...,...,...,...,...,...
87484,Last Days of the Bus Club,ChrisStewa,ChrisStew,454,2017-03-09,English,Not rated yet,596.0,
87485,The Alps,StephenO'Shea,RobertFass,607,2017-02-21,English,Not rated yet,820.0,
87486,The Innocents Abroad,MarkTwa,FloGibson,1144,2016-12-30,English,Not rated yet,938.0,
87487,A Sentimental Journey,LaurenceS,AntonLess,248,2011-02-23,English,Not rated yet,680.0,


In [20]:
# Stripping the ratings(word) from the Ratings column
df["Ratings"] =df["Ratings"].str.strip(" ratings")
df["Ratings"]

0          34
1          41
2          38
3          12
4         181
         ... 
87484    None
87485    None
87486    None
87487    None
87488    None
Name: Ratings, Length: 87489, dtype: object

In [21]:
# Checking the change effected
df.head(10)

Unnamed: 0,Name,Author,Narrator,Time_mins,Releasedate,Language,Stars,Price,Ratings
0,Geronimo Stilton #11 & #12,GeronimoStilto,BillLobel,140,2008-08-04,English,5 out of 5,468.0,34
1,The Burning Maze,RickRiorda,RobbieDaymon,788,2018-05-01,English,4.5 out of 5,820.0,41
2,The Deep End,JeffK,DanRussell,123,2020-11-06,English,4.5 out of 5,410.0,38
3,Daughter of the Deep,RickRiorda,SoneelaNankani,676,2021-10-05,English,4.5 out of 5,615.0,12
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiorda,JesseBernstein,600,2010-01-13,English,4.5 out of 5,820.0,181
5,The Hunger Games: Special Edition,SuzanneCollins,TatianaMaslan,635,2018-10-30,English,5 out of 5,656.0,72
6,Quest for the Diamond Sword,Morga,LukeDaniels,143,2014-11-25,English,5 out of 5,233.0,11
7,The Dark Prophecy,RickRiorda,RobbieDaymon,752,2017-05-02,English,5 out of 5,820.0,50
8,Merlin Mission Collection,MaryPopeOsbo,MaryPopeOsborn,656,2017-05-02,English,5 out of 5,,5
9,The Tyrant’s Tomb,RickRiorda,RobbieDaymon,802,2019-09-24,English,5 out of 5,820.0,58


In [22]:
# Cnverting Ratings(Object) to an integer 
df['Ratings'] = pd.to_numeric(df['Ratings'], errors='coerce')
df['Ratings'] = df['Ratings'].fillna(0).astype(int)
df['Ratings']

0         34
1         41
2         38
3         12
4        181
        ... 
87484      0
87485      0
87486      0
87487      0
87488      0
Name: Ratings, Length: 87489, dtype: int32

In [23]:
# Checking the dataset to see the change in datatype
df.head(10)

Unnamed: 0,Name,Author,Narrator,Time_mins,Releasedate,Language,Stars,Price,Ratings
0,Geronimo Stilton #11 & #12,GeronimoStilto,BillLobel,140,2008-08-04,English,5 out of 5,468.0,34
1,The Burning Maze,RickRiorda,RobbieDaymon,788,2018-05-01,English,4.5 out of 5,820.0,41
2,The Deep End,JeffK,DanRussell,123,2020-11-06,English,4.5 out of 5,410.0,38
3,Daughter of the Deep,RickRiorda,SoneelaNankani,676,2021-10-05,English,4.5 out of 5,615.0,12
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiorda,JesseBernstein,600,2010-01-13,English,4.5 out of 5,820.0,181
5,The Hunger Games: Special Edition,SuzanneCollins,TatianaMaslan,635,2018-10-30,English,5 out of 5,656.0,72
6,Quest for the Diamond Sword,Morga,LukeDaniels,143,2014-11-25,English,5 out of 5,233.0,11
7,The Dark Prophecy,RickRiorda,RobbieDaymon,752,2017-05-02,English,5 out of 5,820.0,50
8,Merlin Mission Collection,MaryPopeOsbo,MaryPopeOsborn,656,2017-05-02,English,5 out of 5,,5
9,The Tyrant’s Tomb,RickRiorda,RobbieDaymon,802,2019-09-24,English,5 out of 5,820.0,58


In [24]:
# Splitting the stars column to do away with the "Out of 5"
df[['Stars', 'Example']] = df['Stars'].str.split(' out of 5', expand=True)
df

Unnamed: 0,Name,Author,Narrator,Time_mins,Releasedate,Language,Stars,Price,Ratings,Example
0,Geronimo Stilton #11 & #12,GeronimoStilto,BillLobel,140,2008-08-04,English,5,468.0,34,
1,The Burning Maze,RickRiorda,RobbieDaymon,788,2018-05-01,English,4.5,820.0,41,
2,The Deep End,JeffK,DanRussell,123,2020-11-06,English,4.5,410.0,38,
3,Daughter of the Deep,RickRiorda,SoneelaNankani,676,2021-10-05,English,4.5,615.0,12,
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiorda,JesseBernstein,600,2010-01-13,English,4.5,820.0,181,
...,...,...,...,...,...,...,...,...,...,...
87484,Last Days of the Bus Club,ChrisStewa,ChrisStew,454,2017-03-09,English,Not rated yet,596.0,0,
87485,The Alps,StephenO'Shea,RobertFass,607,2017-02-21,English,Not rated yet,820.0,0,
87486,The Innocents Abroad,MarkTwa,FloGibson,1144,2016-12-30,English,Not rated yet,938.0,0,
87487,A Sentimental Journey,LaurenceS,AntonLess,248,2011-02-23,English,Not rated yet,680.0,0,


In [25]:
# Calling out our Star column
df['Stars']

0                    5
1                  4.5
2                  4.5
3                  4.5
4                  4.5
             ...      
87484    Not rated yet
87485    Not rated yet
87486    Not rated yet
87487    Not rated yet
87488    Not rated yet
Name: Stars, Length: 87489, dtype: object

In [30]:
# Converting Stars(Object) to float
df['Stars'] = pd.to_numeric(df['Stars'].fillna(0), errors='coerce')
df['Stars']

0        5.0
1        4.5
2        4.5
3        4.5
4        4.5
        ... 
87484    0.0
87485    0.0
87486    0.0
87487    0.0
87488    0.0
Name: Stars, Length: 87489, dtype: float64

In [27]:
# dropping or removing the Example column created earlier
df = df.drop(columns=['Example'])

In [32]:
# Calling out our dataframe to see the effect of the dropped column
df

Unnamed: 0,Name,Author,Narrator,Time_mins,Releasedate,Language,Stars,Price,Ratings
0,Geronimo Stilton #11 & #12,GeronimoStilto,BillLobel,140,2008-08-04,English,5.0,468.0,34
1,The Burning Maze,RickRiorda,RobbieDaymon,788,2018-05-01,English,4.5,820.0,41
2,The Deep End,JeffK,DanRussell,123,2020-11-06,English,4.5,410.0,38
3,Daughter of the Deep,RickRiorda,SoneelaNankani,676,2021-10-05,English,4.5,615.0,12
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiorda,JesseBernstein,600,2010-01-13,English,4.5,820.0,181
...,...,...,...,...,...,...,...,...,...
87484,Last Days of the Bus Club,ChrisStewa,ChrisStew,454,2017-03-09,English,0.0,596.0,0
87485,The Alps,StephenO'Shea,RobertFass,607,2017-02-21,English,0.0,820.0,0
87486,The Innocents Abroad,MarkTwa,FloGibson,1144,2016-12-30,English,0.0,938.0,0
87487,A Sentimental Journey,LaurenceS,AntonLess,248,2011-02-23,English,0.0,680.0,0


In [31]:
# Checking the first 5 in the dataframe
df.head(5)

Unnamed: 0,Name,Author,Narrator,Time_mins,Releasedate,Language,Stars,Price,Ratings
0,Geronimo Stilton #11 & #12,GeronimoStilto,BillLobel,140,2008-08-04,English,5.0,468.0,34
1,The Burning Maze,RickRiorda,RobbieDaymon,788,2018-05-01,English,4.5,820.0,41
2,The Deep End,JeffK,DanRussell,123,2020-11-06,English,4.5,410.0,38
3,Daughter of the Deep,RickRiorda,SoneelaNankani,676,2021-10-05,English,4.5,615.0,12
4,"The Lightning Thief: Percy Jackson, Book 1",RickRiorda,JesseBernstein,600,2010-01-13,English,4.5,820.0,181


In [None]:
# checking the headers again
df.head(10)

In [None]:
df.dtypes