In [2]:
import chart_studio.plotly as py # pip install chart_studio, to anaconda terminal conda install -c plotly chart-studio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

# Read Data

In [3]:
# Users.
# NOTE: `names=` supplies the column labels, so each file's own header line is
# parsed as the first data row (visible in the previews that follow).
u_cols = ['user_id', 'location', 'age']
users = pd.read_csv(
    '../inputs/BX-Users.csv',
    sep=';',
    names=u_cols,
    encoding='latin-1',
    low_memory=False,
)

# Books (metadata plus three cover-image URL columns).
i_cols = ['isbn', 'book_title', 'book_author', 'year_of_publication', 'publisher', 'img_s', 'img_m', 'img_1']
items = pd.read_csv(
    '../inputs/BX_Books.csv',
    sep=';',
    names=i_cols,
    encoding='latin-1',
    low_memory=False,
)

# Ratings (one row per user/book pair).
r_cols = ['user_id', 'isbn', 'rating']
ratings = pd.read_csv(
    '../inputs/BX-Book-Ratings.csv',
    sep=';',
    names=r_cols,
    encoding='latin-1',
    low_memory=False,
)

In [4]:
# Preview the first five rows of the users table.
users.head(5)

Unnamed: 0,user_id,location,age
0,User-ID,Location,Age
1,1,"nyc, new york, usa",
2,2,"stockton, california, usa",18
3,3,"moscow, yukon territory, russia",
4,4,"porto, v.n.gaia, portugal",17


In [5]:
# Preview the first five rows of the books table.
items.head(5)

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_s,img_m,img_1
0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
1,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
3,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
4,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...


In [6]:
# Preview the first five rows of the ratings table.
ratings.head(5)

Unnamed: 0,user_id,isbn,rating
0,User-ID,ISBN,Book-Rating
1,276725,034545104X,0
2,276726,0155061224,5
3,276727,0446520802,0
4,276729,052165615X,3


# Drop Header Row

In [7]:
# The CSVs were read with explicit `names`, so each file's own header line ended
# up as a data row. Remove it by value rather than by position: dropping
# "whatever is first" is not idempotent and would discard a real record every
# time this cell is re-run. Filtering keeps the original index labels.
users = users[users['user_id'] != 'User-ID']
items = items[items['isbn'] != 'ISBN']
ratings = ratings[ratings['user_id'] != 'User-ID']

In [8]:
# Re-inspect the top of the users table after the header row was dropped.
users.head()

Unnamed: 0,user_id,location,age
1,1,"nyc, new york, usa",
2,2,"stockton, california, usa",18.0
3,3,"moscow, yukon territory, russia",
4,4,"porto, v.n.gaia, portugal",17.0
5,5,"farnborough, hants, united kingdom",


In [9]:
# Re-inspect the top of the books table after the header row was dropped.
items.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_s,img_m,img_1
1,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
3,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
4,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
5,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [10]:
# Re-inspect the top of the ratings table after the header row was dropped.
ratings.head()

Unnamed: 0,user_id,isbn,rating
1,276725,034545104X,0
2,276726,0155061224,5
3,276727,0446520802,0
4,276729,052165615X,3
5,276729,0521795028,6


# Merge Datasets

In [11]:
# Join ratings onto their users, then onto the rated book's metadata.
# Both joins are inner joins, so rows without a match on the key are left out.
df = users.merge(ratings, on='user_id', how='inner')
df = items.merge(df, on='isbn', how='inner')

df.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_s,img_m,img_1,user_id,location,age,rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,"stockton, california, usa",18.0,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,"timmins, ontario, canada",,5
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,"ottawa, ontario, canada",49.0,0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,"n/a, n/a, n/a",,8
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,"sudbury, ontario, canada",,0


In [12]:
# Summarise the merged frame: per-column non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031175 entries, 0 to 1031174
Data columns (total 12 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   isbn                 1031175 non-null  object
 1   book_title           1031175 non-null  object
 2   book_author          1031174 non-null  object
 3   year_of_publication  1031175 non-null  object
 4   publisher            1031173 non-null  object
 5   img_s                1031175 non-null  object
 6   img_m                1031175 non-null  object
 7   img_1                1031175 non-null  object
 8   user_id              1031175 non-null  object
 9   location             1031175 non-null  object
 10  age                  753330 non-null   object
 11  rating               1031175 non-null  object
dtypes: object(12)
memory usage: 102.3+ MB


In [13]:
# (rows, columns) of the merged frame.
df.shape

(1031175, 12)

# Drop Null Values

In [14]:
# Discard every row that has a missing value in any column, then confirm that
# no nulls remain (all per-column counts should be zero).
df = df.dropna()
df.isna().sum()

isbn                   0
book_title             0
book_author            0
year_of_publication    0
publisher              0
img_s                  0
img_m                  0
img_1                  0
user_id                0
location               0
age                    0
rating                 0
dtype: int64

# Drop Some Columns

In [15]:
# List the column labels currently in the merged frame.
df.columns

Index(['isbn', 'book_title', 'book_author', 'year_of_publication', 'publisher',
       'img_s', 'img_m', 'img_1', 'user_id', 'location', 'age', 'rating'],
      dtype='object')

In [16]:
# Remove the identifier and image-URL columns; the remaining cells only use the
# book, location, age and rating fields.
df = df.drop(columns=['isbn', 'img_s', 'img_m', 'user_id', 'img_1'])
df.head()

Unnamed: 0,book_title,book_author,year_of_publication,publisher,location,age,rating
0,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"stockton, california, usa",18,0
2,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"ottawa, ontario, canada",49,0
5,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"toronto, ontario, canada",30,8
6,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"victoria, british columbia, canada",36,0
7,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"ottawa, ontario, canada",29,0


# Change Data Types

In [17]:
# `age` was loaded as object dtype; convert it to integers so it can be
# compared and aggregated numerically.
df = df.astype({'age': int})

# Split Location Information

In [18]:
# Break "city, state, country" into at most three pieces (n=2 keeps any extra
# commas inside the last piece). Splitting on the bare comma works whether or
# not a space follows it; stripping afterwards removes the leading blank that
# would otherwise stay attached to the state and country values.
location = df['location'].str.split(',', n=2, expand=True)
location = location.apply(lambda part: part.str.strip())

df['city'] = location[0]
df['state'] = location[1]
df['country'] = location[2]

df.head()

Unnamed: 0,book_title,book_author,year_of_publication,publisher,location,age,rating,city,state,country
0,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,"stockton, california, usa",18,0,stockton,california,usa
2,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"ottawa, ontario, canada",49,0,ottawa,ontario,canada
5,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"toronto, ontario, canada",30,8,toronto,ontario,canada
6,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"victoria, british columbia, canada",36,0,victoria,british columbia,canada
7,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"ottawa, ontario, canada",29,0,ottawa,ontario,canada


In [19]:
# Keep only rows with a non-zero age and a non-zero rating.
# `rating` is still text here (the header line was parsed as data, so pandas kept
# the column as object), and a string never equals the integer 0 -- comparing it
# directly would silently keep every row. Compare on the numeric value instead.
df = df[df['age'] != 0]
df = df[pd.to_numeric(df['rating']) != 0]

# Sunburst over the country -> state hierarchy, sized by the `age` column.
fig = px.sunburst(
    df,
    path=["country", "state"],
    values="age",
    color="state",
    hover_name="age",
    height=700,
)

fig.show()

# Sunburst of age by users' countries and states

In [20]:
# Treemap over the same country -> state hierarchy, sized by the `age` column.
fig = px.treemap(
    df,
    path=["country", "state"],
    values="age",
    color="state",
    hover_name="age",
    height=700,
)

fig.show()

# Treemap of age by users' countries and states

In [21]:
# Work on a random subset of at most 500 rows for the map below.
# A fixed seed makes the sample (and every figure built from it) identical
# across runs; capping n avoids the ValueError pandas raises when asked for
# more rows than the frame holds.
df_cut = df.sample(n=min(500, len(df)), random_state=42)

# We take a random sample of 500 rows because the full dataset is too large

In [22]:
# Count sampled rows per country. The country labels come from a plain comma
# split of `location` and may carry surrounding whitespace, so normalise them
# before grouping to keep a single entry per country.
countries = (
    df_cut.groupby(df_cut["country"].str.strip())
    .size()
    .reset_index(name='count')
)

# World map coloured by that count; locations are matched by country name.
fig = go.Figure(
    data=go.Choropleth(
        locations=countries["country"],
        locationmode='country names',
        z=countries["count"],
        colorscale='Inferno',
        autocolorscale=False,
        reversescale=False,
        marker_line_color='darkgray',
        marker_line_width=0.1,
        colorbar_title="Count",
    )
)

fig.update_layout(
    title_text="Country Distribution of Users Giving Rate",
    title_x=0.5,  # centre the title
)
fig.show()

# Distribution of the sampled ratings across users' countries around the world