# Chapter 1

## Introduction

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the countries dataset and preview its first rows.
countries_csv = 'countries-of-the-world.csv'
df = pd.read_csv(countries_csv)
df.head()

### Scatterplot

In [None]:
# List the column labels so the exact names can be used in the cells below.
df.columns

In [None]:
# The phone column holds strings that use a comma as the decimal separator; swap it for a dot.
phones = list(df['Phones (per 1000)'].str.replace(',', '.'))

# Parse every cleaned string into a float.
phones_list = [float(value) for value in phones]

In [None]:
# Same clean-up for the literacy column: decimal comma -> dot, then string -> float.
percent_literate = list(df['Literacy (%)'].str.replace(',', '.'))
percent_literate_list = [float(value) for value in percent_literate]

In [None]:
# GDP per capita is used without any conversion; just copy the column into a list.
gdp_column = df['GDP ($ per capita)']
gdp = list(gdp_column)

In [None]:
# Scatter plot of GDP (x) against phones per 1000 (y). Both inputs are plain lists, so no data= argument
# (data frame) is needed.
sns.scatterplot(x=gdp, y=phones_list)

In [None]:
# Scatter plot of GDP (x) against literacy percentage (y), again from plain lists.
sns.scatterplot(x=gdp, y=percent_literate_list)

In [None]:
# Keeping this just for memory.
# df = df.dropna(how='any')  

### Count plot:

In [None]:
# In the last exercise, we explored a dataset that contains information about 227 countries. Let's do more exploration of this data - specifically,
# how many countries are in each region of the world?
# To do this, we'll need to use a count plot. Count plots take in a categorical list and return bars that represent the number of list entries per 
# category. You can create one here using a list of regions for each country, which is a variable named region

# Copy the Region column into a plain list and display its first five entries.
region = list(df['Region'])
region[:5]

In [None]:
# Count plot of the categorical list `region`; passing it as y= draws horizontal bars.
sns.countplot(y=region)

In [None]:
# Same count plot, but with `region` on the x-axis (vertical bars).
sns.countplot(x=region)

## Using Pandas with Seaborn

In [None]:
# Load the young-people survey responses and preview the first rows.
survey_csv = 'young-people-survey-responses.csv'
df = pd.read_csv(survey_csv)
df.head()

In [None]:
# Count plot straight from the data frame: 'Spiders' is categorical, so each bar counts how many
# rows hold a given value (ones, twos, ...) in that column.
sns.countplot(data=df, x='Spiders')

In [None]:
# Same plot, passing the Series itself; with df['column'] the data= keyword is not needed.
spiders = df['Spiders']
sns.countplot(x=spiders)

In [None]:
# Reminder: value_counts() gives the tally per category that the count plot above draws as bars.
df['Spiders'].value_counts()

## Adding a third variable with hue

In [None]:
# Load the student alcohol-consumption dataset and preview the first rows.
students_csv = 'student-alcohol-consumption.csv'
df = pd.read_csv(students_csv)
df.head()

In [None]:
# List the column labels of the student data frame.
df.columns

In [None]:
# Scatter plot of absences vs. G3 with a third variable (location) encoded as colour.
location_colors = {'Rural': 'green', 'Urban': 'blue'}
sns.scatterplot(
    data=df,
    x='absences',
    y='G3',
    hue='location',
    hue_order=['Rural', 'Urban'],
    palette=location_colors,
)

In [None]:
# Same plot with seaborn's default colours, passing each column as df['column'] instead of data=.
absences, final_grade, location = df['absences'], df['G3'], df['location']
sns.scatterplot(x=absences, y=final_grade, hue=location, hue_order=['Rural', 'Urban'])

In [None]:
# Let's continue exploring our dataset from students in secondary school by looking at a new variable. The "school" column indicates the initials of
# which school the student attended - either "GP" or "MS". 
# In the last exercise, we created a scatter plot where the plot points were colored based on whether the student lived in an urban or rural area. 
# How many students live in urban vs. rural areas, and does this vary based on what school the student attends? Let's make a count plot with 
# subgroups to find out.

# Map each location value to a bar colour.
# NOTE: the exercise text asks for green/blue; cyan/orange are used here instead.
palette_colors = {'Rural': 'cyan', 'Urban': 'orange'}

# Count plot with "school" on the x-axis, split into "location" subgroups coloured via palette_colors.
school, location = df['school'], df['location']
sns.countplot(x=school, hue=location, palette=palette_colors)

# for comparison
# sns.countplot(x=df['school'])

# Chapter 2

## Relational plots and subplots

In [None]:
# Chapter 2 starts again from the student dataset.
students_csv = 'student-alcohol-consumption.csv'
df = pd.read_csv(students_csv)
df.head()

In [None]:
# Absences vs. G3 scatter plot drawn with the figure-level relplot() instead of scatterplot().
absences, final_grade = df['absences'], df['G3']
sns.relplot(x=absences, y=final_grade, kind='scatter')

In [None]:
# One scatter plot per level of "study_time", laid out side by side in columns.
sns.relplot(
    x=df['absences'],
    y=df['G3'],
    col=df['study_time'],
    kind='scatter',
)

In [None]:
# One scatter plot per level of "study_time", this time stacked in rows.
sns.relplot(
    x=df['absences'],
    y=df['G3'],
    row=df['study_time'],
    kind='scatter',
)

In [None]:
# Let's continue looking at the dataset of students in secondary school. Here, we want to answer the following question: does 
# a student's first semester grade ("G1") tend to correlate with their final grade ("G3")?
# There are many aspects of a student's life that could result in a higher or lower final grade in the class. For example, some students receive 
# extra educational support from their school ("schoolsup") or from their family ("famsup"), which could result in higher grades. Let's try to 
# control for these two factors by creating subplots based on whether the student received extra educational support from their school or family.

# Scatter plot of first-semester grade ("G1", x-axis) against final grade ("G3", y-axis).
first_grade, final_grade = df['G1'], df['G3']
sns.relplot(x=first_grade, y=final_grade, kind='scatter')

In [None]:
# Column subplots by school support ("schoolsup"), with "yes" placed before "no".
sns.relplot(
    x=df['G1'],
    y=df['G3'],
    col=df['schoolsup'],
    col_order=['yes', 'no'],
    kind='scatter',
)

In [None]:
# Add row subplots by family support ("famsup"), "yes" before "no", so the grid is split on two
# factors: school support across the columns and family support down the rows.
yes_no_order = ['yes', 'no']
sns.relplot(
    x=df['G1'],
    y=df['G3'],
    col=df['schoolsup'],
    col_order=yes_no_order,
    row=df['famsup'],
    row_order=yes_no_order,
    kind='scatter',
)

## Customizing scatter plots

In [None]:
# Load the mpg dataset and preview the first rows.
mpg_csv = 'mpg.csv'
df = pd.read_csv(mpg_csv)
df.head()

In [None]:
# Changing the size of scatter plot points
# In this exercise, we'll explore Seaborn's mpg dataset, which contains one row per car model and includes information such as the year the car was
# made, the number of miles per gallon ("M.P.G.") it achieves, the power of its engine (measured in "horsepower"), and its country of origin.
# What is the relationship between the power of a car's engine ("horsepower") and its fuel efficiency ("mpg")? And how does this relationship vary 
# by the number of cylinders ("cylinders") the car has? Let's find out.
# Let's continue to use relplot() instead of scatterplot() since it offers more flexibility.

# Scatter plot of "horsepower" (x) vs. "mpg" (y); the point size varies with the number of
# cylinders ("cylinders").
sns.relplot(
    x=df['horsepower'],
    y=df['mpg'],
    size=df['cylinders'],
    kind='scatter',
)

In [None]:
# Easier to read: encode "cylinders" through colour (hue) as well as through point size.
cylinders = df['cylinders']
sns.relplot(
    x=df['horsepower'],
    y=df['mpg'],
    size=cylinders,
    hue=cylinders,
    kind='scatter',
)


In [None]:
# Changing the style of scatter plot points
# Let's continue exploring by looking at the relationship between how fast a car can accelerate ("acceleration") and its fuel efficiency ("mpg"). Do
# these properties vary by country of origin ("origin")?
# Note that the "acceleration" variable is the time to accelerate from 0 to 60 miles per hour, in seconds. Higher values indicate slower 
# acceleration.

# Scatter plot of "acceleration" (x) vs. "mpg" (y); both marker style and colour follow the country of
# origin ("origin"). (Note: written with column names plus data= rather than df['column'], for the record.)
sns.relplot(
    data=df,
    x='acceleration',
    y='mpg',
    hue='origin',
    style='origin',
    kind='scatter',
)



## Line plots

In [None]:
# Interpreting line plots
# In this exercise, we'll continue exploring the same data frame. How has the average miles per gallon achieved by the cars
# changed over time? Let's use line plots to find out!

# Line plot with "model_year" on the x-axis and "mpg" on the y-axis. The line is the mean of the
# observations at each x value; the shaded band is the confidence interval around that mean.
sns.relplot(data=df, x='model_year', y='mpg', kind='line')

In [None]:
# Visualizing standard deviation with line plots
# In the last exercise, we looked at how the average miles per gallon achieved by cars has changed over time. Now let's use a line plot to visualize
# how the distribution of miles per gallon has changed over time.

# Make the shaded band show the standard deviation instead of the confidence interval for the mean.
# `errorbar` selects what the band represents (a confidence interval, 'sd', ...); errorbar=None draws
# the line only. It replaces the `ci` parameter, which is deprecated since seaborn 0.12.
sns.relplot(x='model_year', y='mpg', data=df, kind='line', errorbar='sd')

In [None]:
# Line plot with "model_year" on the x-axis and "horsepower" on the y-axis, without the shaded band.
# errorbar=None replaces ci=None (`ci` is deprecated since seaborn 0.12).
sns.relplot(x='model_year', y='horsepower', data=df, kind='line', errorbar=None)

In [None]:
# Plotting subgroups in line plots
# We've seen that the average miles per gallon for cars has increased over time, but how has the average horsepower for cars changed over time? And 
# does this trend differ by country of origin?

# One line per country of origin ("origin"), differing in both line style and colour.
# errorbar=None (the successor of the deprecated ci=None) switches the shaded band off.
sns.relplot(
    x='model_year',
    y='horsepower',
    data=df,
    kind='line',
    style='origin',
    hue='origin',
    errorbar=None,
)



In [None]:
# Add a marker for each data point on the lines.
# errorbar=None (the successor of the deprecated ci=None) switches the shaded band off.
sns.relplot(
    x='model_year',
    y='horsepower',
    data=df,
    kind='line',
    style='origin',
    hue='origin',
    errorbar=None,
    markers=True,
)

In [None]:
# dashes=False draws solid lines for every country while the marker style still differs per line.
# errorbar=None (the successor of the deprecated ci=None) switches the shaded band off.
sns.relplot(
    x='model_year',
    y='horsepower',
    data=df,
    kind='line',
    style='origin',
    hue='origin',
    errorbar=None,
    markers=True,
    dashes=False,
)

# Chapter 3

## Count plots and bar plots

In [None]:
# Chapter 3 starts from the young-people survey responses.
survey_csv = 'young-people-survey-responses.csv'
df = pd.read_csv(survey_csv)
df.head()

In [None]:
# In this exercise, we'll return to exploring our dataset that contains the responses to a survey sent out to young people. We might suspect that 
# young people spend a lot of time on the internet, but how much do they report using the internet each day? Let's use a count plot to break down 
# the number of survey responses in each category and then explore whether it changes based on age.
# As a reminder, to create a count plot, we'll use the catplot() function and specify the name of the categorical variable to count (x=____), the 
# pandas DataFrame to use (data=____), and the type of plot (kind="count").

# Count plot of the survey data frame with "Internet usage" on the x-axis.
sns.catplot(data=df, x='Internet usage', kind='count')

In [None]:
# Putting the variable on y= instead of x= makes the bars horizontal.
sns.catplot(data=df, y='Internet usage', kind='count')

In [None]:
# Separate this plot into two side-by-side column subplots based on "Age Category", which separates respondents into those that are younger than 21 
# vs. 21 and older. (First, I need to create 'Age Category' column since it does not exist in my data frame).

import numpy as np

# Derive 'Age Category' from 'Age'; pd.cut bins are right-closed, so (0, 20] -> 'Less than 21'
# and (20, inf) -> '21+'.
ranges = [0, 20, np.inf]
names = ['Less than 21', '21+']
age_category = pd.cut(df['Age'], bins=ranges, labels=names)
df['Age Category'] = age_category
df

In [None]:
# Horizontal count plot of "Internet usage", one column subplot per 'Age Category'.
sns.catplot(data=df, y='Internet usage', col='Age Category', kind='count')

In [None]:
# Bar plots with percentages
# Let's continue exploring the responses to a survey sent out to young people. The variable "Interested in Math" is True if the person reported 
# being interested or very interested in mathematics, and False otherwise. What percentage of young people report being interested in math, and does
# this vary based on gender? Let's use a bar plot to find out.
# As a reminder, we'll create a bar plot using the catplot() function, providing the name of categorical variable to put on the x-axis (x=____), the
# name of the quantitative variable to summarize on the y-axis (y=____), the pandas DataFrame to use (data=____), and the type of categorical plot 
# (kind="bar").

# Goal: a bar plot with "Gender" on the x-axis and "Interested in Math" on the y-axis. The
# 'Interested in Math' column does not exist in this data frame yet, so first inspect the distinct
# values of 'Mathematics', the column it will be derived from.
df['Mathematics'].unique()

In [None]:
# Kept for reference: uncomment to drop a previously created 'Interested in Math' column.
# del(df['Interested in Math'])
# Display the data frame.
df

In [None]:
# Bin the 'Mathematics' score: pd.cut intervals are right-closed, so (0, 3] -> False and (3, 5] -> True.
ranges = [0, 3, 5]
names = [False, True]
math_interest = pd.cut(df['Mathematics'], bins=ranges, labels=names)
df['Interested in Math'] = math_interest
df[['Mathematics', 'Interested in Math']]

In [None]:
# A bar plot needs a quantitative variable on one of its axes, so turn the False/True categories into
# 0.0/1.0. Cast to float rather than bool: astype(bool) converts the NaN that pd.cut yields for missing
# 'Mathematics' answers into True, which would inflate the plotted share of interested respondents.
# As float the NaN survives and is left out of the mean.
df['Interested in Math'] = df['Interested in Math'].astype(float)
df['Interested in Math'].dtype


In [None]:
# Each bar shows the mean of 'Interested in Math' per gender; by default the confidence interval
# around each mean is drawn as well.
sns.catplot(x= 'Gender', y= 'Interested in Math', data= df, kind= 'bar')

In [None]:
# Customizing bar plots
# In this exercise, we'll explore data from students in secondary school. The "study_time" variable records each student's reported weekly study 
# time as one of the following categories: "<2 hours", "2 to 5 hours", "5 to 10 hours", or ">10 hours". Do students who report higher amounts of 
# studying tend to get better final grades? Let's compare the average final grade among students in each category using a bar plot.

# Use sns.catplot() to create a bar plot with "study_time" on the x-axis and final grade ("G3") on the y-axis, using the data frame.

In [None]:
# Load the student dataset used by the following bar-plot cells and display it.
students_csv = 'student-alcohol-consumption.csv'
df = pd.read_csv(students_csv)
df

In [None]:
# Bar plot of the final grade ("G3") per "study_time" category.
sns.catplot(data=df, x='study_time', y='G3', kind='bar')

In [None]:
# Pass the categories through `order` so the bars run from the lowest study time to the highest.
category_order = ['<2 hours', '2 to 5 hours', '5 to 10 hours', '>10 hours']
sns.catplot(
    data=df,
    x='study_time',
    y='G3',
    order=category_order,
    kind='bar',
)

In [None]:
# Same ordered bar plot without confidence intervals. errorbar=None replaces ci=None
# (`ci` is deprecated since seaborn 0.12).
sns.catplot(x='study_time', y='G3', data=df, kind='bar', order=category_order, errorbar=None)

## Box plots

In [None]:
# In an earlier exercise, we explored the relationship between studying and final grade by using a bar plot to compare the average final grade ("G3")
# among students in different categories of "study_time". 
# In this exercise, we'll try using a box plot look at this relationship instead. As a reminder, to create a box plot you'll need to use the 
# catplot() function and specify the name of the categorical variable to put on the x-axis (x=____), the name of the quantitative variable to 
# summarize on the y-axis (y=____), the pandas DataFrame to use (data=____), and the type of plot (kind="box").

# Box plot of "G3" (y-axis) per "study_time" category (x-axis) from the student data frame, with the
# categories arranged according to study_time_order.
study_time_order = ['<2 hours', '2 to 5 hours', '5 to 10 hours', '>10 hours']
sns.catplot(
    data=df,
    x='study_time',
    y='G3',
    order=study_time_order,
    kind='box',
)

In [None]:
# Omitting outliers
# Now let's use the dataset to compare the distribution of final grades ("G3") between students who have internet access at home and those who don't.
# To do this, we'll use the "internet" variable, which is a binary (yes/no) indicator of whether the student has internet access at home.
# Since internet may be less accessible in rural areas, we'll add subgroups based on where the student lives. For this, we can use the "location" 
# variable, which is an indicator of whether a student lives in an urban ("Urban") or rural ("Rural") location.

# Box plot with "internet" on the x-axis and "G3" on the y-axis.
sns.catplot(data=df, x='internet', y='G3', kind='box')

In [None]:
# Split every box into subgroups coloured by "location".
sns.catplot(data=df, x='internet', y='G3', hue='location', kind='box')

In [None]:
# Do not display the outliers. showfliers=False is the matplotlib box-plot switch for hiding outlier
# points; it replaces sym="", which newer seaborn versions no longer forward to matplotlib.
sns.catplot(x='internet', y='G3', data=df, kind='box', hue='location', showfliers=False)

In [None]:
# Adjusting the whiskers
# In the lesson we saw that there are multiple ways to define the whiskers in a box plot. In this set of exercises, we'll continue to use the same
# dataset to compare the distribution of final grades ("G3") between students who are in a romantic relationship and those that are not. We'll use 
# the "romantic" variable, which is a yes/no indicator of whether the student is in a romantic relationship.
# Let's create a box plot to look at this relationship and try different ways to define the whiskers.

# Box plot of "G3" by relationship status with the whiskers extending to 0.5 * IQR (interquartile range).
# The x variable is "romantic", as this exercise describes; the earlier outlier exercise used "internet".
sns.catplot(x='romantic', y='G3', data=df, kind='box', whis=0.5)

In [None]:
# Whiskers extend to the 5th and 95th percentiles. The x variable is "romantic" (relationship status),
# the variable the whisker exercises compare final grades by.
sns.catplot(x='romantic', y='G3', data=df, kind='box', whis=[5, 95])

In [None]:
# Whiskers extend to the min and max values (0th and 100th percentiles). The x variable is "romantic"
# (relationship status), the variable the whisker exercises compare final grades by.
sns.catplot(x='romantic', y='G3', data=df, kind='box', whis=[0, 100])

## Point plots

In [None]:
# Customizing point plots
# Let's continue to look at data from students in secondary school, this time using a point plot to answer the question: does the quality of the 
# student's family relationship influence the number of absences the student has in school? Here, we'll use the "famrel" variable, which describes 
# the quality of a student's family relationship from 1 (very bad) to 5 (very good).
# As a reminder, to create a point plot, use the catplot() function and specify the name of the categorical variable to put on the x-axis (x=____), 
# the name of the quantitative variable to summarize on the y-axis (y=____), the pandas DataFrame to use (data=____), and the type of categorical 
# plot (kind="point").

# Point plot with "famrel" on the x-axis and the number of absences ("absences") on the y-axis.
sns.catplot(data=df, x='famrel', y='absences', kind='point')

In [None]:
# Put "caps" of size 0.2 on the ends of the confidence intervals.
sns.catplot(data=df, x='famrel', y='absences', capsize=0.2, kind='point')

In [None]:
# join=False removes the lines that connect the points of neighbouring categories.
sns.catplot(
    data=df,
    x='famrel',
    y='absences',
    capsize=0.2,
    join=False,
    kind='point',
)

In [None]:
# Point plots with subgroups
# Let's continue exploring the dataset of students in secondary school. This time, we'll ask the question: is being in a romantic relationship 
# associated with higher or lower school attendance? And does this association differ by which school the students attend? Let's find out using 
# a point plot.

# Point plot with relationship status ("romantic") on the x-axis and number of absences ("absences")
# on the y-axis; the points are coloured by the school attended ("school").
sns.catplot(data=df, x='romantic', y='absences', hue='school', kind='point')

In [None]:
# Turn off the confidence intervals. errorbar=None replaces ci=None (`ci` is deprecated since seaborn 0.12).
sns.catplot(x='romantic', y='absences', data=df, kind='point', hue='school', errorbar=None)

In [None]:
# Students with many absences may act as outliers, so plot the median number of absences instead of
# the mean by passing numpy's median as the estimator.
# errorbar=None replaces ci=None (`ci` is deprecated since seaborn 0.12).
from numpy import median

sns.catplot(
    x='romantic',
    y='absences',
    data=df,
    kind='point',
    hue='school',
    errorbar=None,
    estimator=median,
)

# Chapter 4

## Plot style and color

In [None]:
# Chapter 4 starts from the young-people survey responses; display the whole frame.
survey_csv = 'young-people-survey-responses.csv'
df = pd.read_csv(survey_csv)
df

In [None]:
# Changing style and palette
# Let's return to our dataset containing the results of a survey given to young people about their habits and preferences. We've provided the code 
# to create a count plot of their responses to the question "How often do you listen to your parents' advice?". Now let's change the style and 
# palette to make this plot easier to interpret.

# Set the style to "whitegrid" to help the audience determine the number of responses in each category. (But first I need to create a column based 
# on existing column, Parents' advice, in the data frame as):


In [None]:
# Distinct values present in the existing "Parents' advice" column.
df["Parents' advice"].unique()

In [None]:
# How often each value occurs in the "Parents' advice" column.
df["Parents' advice"].value_counts()

In [None]:
# Number of missing entries in the "Parents' advice" column.
df["Parents' advice"].isnull().sum()

In [None]:
# Create 'parents advice': each right-closed bin (0, 1], (1, 2], ..., (4, 5] of the original score
# gets one readable label, in the order listed in `names`.
ranges = list(range(6))
names = ['Never', 'Rarely', 'Sometimes', 'Often', 'Always']
advice_labels = pd.cut(df["Parents' advice"], bins=ranges, labels=names)
df['parents advice'] = advice_labels

df[["Parents' advice", 'parents advice']]

In [None]:
# Draw the count plot with the "whitegrid" style, whose grid lines help the audience read off the
# number of responses in each category.
sns.set_style('whitegrid')
sns.catplot(data=df, x='parents advice', order=names, kind='count')

In [None]:
# Switch the colour palette to the sequential palette named "Purples".
sns.set_palette('Purples')
sns.set_style('whitegrid')
sns.catplot(data=df, x='parents advice', order=names, kind='count')

In [None]:
# Switch the colour palette to the diverging palette named "RdBu".
# sns.set_palette('Purples')  # previous palette, kept for reference
sns.set_palette('RdBu')
sns.set_style('whitegrid')
sns.catplot(data=df, x='parents advice', order=names, kind='count')

In [None]:
# Changing the scale
# In this exercise, we'll continue to look at the dataset containing responses from a survey of young people. Does the percentage of people reporting
# that they feel lonely vary depending on how many siblings they have? Let's find out using a bar plot, while also exploring Seaborn's four different
# plot scales ("contexts"). (First, I need to create 'feels lonely' variable based on existing variable 'Loneliness' as:)

# Bin the 'Loneliness' score: pd.cut intervals are right-closed, so (0, 3] -> False and (3, 5] -> True.
ranges = [0, 3, 5]
names = [False, True]
lonely_flags = pd.cut(df['Loneliness'], bins=ranges, labels=names)
df['feels lonely'] = lonely_flags
df[['Loneliness', 'feels lonely']].head()

In [None]:
# Current data type of the 'feels lonely' column.
df['feels lonely'].dtype

In [None]:
# Convert the False/True categories to 0.0/1.0 so the bar plot can average them. Cast to float rather
# than bool: astype(bool) converts the NaN that pd.cut yields for missing 'Loneliness' answers into
# True, which would inflate the plotted share of lonely respondents. As float the NaN survives and is
# left out of the mean.
df['feels lonely'] = df['feels lonely'].astype(float)
df['feels lonely'].dtype

In [None]:
# Create 'number of siblings' from 'Siblings'. pd.cut intervals are right-closed:
#   (-1, 0] -> '0',  (0, 2] -> '1-2',  (2, inf) -> '3+'
# float('inf') is used for the open upper edge so this cell does not rely on numpy having been
# imported by an earlier cell.
ranges = [-1, 0, 2, float('inf')]
names = ['0', '1-2', '3+']
df['number of siblings'] = pd.cut(df['Siblings'], bins=ranges, labels=names)
df[['Siblings', 'number of siblings']].head()

In [None]:
# Use the "paper" scale ("context"), the smallest of the scale options.
sns.set_context('paper')
sns.set_palette('muted')  # deep, muted, pastel, bright, dark, and colorblind.
sns.set_style('white')
sns.catplot(data=df, x='number of siblings', y='feels lonely', kind='bar')

In [None]:
# Switch the context to "notebook" to increase the scale.
sns.set_context('notebook')
sns.catplot(data=df, x='number of siblings', y='feels lonely', kind='bar')

In [None]:
# Switch the context to "talk" to increase the scale.
sns.set_context('talk')
sns.catplot(data=df, x='number of siblings', y='feels lonely', kind='bar')

In [None]:
# Switch the context to "poster", the largest scale available.
sns.set_context('poster')
sns.catplot(data=df, x='number of siblings', y='feels lonely', kind='bar')

In [None]:
# Using a custom palette
# So far, we've looked at several things in the dataset of survey responses from young people, including their internet usage, how often they listen
# to their parents, and how many of them report feeling lonely. However, one thing we haven't done is a basic summary of the type of people answering
# this survey, including their age and gender. Providing these basic summaries is always a good practice when dealing with an unfamiliar dataset.

# The code provided will create a box plot showing the distribution of ages for male versus female respondents. Let's adjust the code to customize 
# the appearance, this time using a custom color palette.

# Box plot of 'Age' per 'Gender' using the "darkgrid" style and a custom palette built from the hex
# colour codes "#39A7D0" and "#36ADA4".
custom_palette = ['#39A7D0', '#36ADA4']

sns.set_style('darkgrid')
sns.set_palette(custom_palette)
sns.set_context('paper')  # undo the larger context set earlier in the notebook
sns.catplot(data=df, x='Gender', y='Age', kind='box')


## Titles and labels: Part 1

In [None]:
# Load the mpg dataset for the title/label exercises and preview the first rows.
mpg_csv = 'mpg.csv'
df = pd.read_csv(mpg_csv)
df.head()

In [None]:
# FacetGrids vs. AxesSubplots
# In the recent lesson, we learned that Seaborn plot functions create two different types of objects: FacetGrid objects and AxesSubplot objects. The
# method for adding a title to your plot will differ depending on the type of object it is.
# In the code provided, we've used relplot() with the miles per gallon dataset to create a scatter plot showing the relationship between a car's 
# weight and its horsepower. This scatter plot is assigned to the variable name g. Let's identify which type of object it is.

# Keep the plot object in g so that its type can be identified.
g = sns.relplot(data=df, x='weight', y='horsepower', kind='scatter')

In [None]:
# Print the class of g; the object returned by relplot() is a FacetGrid.
print(type(g))

In [None]:
# Give the FacetGrid the title "Car Weight vs. Horsepower" through its underlying figure.
g = sns.relplot(data=df, x='weight', y='horsepower', kind='scatter')
figure_title = 'Car Weight vs. Horsepower'
g.fig.suptitle(figure_title)


## Title and labels: Part 2

In [None]:
# Reload the mpg dataset and preview the first rows.
mpg_csv = 'mpg.csv'
df = pd.read_csv(mpg_csv)
df.head()

In [None]:
# Adding a title and axis labels
# Let's continue to look at the miles per gallon dataset. This time we'll create a line plot to answer the question: How does the average miles per 
# gallon achieved by cars change over time for each of the three places of origin? To improve the readability of this plot, we'll add a title and 
# more informative axis labels.
# In the code provided, we create the line plot using the lineplot() function. Note that lineplot() does not support the creation of subplots, so it
# returns an AxesSubplot object instead of a FacetGrid object.

# Before the plot titled "Average MPG Over Time" can be drawn, compute the mean mpg for every
# (model_year, origin) pair and store it in a column named 'mpg_mean'.
# The built-in groupby .mean() is used so the cell does not rely on numpy having been imported by an
# earlier cell; reset_index() turns the two group keys back into regular columns.
df1 = (
    df.groupby(['model_year', 'origin'])['mpg']
    .mean()
    .rename('mpg_mean')
    .reset_index()
)
df1.head()
# df1.shape

In [None]:
# Line plot of the yearly mean mpg, one line per origin, titled "Average MPG Over Time".
g = sns.lineplot(data=df1, x='model_year', y='mpg_mean', hue='origin')
g.set_title('Average MPG Over Time')
# Label the x-axis "Car Model Year" and the y-axis "Average MPG".
g.set_xlabel('Car Model Year')
g.set_ylabel('Average MPG')
plt.show()

In [None]:
# Print the class of g: lineplot() returns a single-plot Axes object (older matplotlib versions
# report the class as AxesSubplot).
print(type(g))

In [None]:
# Rotating x-tick labels
# In this exercise, we'll continue looking at the miles per gallon dataset. In the code provided, we create a point plot that displays the average 
# acceleration for cars in each of the three places of origin. Note that the "acceleration" variable is the time to accelerate from 0 to 60 miles 
# per hour, in seconds. Higher values indicate slower acceleration.
# Let's use this plot to practice rotating the x-tick labels. Recall that the function to rotate x-tick labels is a standalone Matplotlib function 
# and not a function applied to the plot object itself.

# Point plot of acceleration per origin, then rotate the x-tick labels by 90 degrees with the
# standalone matplotlib function plt.xticks().
sns.catplot(
    data=df,
    x='origin',
    y='acceleration',
    capsize=0.1,
    join=False,
    kind='point',
)
plt.xticks(rotation=90)

In [None]:
# Preview the first rows of the current data frame.
df.head()

## Putting it all together

In [None]:
# Load the young-people survey responses for the final exercises and preview the first rows.
survey_csv = 'young-people-survey-responses.csv'
df = pd.read_csv(survey_csv)
df.head()

In [None]:
# A new variable, 'interested in pets', will be derived from 'Pets'; first inspect the distinct
# values that column holds.
df['Pets'].unique()

In [None]:
# Bin the 'Pets' score: pd.cut intervals are right-closed, so (0, 3] -> 'No' and (3, 5] -> 'Yes'.
ranges = [0, 3, 5]
names = ['No', 'Yes']
pets_interest = pd.cut(df['Pets'], bins=ranges, labels=names)
df['interested in pets'] = pets_interest
df.head()

In [None]:
# Box plot with subgroups
# In this exercise, we'll look at the dataset containing responses from a survey given to young people. One of the questions asked of the young 
# people was: "Are you interested in having pets?" Let's explore whether the distribution of ages of those answering "yes" tends to be higher or 
# lower than those answering "no", controlling for gender.

# Use the "Blues" palette and colour the box-plot subgroups by the 'interested in pets' column.
sns.set_palette('Blues')
g = sns.catplot(
    data=df,
    x='Gender',
    y='Age',
    hue='interested in pets',
    kind='box',
)
# Title the FacetGrid "Age of Those Interested in Pets vs. Not"; y=1.03 places it slightly above the axes.
g.fig.suptitle('Age of Those Interested in Pets vs. Not', y=1.03)

In [None]:
# Bar plot with subgroups and subplots
# In this exercise, we'll return to our young people survey dataset and investigate whether the proportion of people who like techno music ("Likes 
# Techno") varies by their gender ("Gender") or where they live ("Village - town"). This exercise will give us an opportunity to practice the many 
# things we've learned throughout this course!

# A new variable, 'likes techno', will be derived from 'Techno'; first inspect the distinct values
# that column holds.
df['Techno'].unique()

In [None]:
# Bin the 'Techno' score: pd.cut intervals are right-closed, so (0, 3] -> False and (3, 5] -> True.
ranges = [0, 3, 5]
names = [False, True]
techno_flags = pd.cut(df['Techno'], bins=ranges, labels=names)
df['likes techno'] = techno_flags
df[['Techno', 'likes techno']].head()

In [None]:
# Current data type of the 'likes techno' column.
df['likes techno'].dtype

In [None]:
# Convert the False/True categories to 0.0/1.0 so the bar plot can average them. Cast to float rather
# than bool: astype(bool) converts the NaN that pd.cut yields for missing 'Techno' answers into True,
# which would inflate the plotted share of techno fans. As float the NaN survives and is left out of
# the mean.
df['likes techno'] = df['likes techno'].astype(float)
df['likes techno'].dtype

In [None]:
# Figure style "dark" with the "muted" palette.
sns.set_style('dark')
sns.set_palette('muted')

# Bar plot of 'likes techno' per place of residence, with one column subplot per "Gender".
g = sns.catplot(
    data=df,
    x='Village - town',
    y='likes techno',
    col='Gender',
    kind='bar',
)

# Title the FacetGrid "Percentage of Young People Who Like Techno".
g.fig.suptitle('Percentage of Young People Who Like Techno', y=1.03)

# Label the x-axis "Location of Residence" and the y-axis "% Who Like Techno".
g.set(xlabel='Location of Residence', ylabel='% Who Like Techno')


In [None]:
# Display the final data frame, including the columns derived above.
df

# THE END