# Lab | Matplotlib & Seaborn

#### Import all the necessary libraries here:

In [None]:
import numpy as np
import pandas as pd

import pylab as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

## Challenge 
#### Import the `Fitbit2` dataset and store it in a variable called `fitbit`. You can find the dataset in Ironhack's database:
* db: `fitbit`
* table: `fitbit2`

In [None]:
# Read the Fitbit CSV from the repository's data folder and preview the first rows.
fitbit = pd.read_csv(filepath_or_buffer='../data/Fitbit2.csv')
fitbit.head(n=5)

#### From the Fitbit data, we want to visually understand:

How does the average number of steps change by month? Use the appropriate visualization to show the median steps by month. Is Fitbitter more active on weekends or workdays?
**Hints**:

* Use the `Months_encoded` and `Work_or_Weekend` columns.
* Use the matplotlib.pyplot object-oriented API.
* Set your size figure to 12,4
* Explore plt.sca
* Explore plt.xticks
* Save your figures in a folder called `figures` in your repo. 

In [None]:
# Show the column labels of the Fitbit frame before picking what to group and plot.
fitbit.columns

In [None]:
# Median number of steps for each encoded month, as the exercise asks for.
# 'Steps' is selected before aggregating: the previous `.mean('Steps')` passed
# the column name as the aggregation's first positional parameter
# (numeric_only) instead of selecting the column.
repre1 = fitbit.groupby('Months_encoded')['Steps'].median()
repre1

In [None]:
# Default figure size (width, height) for the figures created after this cell.
plt.rcParams.update({'figure.figsize': (12, 4)})

In [None]:
# Bar chart of the per-month step statistic held in `repre1`, drawn on an
# explicit Axes so the figure carries its own size, title and axis labels.
fig, ax = plt.subplots(figsize=(12, 4))
sns.barplot(x=repre1.index, y=repre1, palette='magma', ax=ax)
ax.set(xlabel='Month (encoded)', ylabel='Steps', title='Steps by month');

In [None]:
# Mean number of steps for each value of the Work_or_Weekend flag.
# 'Steps' is selected before aggregating: the previous `.mean('Steps')` passed
# the column name as the aggregation's first positional parameter
# (numeric_only) instead of selecting the column.
repre2 = fitbit.groupby('Work_or_Weekend')['Steps'].mean()
repre2

In [None]:
# Bar chart of the step statistic per Work_or_Weekend group held in `repre2`,
# drawn on an explicit Axes with a title and axis labels. The trailing
# semicolon keeps the return value's repr out of the cell output.
fig, ax = plt.subplots(figsize=(12, 4))
sns.barplot(x=repre2.index, y=repre2, palette='Greens', ax=ax)
ax.set(xlabel='Work_or_Weekend', ylabel='Steps', title='Steps: workdays vs weekend');

#### Write a loop to plot 3 scatter plots of the following features:

* Minutes Lightly Active vs Steps    
* Minutes Very Active vs Steps    
* Minutes Sedentary vs Steps  

In [None]:
# Column labels again, to look up the exact names of the activity-minute columns.
fitbit.columns

In [None]:
# Activity columns to compare against the daily step count.
features = ['Minutes Lightly Active', 'Minutes Very Active', 'Minutes Sedentary']

# One figure per feature. Without a fresh Axes on every iteration, seaborn
# draws all three scatter plots on the same current Axes, overlaying three
# different x variables in a single plot.
for feature in features:
    fig, ax = plt.subplots()
    sns.scatterplot(data=fitbit, x=feature, y='Steps', ax=ax)
    ax.set_title(f'{feature} vs Steps')
    plt.show()

In [None]:
# Same three comparisons, this time side by side: one row of three Axes,
# each filled by the pandas scatter plotter.
scatter_columns = ['Minutes Lightly Active', 'Minutes Very Active', 'Minutes Sedentary']
fig, ax = plt.subplots(nrows=1, ncols=3)
for position, column in enumerate(scatter_columns):
    fitbit.plot.scatter(x=column, y='Steps', ax=ax[position]);

## Challenge 

#### Import the `titanic` dataset and store it in a variable called `titanic`. You can find the dataset in Ironhack's database:
* db: `titanic`
* table: `titanic`

In [None]:
# Read the Titanic CSV from the repository's data folder and preview the first rows.
titanic = pd.read_csv(filepath_or_buffer='../data/titanic.csv')
titanic.head(n=5)

#### Explore the titanic dataset using Pandas dtypes.

In [None]:
# Print the per-column dtype and non-null count summary of the Titanic frame.
titanic.info()

#### What are your numerical variables? What are your categorical variables?
**Hint**: Use Pandas select_dtypes.

In [None]:
# Categorical candidates: the columns stored with the object dtype.
titanic.select_dtypes(include='object').head()

In [None]:
# Numerical variables: 'number' selects every numeric dtype, so columns stored
# as e.g. int32 or float32 are kept too, not only int64/float64.
titanic.select_dtypes(include='number').head()

#### Set the plot style to classic and the figure size to (12,6).
**Hint**: To set the style you can use matplotlib or seaborn functions. Do some research on the matter.

In [None]:
# Apply the 'classic' style sheet first, then override the default figure size
# (applying the style afterwards could replace the size set here).
plt.style.use('classic')
plt.rcParams.update({'figure.figsize': (12, 6)})

#### Use the right visualization to show the distribution of column `Age`.

In [None]:
# Histogram of passenger ages through the pandas plotting interface.
titanic['Age'].plot(kind='hist');

#### Use subplots and plot the distribution of the `Age`  with bins equal to 10, 20 and 50.

In [None]:
# Age histogram at three bin counts, one stacked subplot per setting.
# (The loop body previously ended with a line holding only `;`, which is a
# SyntaxError and stopped the cell from running.)
bins1 = [10, 20, 50]
fig, ax = plt.subplots(len(bins1), 1)
for axis, n_bins in zip(ax, bins1):
    sns.histplot(x=titanic.Age, bins=n_bins, ax=axis)
    axis.set_title(f'Age distribution with {n_bins} bins')
# Keep the stacked titles and axis labels from overlapping.
fig.tight_layout();

#### How does the bin size affect your plot?

In [None]:
"""
We lose some information when there are too few bins.
"""

#### Use seaborn to show the distribution of column `Age`.

In [None]:
# Seaborn histogram of the Age column, addressed by name within the frame.
sns.histplot(data=titanic, x='Age');

#### Use the right plot to visualize column `Gender`. There are 2 ways of doing it. Do it both ways.
**Hint**: Use matplotlib and seaborn.

In [None]:
# Method 1 - matplotlib (through the pandas plotting accessor):
# one bar per Gender value, bar height = number of rows with that value.
titanic['Gender'].value_counts().plot.bar();

In [None]:
# Method 2 - seaborn: count the rows per Gender value, then draw the counts as bars.
repre4 = titanic['Gender'].value_counts()
sns.barplot(palette='magma', x=repre4.index, y=repre4);

#### Use the right plot to visualize the column `Pclass`.

In [None]:
# Count the rows per Pclass value, then draw the counts as bars.
repre5 = titanic['Pclass'].value_counts()
sns.barplot(palette='flare', x=repre5.index, y=repre5);

#### We would like to have in one single plot the summary statistics of the feature `Age`. What kind of plot would you use? Plot it. 

In [None]:
# Box plot of the Age column: a single plot summarising its quartiles,
# whiskers and outliers. The Series is passed as `data`, so seaborn decides
# the orientation and labelling itself.
sns.boxplot(data=titanic.Age);

In [None]:
"""
En el boxplot vemos max, min, IQR, mediana, outliers...
"""

#### What does the last plot tell you about the feature `Age`?

In [None]:
"""
La mediana está por debajo de los 30, la mayoría de las personas están entre los 20 y los 37, el max en los 80, los de 
60+ son outliers...
"""

#### Now in addition to the summary statistics, we want to have in the same plot the distribution of `Age`. What kind of plot would you use? Plot it. 

In [None]:
# Violin plot of the Age column: shows the summary statistics together with
# the shape of the distribution in one plot.
sns.violinplot(data=titanic.Age, palette='flare');

#### What additional information does the last plot provide about feature `Age`?

In [None]:
"""
Que la mayor cantidad de gente se agrupa en las edades entre 20 y 30 años, que a mayor edad menos cantidad de gente, 
que había muy pocos adolescentes...
"""

#### We suspect that there is a linear relationship between `Fare` and `Age`. Use the right plot to show the relationship between these 2 features. There are 2 ways, please do it both ways.
**Hint**: Use matplotlib and seaborn.

In [None]:
# Correlation matrix of the numeric columns shown as an image.
# Non-numeric columns are dropped first: on pandas >= 2.0 `DataFrame.corr()`
# no longer skips them silently and raises on string columns.
plt.matshow(titanic.select_dtypes(include='number').corr());

In [None]:
# Build the correlation matrix in this cell: `corr` is otherwise only assigned
# further down the notebook, so a fresh top-to-bottom run would raise a
# NameError here. Only numeric columns are correlated.
corr = titanic.select_dtypes(include='number').corr(method='pearson')

# Colour every cell on one shared scale (axis=None) instead of per row/column.
corr.style.background_gradient(cmap='coolwarm', axis=None)

In [None]:
# Column labels of the Titanic frame, to look up the exact names to plot.
titanic.columns

In [None]:
# Way 1 - matplotlib (through pandas): Fare against Age, one point per row.
titanic.plot(kind='scatter', x='Age', y='Fare');

In [None]:
# Way 2 - seaborn: the same Fare-against-Age scatter, passing the two columns directly.
sns.scatterplot(x=titanic['Age'], y=titanic['Fare']);

#### Plot the correlation matrix using seaborn.

In [None]:
# Pearson correlation between the numeric columns only. Non-numeric columns
# are dropped first because on pandas >= 2.0 `DataFrame.corr()` raises on
# string columns instead of skipping them.
corr = titanic.select_dtypes(include='number').corr(method='pearson')

In [None]:
# White seaborn theme for the plots drawn from here on.
sns.set(style='white')

# Boolean mask that is True on the upper triangle (diagonal included); the
# heatmap hides those cells so each pair of features appears only once.
mascara = np.triu(np.ones_like(corr, dtype=bool))

# Diverging colormap with clearly different hues at each end, so negative and
# positive correlations can be told apart (the previous hues 0 and 10 were
# both red).
cmap = sns.diverging_palette(220, 10, as_cmap=True)

sns.heatmap(
    corr,
    mask=mascara,
    cmap=cmap,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,  # documented heatmap parameter name (was `linewidth`)
    cbar_kws={'shrink': 0.5},
    annot=True,
);

#### What are the most correlated features?

In [None]:
"""
Fare con Pclass, luego Age con Pclass y luego Survived con Pclass.
"""

#### Use the most appropriate plot to display the summary statistics of `Age` depending on `Pclass`.

In [None]:
# One Age box per passenger class, to compare the summary statistics across classes.
sns.boxplot(x='Pclass', y='Age', data=titanic);

#### Use seaborn to plot the distribution of `Age` based on the `Gender`.
**Hint**: Use Facetgrid.

In [None]:
# Distribution of Age per Gender value: one histogram per facet, side by side.
# A box plot only shows summary statistics; the exercise asks for the
# distribution and hints at FacetGrid.
g = sns.FacetGrid(titanic, col='Gender', height=5)
g.map(sns.histplot, 'Age');