In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def importtitanic(path='titanic.csv'):
    """Load the Titanic passenger table and apply the notebook's standard cleaning.

    Parameters
    ----------
    path : str or path-like, default 'titanic.csv'
        Location of the CSV file to read. The default keeps the original
        zero-argument call working.

    Returns
    -------
    pandas.DataFrame
        The passenger table with:

        * ``Survived``, ``PassengerId`` and ``Pclass`` converted to strings so
          plotting code treats them as categories rather than numbers,
        * rows missing ``Age`` or ``Embarked`` removed (the original row index
          is kept, not reset),
        * the ``Cabin`` column dropped.
    """
    label_columns = ['Survived', 'PassengerId', 'Pclass']

    # A single chain instead of in-place mutation: every step returns a new
    # frame, so re-running the cell always starts from the file on disk.
    return (
        pd.read_csv(path)
        .astype({column: str for column in label_columns})
        .dropna(subset=['Age', 'Embarked'])
        .drop(columns=['Cabin'])
    )

In [None]:
titanic = importtitanic()

In [None]:
titanic.info()

In [None]:
titanic.describe(include='all')

## Last week's Tableau plots, now in Python

### Fare box plot

In [None]:
titanic['Fare'].plot(kind='box')

In [None]:
titanic.loc[titanic['Fare'] < 100, ['Fare']].plot(kind='box')

In [None]:
# Keep only the Fare column for passengers who paid less than 100, as an
# independent copy so later edits cannot touch `titanic`.
fare_under_100 = titanic['Fare'].lt(100)
t_sub100 = titanic.loc[fare_under_100, ['Fare']].copy()

In [None]:
t_sub100['Fare'].plot(kind='hist')

In [None]:
# 33 bins over the 0-100 fare range, i.e. roughly 3 fare units per bin.
t_sub100['Fare'].plot.hist(bins=100 // 3)

In [None]:
titanic['Pclass'].value_counts()

In [None]:
titanic['Pclass'].value_counts().plot(kind='pie')

In [None]:
titanic['Pclass'].value_counts().plot(kind='pie',autopct="%.2f%%")

In [None]:
titanic['Pclass'].value_counts().plot(kind='bar')

In [None]:
titanic['Pclass'].value_counts()

In [None]:
# Re-order the counts by class label (1, 2, 3) instead of by frequency.
titanic['Pclass'].value_counts().loc[['1', '2', '3']]

In [None]:
# Same class-ordered counts, drawn as a bar chart.
titanic['Pclass'].value_counts().loc[['1', '2', '3']].plot.bar()

In [None]:
# Bar chart of passenger counts per class, in class order 1-2-3.
class_counts = titanic['Pclass'].value_counts()[['1', '2', '3']]
ax = class_counts.plot(kind='bar')

# Write each bar's height just above its top edge, centred on the bar.
for patch in ax.patches:
    top = patch.get_height()
    centre_x = patch.get_x() + patch.get_width() / 2
    ax.text(centre_x, top, s=f'{top}', ha='center', va='bottom')

In [None]:
# Same labelled bar chart, now with red bars and larger label text.
class_counts = titanic['Pclass'].value_counts()[['1', '2', '3']]
ax = class_counts.plot(kind='bar', color='red')

# Write each bar's height just above its top edge, centred on the bar.
for patch in ax.patches:
    top = patch.get_height()
    centre_x = patch.get_x() + patch.get_width() / 2
    ax.text(centre_x, top, s=f'{top}', ha='center', va='bottom', fontsize=16)

## Two categories

Let's look at a second category too

In [None]:
titanic[['Pclass','Sex']].value_counts().plot(kind='bar')

### This is going to be easier with seaborn

In [None]:
sns.countplot(data=titanic, x='Pclass')

In [None]:
ax = sns.countplot(data=titanic, x='Pclass')
# Label every bar with its own height. Taking the labels from
# value_counts() instead only lines up when seaborn's bar order happens to
# match descending frequency; reading them off the bars is always correct.
ax.bar_label(container=ax.containers[0])

In [None]:
sns.countplot(data=titanic, x='Pclass', hue='Sex')

In [None]:
sns.countplot(data=titanic, x='Pclass', hue='Sex', dodge=False)

In [None]:
titanic['Pclass'].value_counts().reset_index()

In [None]:
# Build a tidy two-column table (class label, number of passengers).
# The column names are set explicitly because the names produced by
# value_counts().reset_index() differ between pandas 1.x ('index', 'Pclass')
# and pandas 2.x ('Pclass', 'count').
pclass_counts = (
    titanic['Pclass']
    .value_counts()
    .rename_axis('Pclass')
    .reset_index(name='counts')
)
sns.barplot(data=pclass_counts,
            x='Pclass',
            y='counts')

In [None]:
# One count plot per sex, stacked in rows.
g = sns.FacetGrid(titanic, row="Sex")
# Pin the category order: each facet is drawn from its own subset of the
# data, so without `order` the facets can place the classes in different
# positions while sharing one set of x tick labels.
g.map_dataframe(sns.countplot, x="Pclass", order=['1', '2', '3'])

In [None]:
# One count plot per sex, side by side in columns.
g = sns.FacetGrid(titanic, col="Sex")
# Pin the category order: each facet is drawn from its own subset of the
# data, so without `order` the facets can place the classes in different
# positions while sharing one set of x tick labels.
g.map_dataframe(sns.countplot, x="Pclass", order=['1', '2', '3'])

In [None]:
g = sns.FacetGrid(titanic, col="Sex", hue='Sex')
g.map_dataframe(sns.countplot, x="Pclass", order=['1','2','3'])

In [None]:
sns.catplot(data=titanic,
            x='Pclass',
            col='Sex',
           kind='count')

In [None]:
sns.catplot(data=titanic,
            x='Pclass',
            col='Sex',
            kind='count',
            hue='Sex')

How about a stacked bar plot?

In [None]:
t2 = titanic[['Pclass','Sex']].value_counts().reset_index(name='counts')

In [None]:
t2

In [None]:
t2.pivot(index='Pclass',columns='Sex',values='counts')

In [None]:
# Reshape the long (Pclass, Sex, counts) table to one row per class and one
# column per sex, then stack the sex columns within each class bar.
t2p = t2.pivot(
    index='Pclass',
    columns=['Sex'],
    values='counts',
)
t2p.plot.bar(stacked=True)

Alternatively, `pd.crosstab` builds the same class-by-sex count table in one step:

In [None]:
pd.crosstab(titanic['Pclass'], titanic['Sex'])

In [None]:
# Cross-tabulate class against sex, then stack the sex counts per class.
class_by_sex = pd.crosstab(titanic['Pclass'], titanic['Sex'])
class_by_sex.plot.bar(stacked=True)

## Dealing with an overwhelming set of labels

In [None]:
titanic['Ticket'][:2]

In [None]:
titanic['Ticket'].value_counts().plot(kind='bar')

In [None]:
titanic['Ticket'].value_counts(ascending=True).plot(kind='barh')

In [None]:
# Count tickets among the FIRST 20 rows only (rows are sliced before counting).
titanic['Ticket'].head(20).value_counts(ascending=True).plot.barh()

In [None]:
# Count tickets among the LAST 20 rows only (rows are sliced before counting).
titanic['Ticket'].tail(20).value_counts(ascending=True).plot.barh()

In [None]:
# Count over all rows first, then keep the 20 entries with the smallest counts.
titanic['Ticket'].value_counts(ascending=True).head(20).plot.barh()

In [None]:
# Count over all rows first, then keep the 20 entries with the largest counts.
titanic['Ticket'].value_counts(ascending=True).tail(20).plot.barh()

## Color for highlighting

In [None]:
# Twenty bar colours: all blue, except positions -4, -3 and -2 which are red
# (the final entry stays blue).
c = ['blue'] * 20
c[-4:-1] = ['red'] * 3

In [None]:
c

In [None]:
# The 20 most common tickets, coloured bar-by-bar from the list `c`.
top_tickets = titanic['Ticket'].value_counts(ascending=True).tail(20)
top_tickets.plot(kind='barh', color=c)

## Lollipop

In [None]:
# Counts for the 20 most common tickets, smallest first.
v = titanic['Ticket'].value_counts(ascending=True).tail(20)
# 1-based y positions, one per ticket, for the manual pyplot charts below.
my_range = range(1, len(v) + 1)

In [None]:
v

In [None]:
v.values

In [None]:
# Lollipop stems: one horizontal line per ticket, from x=0 out to its count.
plt.hlines(y=my_range, xmin=0, xmax=v.values, color='skyblue')
# Lollipop heads: a round marker at the end of each stem.
plt.plot(v.values, my_range, "o")

In [None]:
# Lollipop stems: one horizontal line per ticket, from x=0 out to its count.
plt.hlines(y=my_range, xmin=0, xmax=v.values, color='skyblue')
# Lollipop heads: a round marker at the end of each stem.
plt.plot(v.values, my_range, "o")
 
# Replace the numeric y positions with the ticket identifiers, then add the
# title and axis names.
plt.yticks(my_range, v.index)
plt.title("A vertical lolipop plot", loc='left')
plt.xlabel('Value of the variable')
plt.ylabel('Ticket')

# Show the plot
plt.show()

## Dot plot

In [None]:
# Dot plot: the same markers as the lollipop chart, but the plt.hlines stems
# are deliberately left out.
plt.plot(v.values, my_range, "o")
 
# Replace the numeric y positions with the ticket identifiers, then add the
# title and axis names.
plt.yticks(my_range, v.index)
plt.title("A vertical dot plot", loc='left')
plt.xlabel('Value of the variable')
plt.ylabel('Ticket')

# Show the plot
plt.show()

In [None]:
# Dot plot: the same markers as the lollipop chart, but the plt.hlines stems
# are deliberately left out.
plt.plot(v.values, my_range, "o")

# Dashed red reference line at x=4.5 spanning the plotted rows; the value is an
# eyeballed guess (see the annotation text), not a computed mean.
plt.vlines(x=4.5, ymin=0, ymax=max(my_range), 
           color='red', linestyle='--')
plt.text(x=4.6, y=5, s='guess at mean')

# Replace the numeric y positions with the ticket identifiers, then add the
# title and axis names.
plt.yticks(my_range, v.index)
plt.title("A vertical dot plot", loc='left')
plt.xlabel('Value of the variable')
plt.ylabel('Ticket')

# Show the plot
plt.show()

In [None]:
# Centre every count on the guessed mean, so stems point left for tickets
# below it and right for tickets above it.
guessed_mean = 4.5
deviations = v.values - guessed_mean

# Each stem runs between 0 and the deviation, whichever side of 0 it falls on.
stem_starts = [min(0, deviation) for deviation in deviations]
stem_ends = [max(0, deviation) for deviation in deviations]
plt.hlines(y=my_range, xmin=stem_starts, xmax=stem_ends, color='skyblue')
plt.plot(deviations, my_range, "o")

# Add titles and axis names
plt.yticks(my_range, v.index)
plt.title("A vertical lolipop plot", loc='left')
plt.xlabel('Value of the variable')
plt.ylabel('Ticket')

# Show the plot
plt.show()

In [None]:
# Counts of the 20 most common tickets, shifted by the guessed mean of 4.5 so
# bars extend left or right of zero; colours come from the list `c`.
tf = titanic['Ticket'].value_counts(ascending=True).tail(20) - 4.5
tf.plot.barh(color=c)

## Dueling histograms

In [None]:
sns.histplot(data=titanic, x='Age')

In [None]:
sns.histplot(data=titanic, x='Age', hue='Pclass')

In [None]:
sns.histplot(data=titanic, x='Age', hue='Pclass', multiple='stack')

In [None]:
sns.histplot(data=titanic, x='Age', hue='Pclass', kde=True)

In [None]:
sns.histplot(data=titanic, x='Age', hue='Pclass', kde=True, multiple='stack')

In [None]:
sns.histplot(data=titanic, x='Age', hue='Pclass', multiple='dodge')

In [None]:
sns.histplot(data=titanic, x='Age', hue='Pclass', multiple='fill')

In [None]:
sns.histplot(data=titanic, x='Age', hue='Pclass', multiple='layer')

In [None]:
sns.kdeplot(data=titanic, x='Age', hue='Pclass')

In [None]:
sns.kdeplot(data=titanic, x='Age', hue='Pclass', bw_adjust=.5)

In [None]:
sns.boxplot(data=titanic, x='Pclass', y='Age')

In [None]:
sns.boxplot(data=titanic, x='Age', y='Pclass')

In [None]:
# One wide, filled Age density per passenger class, stacked in rows.
# Optional (disabled): sns.set_theme(rc={"axes.facecolor": (0, 0, 0, 0)})
# presumably makes the axes background transparent so overlapping rows do not
# hide each other - TODO confirm before enabling, it changes the global theme.
g = sns.FacetGrid(titanic, row="Pclass", aspect=5, height=3)
g.map_dataframe(sns.kdeplot, x="Age", fill=True, alpha=.2)
# Negative hspace pulls the rows together so neighbouring densities overlap.
g.fig.subplots_adjust(hspace=-.5)

In [None]:
sns.scatterplot(data=titanic.loc[titanic['Fare']<100], x='Age', y='Fare')

In [None]:
import numpy as np

In [None]:
# Independent copy holding only the numeric columns, ready for pairwise plots.
numeric_only_view = titanic.select_dtypes(include='number')
t_num = numeric_only_view.copy()

In [None]:
sns.pairplot(t_num)

In [None]:
titanic.plot(x='Parch', y='Age', kind='scatter')

In [None]:
titanic.plot(x='Parch', y='Age', kind='hexbin')

In [None]:
titanic.loc[(titanic['SibSp']>0)].plot(x='Parch', y='SibSp', kind='hexbin')

In [None]:
tf

In [None]:
# Log-scale the numeric columns for passengers travelling with at least one
# sibling/spouse. log1p(x) = log(1 + x) is defined at zero, so Parch == 0 maps
# to 0 instead of the roughly -27.6 that log(0 + 1e-12) produces, which would
# otherwise stretch the hexbin x axis and squash every other bin.
tf = np.log1p(titanic.loc[titanic['SibSp'] > 0].select_dtypes(include=np.number))
tf.plot(x='Parch', y='SibSp', kind='hexbin')

In [None]:
sns.lmplot(data=titanic, x='Parch', y='Age')

# Penguins dataset

https://allisonhorst.github.io/palmerpenguins/

In [None]:
import seaborn as sns

In [None]:
penguins = sns.load_dataset("penguins")

In [None]:
penguins.info()

In [None]:
sns.histplot(data=penguins, 
             x="flipper_length_mm")

In [None]:
sns.histplot(data=penguins, 
             x="flipper_length_mm", 
             hue="species")

In [None]:
sns.histplot(data=penguins, 
             x="flipper_length_mm", 
             hue="species", 
             multiple="stack")

In [None]:
sns.kdeplot(data=penguins, 
             x="flipper_length_mm", 
             hue="species", 
             multiple="stack")

In [None]:
g = sns.FacetGrid(penguins, col="species")
g.map_dataframe(sns.kdeplot, x="flipper_length_mm")

In [None]:
g = sns.FacetGrid(penguins, col="species")
g.map_dataframe(sns.histplot, x="flipper_length_mm")

In [None]:
sns.catplot(data=penguins, 
             x="flipper_length_mm", 
             hue="species",
            col="species")

In [None]:
sns.displot(data=penguins, 
             x="flipper_length_mm", 
             hue="species",
            col="species")

In [None]:
sns.pairplot(data=penguins)

In [None]:
sns.scatterplot(data=penguins, x='bill_length_mm', y='bill_depth_mm')

In [None]:
sns.scatterplot(data=penguins, x='body_mass_g', y='flipper_length_mm')

In [None]:
# Pairwise correlations of the numeric measurements only. The frame also holds
# string columns (species, island, sex); pandas >= 2.0 no longer drops those
# silently and raises unless numeric_only=True is passed.
penguins.corr(numeric_only=True)

In [None]:
sns.pairplot(data=penguins)

In [None]:
sns.pairplot(data=penguins, hue='species')

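Something seems fishy with the negative correlation between bill depth and flipper length, so let's look at that pair again with the points split by species.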

In [None]:
sns.jointplot(data=penguins,
              x="bill_depth_mm", 
              y='flipper_length_mm',
              hue="species")

In [None]:
sns.displot(data=penguins,
            x="flipper_length_mm",
            hue="species",
            col="species")

In [None]:
sns.displot(data=penguins, 
             x="bill_depth_mm", 
             hue="species",
            col="species")

In [None]:
penguins[['species','island']]

In [None]:
penguins.groupby(['species'])['island'].value_counts().reset_index(name='counts')

In [None]:
# Species-by-island count table. Passing values='counts' keeps the columns as
# plain island names; without it pivot() wraps them in a ('counts', island)
# MultiIndex.
(
    penguins
    .groupby(['species'])['island']
    .value_counts()
    .reset_index(name='counts')
    .pivot(index='species', columns='island', values='counts')
)

In [None]:
# Species-by-island count table used for the heatmap. Passing values='counts'
# keeps the columns as plain island names; without it pivot() wraps them in a
# ('counts', island) MultiIndex, which clutters the heatmap tick labels.
tf = (
    penguins
    .groupby(['species'])['island']
    .value_counts()
    .reset_index(name='counts')
    .pivot(index='species', columns='island', values='counts')
)

In [None]:
# Species/island pairs that never occur come out of the pivot as NaN; show
# them as a count of 0.
tf = tf.fillna(0)

In [None]:
tf

In [None]:
sns.heatmap(tf)

In [None]:
sns.boxplot(data=penguins, x='species', y='flipper_length_mm')

In [None]:
sns.boxplot(data=penguins, x='island', y='flipper_length_mm')

In [None]:
sns.boxplot(data=penguins, y='island', x='flipper_length_mm')

In [None]:
sns.boxplot(data=penguins, y='species', x='flipper_length_mm')

In [None]:
g = sns.FacetGrid(penguins, row="species", aspect=5, height=2)
g.map_dataframe(sns.histplot, x="flipper_length_mm", fill=True, alpha=.2)

In [None]:
# Save the dataset for reuse. index=False leaves out the automatic row
# numbers, which would otherwise come back as a spurious 'Unnamed: 0' column
# when the file is read again.
penguins.to_csv('penguins.csv', index=False)