# EXPLORATORY DATA ANALYSIS 1

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

## TAKEN FROM https://stats.stackexchange.com/questions/403652/two-sample-quantile-quantile-plot-in-python
## Solution provided by user: Artem Mavrin
def qqplot(x, y, quantiles=None, interpolation='nearest', ax=None, rug=False,
           rug_length=0.05, rug_kwargs=None, **kwargs):
    """Draw a two-sample quantile-quantile plot of `x` versus `y`.

    Matching quantiles of the two samples are drawn as a scatter plot (`x`
    quantiles on the horizontal axis, `y` quantiles on the vertical axis),
    followed by a black identity line spanning the combined range of the
    data. NaN entries are dropped from both samples before anything is
    computed, so columns with missing values can be passed directly.

    Parameters
    ----------
    x, y : array-like
        Numeric samples. They are flattened to one dimension and converted
        to float.

    quantiles : int or float or array-like, optional
        Quantiles to include in the plot. This can be a single probability or
        an array of probabilities in [0, 1], in which case only those
        quantiles of `x` and `y` are plotted. If this is an int `n`, the
        quantiles are `n` evenly spaced points between 0 and 1. If this is
        None, `min(len(x), len(y))` (counted after dropping NaNs) evenly
        spaced quantiles between 0 and 1 are used.

    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
        Estimation method handed to numpy.quantile() for every quantile that
        is computed. See the documentation for numpy.quantile().

    ax : matplotlib.axes.Axes, optional
        Axes on which to plot. If not provided, the current axes are used.

    rug : bool, optional
        If True, draw a rug plot representing both samples on the horizontal
        and vertical axes. If False, no rug plot is drawn.

    rug_length : float in [0, 1], optional
        Length of the rug plot lines as a fraction of the total vertical or
        horizontal axis length.

    rug_kwargs : dict of keyword arguments, optional
        Keyword arguments passed to matplotlib.axes.Axes.axvline() and
        matplotlib.axes.Axes.axhline() when drawing rug plots. They override
        the default rug settings.

    kwargs : dict of keyword arguments
        Keyword arguments passed to matplotlib.axes.Axes.scatter() when
        drawing the q-q plot.

    Returns
    -------
    None
        The plot is drawn on `ax` as a side effect.

    Raises
    ------
    ValueError
        If `x` or `y` contains no non-NaN value.
    """
    import numbers
    import numpy as np

    # Work on flat float copies and drop missing values: a single NaN would
    # otherwise turn every quantile and the identity-line endpoints into NaN.
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]
    if x.size == 0 or y.size == 0:
        raise ValueError(
            "qqplot requires at least one non-NaN value in both `x` and `y`")

    # Get current axes if none are provided (only after validation, so a
    # failing call does not create an empty figure).
    if ax is None:
        ax = plt.gca()

    if quantiles is None:
        quantiles = min(x.size, y.size)

    # Turn the `quantiles` argument into a sorted 1-D array of probabilities.
    if isinstance(quantiles, numbers.Integral):
        probs = np.linspace(start=0, stop=1, num=int(quantiles))
    else:
        # atleast_1d first: np.sort() rejects 0-d input such as a bare 0.5.
        probs = np.sort(np.atleast_1d(quantiles))

    def sample_quantiles(values):
        """Return the `probs` quantiles of `values` on old and new NumPy."""
        try:
            # `method` replaced the deprecated `interpolation` keyword in
            # NumPy 1.22.
            return np.quantile(values, probs, method=interpolation)
        except TypeError:
            # Older NumPy releases only accept the legacy keyword name.
            return np.quantile(values, probs, interpolation=interpolation)

    x_quantiles = sample_quantiles(x)
    y_quantiles = sample_quantiles(y)

    # Combined data range, used for the endpoints of the identity line.
    minimum = min(x.min(), y.min())
    maximum = max(x.max(), y.max())

    # Draw the rug plots if requested
    if rug:
        # Default rug settings, overridden by any user-specified settings
        overrides = rug_kwargs if rug_kwargs is not None else {}
        rug_x_params = {**dict(ymin=0, ymax=rug_length, c='gray', alpha=0.5),
                        **overrides}
        rug_y_params = {**dict(xmin=0, xmax=rug_length, c='gray', alpha=0.5),
                        **overrides}

        for point in x:
            ax.axvline(point, **rug_x_params)
        for point in y:
            ax.axhline(point, **rug_y_params)

    # Draw the q-q plot
    ax.scatter(x_quantiles, y_quantiles, **kwargs)

    # Identity line y = x. axline() refuses two identical points, so when all
    # data share a single value fall back to the point-plus-slope form.
    if maximum > minimum:
        ax.axline([minimum, minimum], [maximum, maximum], color='k')
    else:
        ax.axline([minimum, minimum], slope=1, color='k')

Mount Google Drive

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Use Kaggle username and key

In [None]:
import os
# Export the Kaggle credentials as environment variables (presumably read by
# the `kaggle` command-line client invoked in the cells below).
# NOTE(review): the values look like placeholders -- replace them with your own
# username / API key and avoid sharing the notebook with real credentials.
os.environ['KAGGLE_USERNAME'] = 'nnnnnnnnnnn'
os.environ['KAGGLE_KEY'] = 'kkkkkkkkk'


In [None]:
# Install the `kaggle` package (shell command run through IPython's `!`).
!pip install kaggle



In [None]:
# Switch the working directory to a folder on the mounted Drive so that shell
# commands using relative paths operate inside it.
# NOTE(review): 'path' is presumably a placeholder for the real folder name.
os.chdir('/content/drive/MyDrive/path')

In [None]:
!kaggle competitions download -c titanic
!unzip /content/drive/MyDrive/Colab_Notebooks/BDA/titanic.zip

titanic.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  /content/drive/MyDrive/Colab_Notebooks/BDA/titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
import pandas as pd
# Load train.csv from the Drive folder into the DataFrame `df`.
df=pd.read_csv('/content/drive/MyDrive/path/train.csv')
# Preview the first 20 rows as the cell output.
df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


### Exercise:

* Consider the columns ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'] and ignore ['Ticket', 'Cabin'].
* Plot the distribution of the previous features using histograms (numerical, ordinal) or bar plots (nominal or ordinal). Note that drawing the bar plot of a nominal attribute requires first computing its value counts: df['nominal_attribute'].value_counts().plot(kind='bar').
* Understand what may be the best option for measuring attribute centrality and dispersion (if needed) and compute them.
* Use a box plot to summarize the 'Fare' attribute. Do it again but grouping by the 'Survived' outcome. Does money buy safety (at least to some degree)?
* Split the dataset into Male and Female passengers and display a qq-plot of the 'Fare' attribute comparing the two groups. What does it mean? Is it possible to find a simple reason why this is the result or not?
* Plot a stratified barplot: the barplot should have on the x axis the 'Embarked' attribute and be stratified by the 'Pclass' attribute. Given that C = Cherbourg (France), Q = Queenstown (Ireland), S = Southampton (UK), which city had the highest percentage of emigrants from the lower social classes?
* Check whether the 'Survived' attribute is correlated with 'Pclass' or 'Sex'.
* Define a new attribute 'Children' (choose an Age threshold) and compute its correlation with 'Survived'.
* Check whether the 'Survived' attribute is correlated with 'Age', 'Fare', 'SibSp', or	'Parch'.
* Display a scatterplot between "Age" and "Fare" and compute correlation between them using multiple methods. Was the computation of correlation really needed in this case?

Hints:
* quantiles of an attribute can be computed using df['Attribute'].quantile(q)
* a stratified bar plot should not be applied directly to the dataframe. Think about the ingredients needed to create it and take a look at the pandas.crosstab function at https://pandas.pydata.org/docs/reference/api/pandas.crosstab.html
