## pandas-profiling Meteorites example
Source of data: https://data.nasa.gov/Space-Science/Meteorite-Landings/gh4g-9sfh

### Import libraries

In [None]:
import pandas as pd
import pandas_profiling
import numpy as np

### Load and prepare example dataset
We add some synthetic variables to illustrate the capabilities of pandas-profiling.

In [None]:
# Seed NumPy's global RNG so the synthetic example columns are identical on every run.
np.random.seed(0)

df = pd.read_csv("examples/Meteorite_Landings.csv", parse_dates=['year'], encoding='UTF-8')

# Note: pandas nanosecond timestamps only span roughly the years 1677-2262, so
# years outside that range cannot be represented; they are coerced to NaT and
# therefore ignored in this analysis.
df['year'] = pd.to_datetime(df['year'], errors='coerce')

# Example: Constant variable
df['source'] = "NASA"

# Example: Boolean variable
df['boolean'] = np.random.choice([True, False], df.shape[0])

# Example: Mixed with base types
# The pool must be an object-dtype array: NumPy would otherwise coerce [1, "A"]
# to an all-string array and the column would hold "1"/"A" instead of int and str.
df['mixed'] = np.random.choice(np.array([1, "A"], dtype=object), df.shape[0])

# Example: Highly correlated variables
df['reclat_city'] = df['reclat'] + np.random.normal(scale=5, size=len(df))

# Example: Duplicate observations
# Work on an explicit copy so renaming the rows never touches (or warns about) `df`.
duplicates_to_add = df.iloc[0:10].copy()
duplicates_to_add['name'] = duplicates_to_add['name'] + " copy"

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df = pd.concat([df, duplicates_to_add], ignore_index=True)

### Inline report without saving object

In [None]:
# Build the report as the cell's last expression so the notebook displays it
# inline; the report object is not bound to any name.
pandas_profiling.ProfileReport(df)

### Save report to file

In [None]:
# Keep a reference to the report so it can be written out here and shown again
# in a later cell.
pfr = pandas_profiling.ProfileReport(df)
# Write the report to the given path.
# NOTE(review): "/tmp" is a hard-coded absolute path that only exists on
# Unix-like systems — consider a configurable output directory.
pfr.to_file("/tmp/example.html")

#### Print existing ProfileReport object inline

In [None]:
# A bare name as the last expression makes the notebook display the existing
# report object without rebuilding it.
pfr

In [2]:
import numpy as np
import pandas as pd
import pandas_profiling

# Small string-valued toy dataset, written one record per row in the order
# (porta, combustivel, body-style).
df = pd.DataFrame(
    [
        ('duas', 'gas', 'sedan'),
        ('quatro', 'diesel', 'hatch'),
        ('duas', 'gas', 'hatch'),
        ('quatro', 'gas', 'sedan'),
        ('quatro', 'diesel', 'sedan'),
        ('duas', 'diesel', 'hatch'),
        ('duas', 'gas', 'hatch'),
    ],
    columns=['porta', 'combustivel', 'body-style'],
)

# Profile the toy frame; the bare name on the last line displays the report inline.
pfr = pandas_profiling.ProfileReport(df)
pfr

  variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)


{'porta': {'values': ['duas', 'quatro'], 'start_i': 0, 'end_i': 2}, 'combustivel': {'values': ['gas', 'diesel'], 'start_i': 2, 'end_i': 4}, 'body-style': {'values': ['sedan', 'hatch'], 'start_i': 4, 'end_i': 6}} [[0.0, 0.0, 3.0, 1.0, 1.0, 3.0], [0.0, 0.0, 1.0, 2.0, 2.0, 1.0], [3.0, 1.0, 0.0, 0.0, 2.0, 2.0], [1.0, 2.0, 0.0, 0.0, 1.0, 2.0], [1.0, 2.0, 2.0, 1.0, 0.0, 0.0], [3.0, 1.0, 2.0, 2.0, 0.0, 0.0]]


0,1
Total size in memory,296.0 B
Average record size in memory,42.3 B

0,1
Distinct count,2
Unique (%),28.6%
Missing (%),0.0%
Missing (n),0

0,1
duas,4
quatro,3

Value,Count,Frequency (%),Unnamed: 3
duas,4,57.1%,
quatro,3,42.9%,

0,1
Distinct count,2
Unique (%),28.6%
Missing (%),0.0%
Missing (n),0

0,1
gas,4
diesel,3

Value,Count,Frequency (%),Unnamed: 3
gas,4,57.1%,
diesel,3,42.9%,

0,1
Distinct count,2
Unique (%),28.6%
Missing (%),0.0%
Missing (n),0

0,1
hatch,4
sedan,3

Value,Count,Frequency (%),Unnamed: 3
hatch,4,57.1%,
sedan,3,42.9%,

Unnamed: 0,porta,combustivel,body-style
0,duas,gas,sedan
1,quatro,diesel,hatch
2,duas,gas,hatch
3,quatro,gas,sedan
4,quatro,diesel,sedan


In [1]:
%%javascript
require.config({
paths: {
d3: "https://d3js.org/d3.v5.min"
}
});

require(["d3"], function(d3) {
window.d3 = d3;
});

<IPython.core.display.Javascript object>