In [None]:
"""
Data Analysis and Visualization
"""

## Data Analysis 
Data analysis is the process of
* collecting data,
* organizing it, and
* drawing helpful *conclusions* from it.

The main purpose is to find meaning in data so that the derived knowledge can be used to make informed decisions.

# Pandas
* It has two fundamental data structures (objects):
    - Series
    - DataFrame
        - Testing testing
* **Series**: stores a single column of data along with an index. Just a column.
* **DataFrame**: a collection of Series.

In [1]:
import pandas as pd
pd.__version__

'0.25.1'

In [15]:
l =[2,4,5,7]
pd.Series(l)

0    2
1    4
2    5
3    7
dtype: int64

In [13]:
data = pd.Series([1, 1, 2, 3, 5, 8, 12])
data

0     1
1     1
2     2
3     3
4     5
5     8
6    12
dtype: int64

In [14]:
data.values

array([ 1,  1,  2,  3,  5,  8, 12], dtype=int64)

In [16]:
data.index

RangeIndex(start=0, stop=7, step=1)

### 2.1.2 Series as specialized dictionaries
A dictionary is a structure that maps arbitrary keys to a set of arbitrary values, and a Series is a structure which maps typed keys to a set of typed values. We can take advantage of these similarities to create a Series from a dictionary, where the keys are the indices of the Series and the values are those associated with these indices.

In [2]:
details = {
    "Name": ["Joshua", "Precious", "Praise", "James"],
    "Age": [12,18,22,17],
    "Level": [400,300,400,100],
}
pd.Series(details)

Name     [Joshua, Precious, Praise, James]
Age                       [12, 18, 22, 17]
Level                 [400, 300, 400, 100]
dtype: object

In [7]:
new_df = pd.DataFrame(details, index=('1', '2', '3','4'))
# new_df.to_csv("JustPlaying")
new_df # The index can be anything

Unnamed: 0,Name,Age,Level
1,Joshua,12,400
2,Precious,18,300
3,Praise,22,400
4,James,17,100


In [8]:
new_df.columns

Index(['Name', 'Age', 'Level'], dtype='object')

# Attributes

1. DataFrame.head() - returns the content of the first 5 rows
2. DataFrame.tail() - returns the content of the last 5 rows
3. DataFrame.shape - returns a tuple of the form (num_rows, num_columns)
4. DataFrame.columns - returns the name of the columns
5. DataFrame.index - returns the index (row labels); it is an attribute, so it is accessed without parentheses

In [9]:
new_df.tail()

Unnamed: 0,Name,Age,Level
1,Joshua,12,400
2,Precious,18,300
3,Praise,22,400
4,James,17,100


In [12]:
new_df.index

Index(['1', '2', '3', '4'], dtype='object')

# 3. Data Indexing and Selection


### 3.1 Data Selection in Series

In [14]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=["a","b","c","d"])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [16]:
data["a"]

0.25

In [17]:
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

Note: When slicing with the explicit index, e.g. data["a":"c"], the end label ("c") is included in the result; when slicing with the implicit integer position, e.g. data[2:4], the end position is excluded.

#### 3.2.1.  *loc* attribute
Allows indexing and slicing using the explicit index

In [18]:
data.loc['a']

0.25

#### 3.2.2. *iloc* attribute
Allows indexing and slicing using the implicit index

In [19]:
data.iloc[1:3]

b    0.50
c    0.75
dtype: float64

## Exercise 1

Consider the following lists

lst1 = [1, 2, 3, 5, 8]

lst2 = [8, 5, 3, 2, 1]
1. Create and display two individual Series objects s1 and s2 from the data available on each list.
2. Perform the following operations with the two series (element-wise):
    * Add s1 and s2 and store the result in a new variable s3_add
    * Subtract s2 from s1 and store the result in a new variable s3_sub
    * Multiply s1 and s2 and store the result in a new variable s3_mul
    * Divide s1 by s2 and store the result in a new variable s3_div

In [20]:
lst1 = [1, 2, 3, 5, 8]
lst2 = [8, 5, 3, 2, 1]

In [22]:
s1 = pd.Series(lst1)
s1

0    1
1    2
2    3
3    5
4    8
dtype: int64

In [23]:
s2 = pd.Series(lst2)
s2

0    8
1    5
2    3
3    2
4    1
dtype: int64

In [25]:
s3_add = s1 + s2
s3_add

0    9
1    7
2    6
3    7
4    9
dtype: int64

# Exercise 2

Consider the following Series object:


|Index|Reviews|
|--|--|
|0|45000|
|1|37872|
|2|57923|
|3|68979|
|4|78934|
|5|69897|
|6|56701|

Name: Amazon_Reviews, dtype: int64
1. Create and display the Amazon_Reviews Series.

2. Get the last three values from Amazon_Reviews using negative indexing

In [41]:
amazon = pd.Series([45000,37872,57923,68979,78934,69897,56701])
amazon

0    45000
1    37872
2    57923
3    68979
4    78934
5    69897
6    56701
dtype: int64

In [45]:
amazon[-3:]

4    78934
5    69897
6    56701
dtype: int64

## Exercise 3
Consider the following dictionary, which maps some US states to their area in square units:

    area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
             
1. Create a Series using the given dictionary
2. Extract areas for 'Texas', 'New York', and 'Florida' from the created series

In [46]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
print(area_dict)

{'California': 423967, 'Texas': 695662, 'New York': 141297, 'Florida': 170312, 'Illinois': 149995}


In [55]:
series=pd.Series(area_dict)
series

In [57]:
series.loc["Texas":"Florida"]

Texas       695662
New York    141297
Florida     170312
dtype: int64


### 3.3. Data Selection in DataFrame
Let's see in detail how to access the elements of **DataFrame** objects. First, we define a new DataFrame for explanatory purposes:

In [3]:
area = pd.Series({'California': 423967, 'Texas': 695662,
'New York': 141297, 'Florida': 170312,
'Illinois': 149995})

pop = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127, 'Florida': 19552860,
'Illinois': 12882135})

data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [68]:
data["pop"]

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: pop, dtype: int64

In [22]:
data.area # Equivalent to data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

How to create a new column

In [23]:
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


As you can see, when the Series 'pop' is accessed and divided by the Series 'area', the arithmetic operation is performed element-wise and the result is assigned to the new Series 'density', which becomes the third column of the **DataFrame** data.

In [7]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

## 3.4. Indexers: loc, iloc for DataFrame
We are going to review two ways to access the elements of a DataFrame, using two attributes determined for this: .loc[] and .iloc[]

## Exercise 4
Consider below DPhi Bootcamp's information about different batches:

Total_Candidates = {'absolute_beginners': 785, 'beginners': 825, 'intermediat_advanced': 602} # this is true data

Active_Candidates = {'absolute_beginners': 500, 'beginners': 425, 'intermediat_advanced': 300}  # this is hypothetical data

1. Create a Pandas DataFrame using above information (name your Dataframe as DPhi)
2. Get all the columns in DPhi.
3. Get the information of total candidates present in each batch using dictionary-style indexing.
4. Find the number of candidates in each batch who are not active and add this information to the dataframe DPhi.
5. Also, find the percentage of candidates that are active in each batch and add this information to the DPhi dataframe (hint: $percent = (active / total) \times 100$)
6. Get all the batches where the percentage of active candidates is greater than 60%

In [8]:
Total_Candidates = {'absolute_beginners': 785, 'beginners': 825, 'intermediat_advanced': 602}

Active_Candidates = {'absolute_beginners': 500, 'beginners': 425, 'intermediat_advanced': 300} 

In [11]:
DPhi = pd.DataFrame({"Total_Candidates":Total_Candidates, 'Active_Candidates':Active_Candidates})
DPhi

Unnamed: 0,Total_Candidates,Active_Candidates
absolute_beginners,785,500
beginners,825,425
intermediat_advanced,602,300


In [12]:
DPhi.columns

Index(['Total_Candidates', 'Active_Candidates'], dtype='object')

In [13]:
DPhi["Total_Candidates"]  # dictionary-style (bracket) indexing, as the exercise asks; same result as DPhi.Total_Candidates

absolute_beginners      785
beginners               825
intermediat_advanced    602
Name: Total_Candidates, dtype: int64

In [15]:
DPhi["Not_Active"] = DPhi["Total_Candidates"] - DPhi["Active_Candidates"]
DPhi

Unnamed: 0,Total_Candidates,Active_Candidates,Nor_Active,Not_Active
absolute_beginners,785,500,285,285
beginners,825,425,400,400
intermediat_advanced,602,300,302,302


In [18]:
DPhi["Active_percent"] = (DPhi["Active_Candidates"] / DPhi["Total_Candidates"]) * 100
DPhi

Unnamed: 0,Total_Candidates,Active_Candidates,Nor_Active,Not_Active,Active_percent
absolute_beginners,785,500,285,285,63.694268
beginners,825,425,400,400,51.515152
intermediat_advanced,602,300,302,302,49.833887


In [21]:
# NOTE(review): this yields only the boolean mask; the exercise asks for the batches
# themselves, which would need the mask applied, e.g. DPhi[DPhi["Active_percent"] > 60].
DPhi["Active_percent"] > 60

absolute_beginners       True
beginners               False
intermediat_advanced    False
Name: Active_percent, dtype: bool


## 3.5. Subsetting a Dataframe
*Subsetting* a DataFrame is a way of filtering that allows you to extract portions of interest. Subsetting can be done using comparison operators and logical operators inside a pair of square brackets [], as shown in the following example:

In [24]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [32]:
data[data.index == "California"]

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926


In [31]:
data.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

###### Use the & sign for 'and' and the '|' sign for 'or'

In [33]:
data[(data['density'] < 90) | (data['density'] > 120)]

Unnamed: 0,area,pop,density
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Illinois,149995,12882135,85.883763


TypeError: check() missing 1 required positional argument: 'name'

## 4. Data Wrangling
The difference between data found in many tutorials and data from the real world is that real-world data is rarely clean and homogeneous. In particular, many interesting datasets will have some amount of data missing. To make matters even more complicated, different data sources may indicate missing data in different ways.

In this way, we need to define methods that allow us to structure, clean and enrich the data acquired from the real world, which are the main steps for Data Wrangling. Before continuing, let's see what is the difference between these three steps and expand their definition:

1. **Data structuring:**

The first step in the data wrangling process is to separate the relevant data into multiple columns, so that the analysis can be run grouping by common values in a separate way. In turn, if there are columns that are not desired or that will not be relevant to the analysis, this is the phase to filter the data or mix together some of their columns.

2. **Data Cleaning**

In this step, the data is cleaned up for high-quality analysis. Null values are handled, and the data format is standardized. We will enter this process in the following weeks.

3. **Data Enriching**

After cleaning, the data is enriched by increasing some variables in what is known as *Data Augmentation* and using additional sources to enrich them for the following stages of processing.

For now, we will review how to handle missing values, a fundamental step for data cleaning.

## 5. Handling Missing Data
This is a fundamental step in data cleaning. It is common that during the data acquisition processes, there are lost records, either due to the difficulties of acquiring them, due to errors in the source or destination, or because we simply could not acquire the data. There are three types of missing data:

- Missing completely at random (MCAR): when the fact that the data is missing is independent of the observed and unobserved data.
- Missing at random (MAR): when the fact that the data is missing is systematically related to the observed but not the unobserved data.
- Missing not at random (MNAR): when the missingness of data is related to events or factors which are not measured by the researcher.

We will go into these types in detail later. For now, we'll look at the fundamentals of handling missing data in pandas:

### 5.1. NaN and None in Pandas
Missing data is handled in Pandas with *NaN* placeholder values. NaN is the IEEE 754 floating-point representation of Not a Number. One of the main reasons to handle missing data as NaN rather than None in Pandas is that NaN (from np.nan) allows for vectorized operations, since it is a float value. None, by definition, forces the object dtype, which basically disables all efficiency in Numpy and Pandas.

NaN and None are handled nearly interchangeably by Pandas, converting between them where appropriate:

In [39]:
import numpy as np

pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [40]:
np.nan

nan

### 5.2. Operations on Missing Values
There are several useful methods for detecting, removing, and replacing missing values in Pandas data structures:

* isnull(): generates a boolean mask indicating missing values
* notnull(): generates a boolean mask of non-missing values. Is the opposite of isnull().
* dropna(): returns a filtered version of the data, without missing values.
* fillna(): returns a copy of the data with missing values filled or imputed with a desired strategy.

Let's review some examples of the first two functions, `isnull()` and `notnull()`:

In [52]:
data = pd.Series([1, np.nan, 'hello', None])
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [48]:
data.isnull() # returns True where the value is missing (NaN or None), False for a real value

0    False
1     True
2    False
3     True
dtype: bool

In [46]:
data.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [53]:
data[data.notnull()]

0        1
2    hello
dtype: object

### 5.3. Dropping missing values
The basic function to remove any missing values from a Series object is as follows, although the function is not executed inplace:

In [54]:
data.dropna()

0        1
2    hello
dtype: object

In [55]:
data

0        1
1      NaN
2    hello
3     None
dtype: object

There are more options for DataFrame

In [56]:
df = pd.DataFrame([[1, np.nan, 2],
[2, 3, 5],
[np.nan, 4, 6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [58]:
df.dropna() # it drops(deletes) every row with any missing value

Unnamed: 0,0,1,2
1,2.0,3.0,5


Alternatively, you can drop missing values along a different axis; axis=1 drops all columns containing a missing value:

In [59]:
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


But this drops some good data as well; you might rather be interested in dropping rows or columns with all *NaN* values, or a majority of NaN values. This can be specified through the *how* or *thresh* parameters, which allow fine control of the number of nulls to allow through.

The default is *how='any'*, such that any row or column (depending on the axis keyword) containing a null value will be dropped. You can also specify *how='all'*, which will only drop rows/columns that are all null values:

In [64]:
df = pd.DataFrame([[1, np.nan, 2, np.nan],
[2, 3, 5, np.nan],
[np.nan, 4, 6, np.nan]])
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [69]:
df.dropna(axis='columns', how='all') # the column with *all* missing value should be removed

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


### 5.4. Filling null values
Sometimes rather than dropping NaN values, you'd rather replace them with a valid value. This value might be a single number like zero, or it might be some sort of imputation or interpolation from the good values.

There are four types of treatment that can be given, in that order, to unwanted non-existent or missing data:

1. **Treatment 1:** Ignore the missing or unwanted data in some columns, considering that in other columns of the same rows there are important or relevant data for the study.
2. **Treatment 2:** Replace the missing or unwanted data with values that represent an indicator of nullity.
3. **Treatment 3:** Replace the missing, nonexistent or unwanted data with interpolated values that are related to the trend of the data that is present.
4. **Treatment 4:** Delete the missing data, with the certainty that valuable information will not be lost when analyzing the data.
    
You can apply **Treatment 2** and **Treatment 3** in-place using the `isnull()` method as a mask, but because it is such a common operation Pandas provides the `fillna()` method, which returns a copy of the array with the missing values replaced.

Consider the following Series:

In [70]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [72]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [73]:
# forward-fill: each missing value is replaced with the last valid value before it
data.ffill()  # same result as fillna(method='ffill'), whose `method` argument is deprecated in newer pandas

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [74]:
# back-fill: each missing value is replaced with the next valid value after it
data.bfill()  # same result as fillna(method='bfill'), whose `method` argument is deprecated in newer pandas

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

For DataFrame objects the options are similar, but we can also specify an axis along which the fills take place:

In [76]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [81]:
df.ffill(axis=0)  # same result as fillna(method='ffill', axis=0); `method` is deprecated in newer pandas
# Notice that if a previous value is not available 
# during a forward fill, the NaN value remains.

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,2.0,4.0,6,


### 6. Pandas String Operations
When a **Pandas** object stores string data, Pandas provides certain operations to facilitate its manipulation. Let's see what would happen if a classic data storage structure like a list had missing data and a string operation was executed. Firstly, we define a list with four string values and one missing value (`None`):

In [88]:
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']

In [85]:
# None has no .capitalize(), so the missing entry must be skipped explicitly;
# calling the method on it would raise AttributeError.
[name.capitalize() for name in data if name is not None]

['Peter', 'Paul', 'Mary', 'Guido']

In [89]:
import pandas as pd
names = pd.Series(data)
names

0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object

In [91]:
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

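We have used the `str` accessor, which applies string methods element-wise to the values stored in the Series and leaves missing values (such as `None`) untouched instead of raising an error.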

### 6.1. String Methods
Here is a list of Pandas **str** methods that mirror Python string methods:

    len()	lower()	translate()	islower()
    ljust()	upper()	startswith()	isupper()
    rjust()	find()	endswith()	isnumeric()
    center()	rfind()	isalnum()	isdecimal()
    zfill()	index()	isalpha()	split()
    strip()	rindex()	isdigit()	rsplit()
    rstrip()	capitalize()	isspace()	partition()
    lstrip()	swapcase()	istitle()	rpartition()

Let's see some examples of string methods for Pandas **Series** with the **monte** Series:

In [92]:
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
'Eric Idle', 'Terry Jones', 'Michael Palin'])
monte

0    Graham Chapman
1       John Cleese
2     Terry Gilliam
3         Eric Idle
4       Terry Jones
5     Michael Palin
dtype: object

In [95]:
monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [96]:
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

## Exercise 5
Consider the following lists:

country = ['Netherland', 'Germany', 'Peru', 'Israel', 'Madagascar']
year = [2002, 2002, 1957, 2007, 1967]
population = [16122830.0, np.nan, 9146100.0, 6426679.0, 6334556.0]
continent = ['Europe', 'europe', 'Americas', 'asia', 'Africa']

1. Create a Dataframe object which contains all the lists values as Series. The final DataFrame should be named as country_info, containing 4 columns and 5 rows.
2. Delete the rows which contains missing values
3. Capitalize all the continents in continent column.
4. Get the length of each country's names.

In [108]:
country = ['Netherland', 'Germany', 'Peru', 'Israel', 'Madagascar']
year = [2002, 2002, 1957, 2007, 1967]
population = [16122830.0, np.nan, 9146100.0, 6426679.0, 6334556.0]
continent = ['Europe', 'europe', 'Americas', 'asia', 'Africa']

In [102]:
# NOTE(review): the exercise asks for 4 columns, but `year` is not passed here,
# so country_info ends up with only country, population and continent.
country_info = pd.DataFrame({'country':country, "population":population, "continent":continent})
country_info

Unnamed: 0,country,population,continent
0,Netherland,16122830.0,Europe
1,Germany,,europe
2,Peru,9146100.0,Americas
3,Israel,6426679.0,asia
4,Madagascar,6334556.0,Africa


In [107]:
country_info.dropna()

Unnamed: 0,country,population,continent
0,Netherland,16122830.0,Europe
2,Peru,9146100.0,Americas
3,Israel,6426679.0,asia
4,Madagascar,6334556.0,Africa


In [115]:
pd.Series(continent).str.title()


0      Europe
1      Europe
2    Americas
3        Asia
4      Africa
dtype: object

Note: A plain Python list has no `.str` accessor, so we first convert the list to a pandas Series before using the `.str` methods.

In [116]:
pd.Series(country).str.len()

0    10
1     7
2     4
3     6
4    10
dtype: int64

### 7. Concatenate Series
Here we'll take a look at simple concatenation of Series and DataFrame objects with the pd.concat() function:

In [117]:
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2], axis=0)

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [118]:
pd.concat([ser1, ser2], axis=1)

Unnamed: 0,0,1
1,A,
2,B,
3,C,
4,,D
5,,E
6,,F


By default, the concatenation takes place row-wise within the DataFrame (i.e., axis=0). However, as you can see, the concatenation of Series in a DataFrame can be done in contiguous rows or columns by specifying the axis parameter. In the case where they are columns, care must be taken to define the same index values, so that the columns are placed contiguously without NaN values.


### Exercise 6
Consider the following lists:

country = ['Netherland', 'Germany', 'Peru', 'Israel', 'Madagascar']

gdp_per_cap = [33724.757780, 30035.801980, 4245.256698, 25523.277100, 1634.047282]

1. Create a Dataframe object which contains all the lists values as Series. The final DataFrame should be named as country_gdp, containing 2 columns and 5 rows.
2. Concatenate the two dataframes: country_info and country_gdp with axis=0 and name it concat_data
3. Check if there are any null values in concat_data
4. Find the total number of missing values in each column. Hint: use the `.isnull()` and `.sum()` functions

In [120]:
country = ['Netherland', 'Germany', 'Peru', 'Israel', 'Madagascar']
gdp_per_cap = [33724.757780, 30035.801980, 4245.256698, 25523.277100, 1634.047282]


In [121]:
country_gdp = pd.DataFrame({'country':country, 'gdp_per_cap':gdp_per_cap})
country_gdp

Unnamed: 0,country,gdp_per_cap
0,Netherland,33724.75778
1,Germany,30035.80198
2,Peru,4245.256698
3,Israel,25523.2771
4,Madagascar,1634.047282


In [129]:
# NOTE(review): the exercise asks to concatenate the DataFrames country_info and
# country_gdp; this instead stacks the two raw lists as Series, which gives a single
# object-dtype Series with the index labels 0-4 repeated.
concat_data = pd.concat([pd.Series(country), pd.Series(gdp_per_cap)], axis=0)
concat_data

0    Netherland
1       Germany
2          Peru
3        Israel
4    Madagascar
0       33724.8
1       30035.8
2       4245.26
3       25523.3
4       1634.05
dtype: object

Note: `pd.concat()` only works on pandas objects, so plain lists must first be converted to pandas Series (or DataFrames) before they can be concatenated.

In [127]:
concat_data.isnull()

0    False
1    False
2    False
3    False
4    False
0    False
1    False
2    False
3    False
4    False
dtype: bool

In [128]:
concat_data.isnull().sum()  # count missing values; a bare .sum() fails on this mixed str/float Series

0

### 8. DataFrame fancy table printing
In the next two cells we are going to define a fancy way to visualize the data of multiple DataFrame objects. Let's first use the IPython.display library, which allows us to view the contents of DataFrame objects individually as nicely formatted tables:

In [130]:
from IPython.display import display, HTML

display(pd.concat([ser1, ser2], axis=1))

Unnamed: 0,0,1
1,A,
2,B,
3,C,
4,,D
5,,E
6,,F


In [172]:
class display(object):
    """Display HTML representation of multiple objects

    Each positional argument is a string holding a Python expression
    (for example ``'df1'`` or ``'pd.concat([df1, df2])'``). The expression
    text is used as the caption and the expression is evaluated with
    ``eval`` to obtain the object that is rendered.

    NOTE: this class reuses the name ``display`` that the notebook imports
    from ``IPython.display`` earlier, so after this definition the name
    refers to this class instead.

    NOTE(review): the arguments are passed to ``eval``; only pass
    expressions you wrote yourself, never untrusted text.
    """
    # HTML wrapper for one object: {0} receives the expression text (caption),
    # {1} receives the HTML produced by the evaluated object.
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
    def __init__(self, *args):
        """Store the expression strings to be rendered later."""
        self.args = args

    def _repr_html_(self):
        """Return one HTML fragment per expression, joined by newlines.

        Every evaluated object must provide its own ``_repr_html_()``;
        the attribute is accessed unconditionally.
        """
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
        for a in self.args)

    def __repr__(self):
        """Return the plain-text form: expression text, a newline, then the
        evaluated object's ``repr``; entries are separated by a blank line."""
        return '\n\n'.join(a + '\n' + repr(eval(a))
        for a in self.args)

In [173]:
def make_df(cols, ind):
    """Build a small demo DataFrame whose cells spell out their own position.

    Parameters
    ----------
    cols : iterable
        Column labels; iterating a string such as ``'ABC'`` yields one
        column per character.
    ind : iterable
        Row labels, also used as the index of the result.

    Returns
    -------
    pandas.DataFrame
        Frame in which the cell at row ``i`` / column ``c`` holds the
        string ``str(c) + str(i)`` (e.g. ``'A0'``).
    """
    def _column_cells(col):
        # One label per row: column label followed by row label.
        return ["".join((str(col), str(row))) for row in ind]

    cells = {}
    for col in cols:
        cells[col] = _column_cells(col)
    return pd.DataFrame(cells, index=ind)

# example DataFrame
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [177]:
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [178]:
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
display('df1', 'df2', 'pd.concat([df1, df2])')

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4
