# Introduction to Data Analysis in Python

## DataFrame 
### A DataFrame is a two-dimensional table of data (more dimensions can be represented with a hierarchical index). Think of each column in the table as a Series.

In [1]:
from pandas import DataFrame 

#### There are many ways of creating a DataFrame, but if you already have your data in Python, the simplest way is to pass a dictionary.

In [5]:
# Column-oriented input: each key becomes a column name and each list holds
# that column's values (four census records per city, so 12 items per list).
data = {
    "city": ["Paris"] * 4 + ["London"] * 4 + ["Rome"] * 4,
    "Year": [
        2001, 2008, 2009, 2010,  # Paris
        2001, 2006, 2011, 2015,  # London
        2001, 2006, 2009, 2012,  # Rome
    ],
    "pop": [
        2.148, 2.211, 2.234, 2.244,  # Paris
        7.322, 7.657, 8.174, 8.615,  # London
        2.547, 2.627, 2.734, 2.627,  # Rome
    ],
}
# Build the table; no index is passed, so rows get the default labels 0..11.
census = DataFrame(data=data)

### The dictionary maps each column name (a string key) to a list holding that column's values

In [6]:
census  # no index was specified, so pandas uses the default: integers starting from 0

Unnamed: 0,city,Year,pop
0,Paris,2001,2.148
1,Paris,2008,2.211
2,Paris,2009,2.234
3,Paris,2010,2.244
4,London,2001,7.322
5,London,2006,7.657
6,London,2011,8.174
7,London,2015,8.615
8,Rome,2001,2.547
9,Rome,2006,2.627


In [8]:
census.head(3)  # peek at the first three rows

Unnamed: 0,city,Year,pop
0,Paris,2001,2.148
1,Paris,2008,2.211
2,Paris,2009,2.234


In [9]:
census.tail(3)  # peek at the last three rows

Unnamed: 0,city,Year,pop
9,Rome,2006,2.627
10,Rome,2009,2.734
11,Rome,2012,2.627


In [11]:
census["city"]  # indexing a DataFrame with a column name selects that column, returned as a Series

0      Paris
1      Paris
2      Paris
3      Paris
4     London
5     London
6     London
7     London
8       Rome
9       Rome
10      Rome
11      Rome
Name: city, dtype: object

In [12]:
type(census["city"])  # wrap the expression in type(...) to confirm a single column is a Series

pandas.core.series.Series

In [13]:
census["city"] == "Paris"  # element-wise comparison: a boolean Series, True on the rows where city is Paris

0      True
1      True
2      True
3      True
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
Name: city, dtype: bool

In [14]:
# The inner comparison asks the question (it builds a boolean mask);
# the outer square brackets select the rows where the answer is True.
census[census["city"] == "Paris"]

Unnamed: 0,city,Year,pop
0,Paris,2001,2.148
1,Paris,2008,2.211
2,Paris,2009,2.234
3,Paris,2010,2.244


In [15]:
census.loc[3]  # .loc selects a row by its index label (here the default integer label 3)

city    Paris
Year     2010
pop     2.244
Name: 3, dtype: object

In [16]:
# Assigning to a new key adds a column: True for every city except London.
census["Continental"] = census["city"] != "London"

In [17]:
census  # display the frame again to see the new Continental column

Unnamed: 0,city,Year,pop,Continental
0,Paris,2001,2.148,True
1,Paris,2008,2.211,True
2,Paris,2009,2.234,True
3,Paris,2010,2.244,True
4,London,2001,7.322,False
5,London,2006,7.657,False
6,London,2011,8.174,False
7,London,2015,8.615,False
8,Rome,2001,2.547,True
9,Rome,2006,2.627,True


In [18]:
del census["Continental"]  # remove the column again; del modifies census in place

In [19]:
census  # display the frame to confirm the Continental column is gone

Unnamed: 0,city,Year,pop
0,Paris,2001,2.148
1,Paris,2008,2.211
2,Paris,2009,2.234
3,Paris,2010,2.244
4,London,2001,7.322
5,London,2006,7.657
6,London,2011,8.174
7,London,2015,8.615
8,Rome,2001,2.547
9,Rome,2006,2.627


In [22]:
# Numeric comparisons build a boolean mask too: keep the rows whose pop is below 2.2.
census[census["pop"] < 2.2]

Unnamed: 0,city,Year,pop
0,Paris,2001,2.148


### Exercise
#### - Create a DataFrame containing the census data for three cities
#### - Select the data for the year 2001. Which city had the smallest population that year?
#### - Find all the cities which had a population smaller than 2.6 million

In [25]:
# Exercise, part 2: keep only the rows recorded in 2001.
census[census["Year"] == 2001]

Unnamed: 0,city,Year,pop
0,Paris,2001,2.148
4,London,2001,7.322
8,Rome,2001,2.547


In [28]:
# Populations recorded in 2001, as a Series that keeps the original row labels.
# A single .loc[rows, column] call replaces the chained census[mask]["pop"]
# lookup: same result, but one explicit indexing step instead of two.
pop = census.loc[census["Year"] == 2001, "pop"]
pop

0    2.148
4    7.322
8    2.547
Name: pop, dtype: float64

In [34]:
pop.idxmin()  # idxmin returns the index label of the smallest value (not the smallest index)

0

In [36]:
# Look up the city on the row whose 2001 population is smallest.
# .loc[row_label, column] does the lookup in one explicit, label-based step
# instead of chaining census["city"][...].
census.loc[pop.idxmin(), "city"]

'Paris'

In [26]:
# Exercise, part 3: city names on every row whose pop is below 2.6.
# .loc[mask, column] filters rows and picks the column in a single step
# instead of chaining census["city"][mask]; a city appears once per matching row.
census.loc[census["pop"] < 2.6, "city"]

0    Paris
1    Paris
2    Paris
3    Paris
8     Rome
Name: city, dtype: object