# Summary Functions and Maps

In [3]:
import pandas as pd

def currency_to_int(curr):
    curr_list = curr[1:].split(",")
    price = int("".join(curr_list))
    return price

# transform prices
atl_addr = pd.read_csv("./atl-address-1.csv")
for i in range(len(atl_addr)):
    atl_addr.loc[i, "Price"] = currency_to_int(atl_addr.loc[i, "Price"])
    
atl_addr.head()

Unnamed: 0,Title,Price,Beds,Baths,Area
0,"34 The Prado NE, Atlanta, GA 30309",1495000,4 bds,4 ba,"3,644 sqft"
1,"2060 Shirley St SW, Atlanta, GA 30311",225000,3 bds,2 ba,"1,300 sqft"
2,"300 Peachtree St NE APT 11G, Atlanta, GA 30308",259000,2 bds,2 ba,890 sqft
3,"1690 Memorial Dr SE, Atlanta, GA 30317",320000,2 bds,1 ba,"1,163 sqft"
4,"6253 Old Kingston Dr, South Fulton, GA",349275,5 bds,3 ba,-- sqft


## Summary Functions
restructure the data in some useful way.

In [7]:
# get infomation about a certain column
# type-aware, meaning that its output changes based on the data type of the input.
atl_addr.Price.describe()

count         40
unique        36
top       550000
freq           2
Name: Price, dtype: int64

In [67]:
# get mean
atl_addr.Price.mean()

# get median
atl_addr.Price.median()

# get mode(众数)
atl_addr.Price.mode()

# get variance
atl_addr.Price.var()

3112307164953.6597

In [59]:
# get unique value
atl_addr.Beds.unique()

array(['4 bds', '3 bds', '2 bds', '5 bds', '1 bd', '6 bds', '7 bds',
       '8,002 sqft lot'], dtype=object)

In [10]:
# count frequency of different values
atl_addr.Beds.value_counts()

3 bds             12
2 bds              8
1 bd               6
4 bds              6
5 bds              5
7 bds              1
6 bds              1
8,002 sqft lot     1
Name: Beds, dtype: int64

## Maps
A function that takes one set of values and "maps" them to another set of values.

In [23]:
# remean some values(将均值修正为0)
origin_mean_price = atl_addr.Price.mean()

# return a new modified series, original DataFrame unchanged
# using map()
atl_addr.Price.map(lambda x: x - origin_mean_price).head(10)

0    773460.075
1   -496539.925
2   -462539.925
3   -401539.925
4   -372264.925
5   -436639.925
6   -372264.925
7   -471639.925
8   -263639.925
9    713460.075
Name: Price, dtype: float64

In [38]:
# using apply()
# could be used to change the DataFrame itself
def remean_price(row):
    row.Price = row.Price - origin_mean_price
    return row

atl_addr.apply(remean_price, axis = 1).head(10)

Unnamed: 0,Title,Price,Beds,Baths,Area
0,"34 The Prado NE, Atlanta, GA 30309",51920.15,4 bds,4 ba,"3,644 sqft"
1,"2060 Shirley St SW, Atlanta, GA 30311",-1218079.85,3 bds,2 ba,"1,300 sqft"
2,"300 Peachtree St NE APT 11G, Atlanta, GA 30308",-1184079.85,2 bds,2 ba,890 sqft
3,"1690 Memorial Dr SE, Atlanta, GA 30317",-1123079.85,2 bds,1 ba,"1,163 sqft"
4,"6253 Old Kingston Dr, South Fulton, GA",-1093804.85,5 bds,3 ba,-- sqft
5,"2870 Pharr South Court Ct NW # 205, Atlanta, G...",-1158179.85,2 bds,2 ba,-- sqft
6,"6253 Old Kingston Dr # 37, Atlanta, GA 30331",-1093804.85,5 bds,3 ba,-- sqft
7,"943 Peachtree St NE UNIT 1415, Atlanta, GA 30309",-1193179.85,1 bd,1 ba,768 sqft
8,"175 Wynfield Way SW, Atlanta, GA 30331",-985179.85,4 bds,4 ba,"3,624 sqft"
9,"2973 Margaret Mitchell Ct NW, Atlanta, GA 30327",-8079.85,5 bds,5 ba,"4,483 sqft"


In [34]:
atl_addr.Price.head()

0    773460
1   -496540
2   -462540
3   -401540
4   -372265
Name: Price, dtype: object

**Note that map() and apply() return new, transformed Series and DataFrames, respectively.** 

In [53]:
# pandas built-in mapping operations, much simpler
atl_addr = pd.read_csv("./atl-address-1.csv")
for i in range(len(atl_addr)):
    atl_addr.loc[i, "Price"] = currency_to_int(atl_addr.loc[i, "Price"])

# remean, using "-"
origin_mean_price = atl_addr.Price.mean()
new_prices = atl_addr.Price - origin_mean_price    # multiple values substract a single value
new_prices.head(10)

0    773460
1   -496540
2   -462540
3   -401540
4   -372265
5   -436640
6   -372265
7   -471640
8   -263640
9    713460
Name: Price, dtype: object

In [55]:
# get combination, using "+"
combs = atl_addr.Baths + " & " + atl_addr.Beds     # multiple values to multiple values
combs.head(5)

0    4 ba & 4 bds
1    2 ba & 3 bds
2    2 ba & 2 bds
3    1 ba & 2 bds
4    3 ba & 5 bds
dtype: object