In [1]:
import numpy as np
import pandas as pd

In [2]:
# Record the NumPy version this notebook was executed with.
np.__version__

'1.17.0'

In [3]:
# Record the pandas version this notebook was executed with.
pd.__version__

'0.25.1'

# Data Analysis with Numpy and Pandas

Lists have some powerful features:
1. They can hold values of different types.
2. Their elements can be changed, added, and removed.

But for data science, we often need to operate on every item in a list, and a plain `list` does not support that directly. For example, we may want to square every item in a list, or divide every item in one list by the corresponding item in another list.

```python
height = [10.73, 1.58, 1.73, 2.89, 1.79]
weight = [65.4, 59.2, 63.6, 88.4, 68.7]
```

In [4]:
# Sample measurements stored as plain Python lists of floats.
height = [10.73, 1.58, 1.73, 2.89, 1.79]
weight = [65.4, 59.2, 63.6, 88.4, 68.7]

In [5]:
# Try dividing one list by the other: plain lists do not support element-wise
# arithmetic, so this raises TypeError. The error is the point of the demo, so
# it is caught and printed instead of aborting a "Restart & Run All".
try:
    height / weight
except TypeError as exc:
    print("TypeError:", exc)

TypeError: unsupported operand type(s) for /: 'list' and 'list'

In [16]:
# A plain Python list: len() counts its elements.
my_list = list(range(1, 6))
print(my_list, type(my_list), len(my_list))

# Stacking the same list twice gives a 2-D ndarray. len() of an ndarray is the
# size of its first axis, while .shape reports every axis.
my_array = np.array([my_list] * 2)
print(my_array, type(my_array), my_array.shape, len(my_array))

# Indexing is [row, column]; a single integer selects a whole row and ":"
# keeps an entire axis.
print(my_array[0], my_array[:, 0], my_array[:, :2])
print(my_array[1], my_array[:, 1], my_array[:, 2:])

[1, 2, 3, 4, 5] <class 'list'> 5
[[1 2 3 4 5]
 [1 2 3 4 5]] <class 'numpy.ndarray'> (2, 5) 2
[1 2 3 4 5] [1 1]
[1 2 3 4 5] [2 2]


In [18]:
# Column slicing: ":" keeps every row, the second index selects the columns.
left_cols = my_array[:, :2]   # columns 0 and 1
print(left_cols)
right_cols = my_array[:, 2:]  # column 2 through the last column
print(right_cols)

[[1 2]
 [1 2]]
[[3 4 5]
 [3 4 5]]


In [28]:
# 2-D array written one row per line so the (rows, columns) layout is visible.
arr_1 = np.array(
    [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
        [10, 11, 12],
    ]
)
print(arr_1, arr_1.shape)

# .shape is a (n_rows, n_cols) tuple, so it unpacks directly into two names.
row, col = arr_1.shape
print("--")

# Transposing swaps the two axes: a (4, 3) array becomes (3, 4).
arr_1_t = arr_1.T
print(arr_1_t, arr_1_t.shape)

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]] (4, 3)
--
[[ 1  4  7 10]
 [ 2  5  8 11]
 [ 3  6  9 12]] (3, 4)


In [43]:
# 3-D array: two 3x3 blocks stacked along the first axis, so the shape is
# (2, 3, 3).
arr_3d = np.array(
    [
        [[1, 2, 12], [3, 4, 34], [3, 4, 4]],
        [[5, 6, 56], [7, 8, 78], [7, 8, 78]],
    ]
)
print(arr_3d, arr_3d.shape)

[[[ 1  2 12]
  [ 3  4 34]
  [ 3  4  4]]

 [[ 5  6 56]
  [ 7  8 78]
  [ 7  8 78]]] (2, 3, 3)


In [44]:
# An integer index removes its axis from the result; ":" or a slice keeps it.
first_block = arr_3d[0]             # the whole first block
print(first_block, first_block.shape)
print("---")
first_rows = arr_3d[:, 0, :]        # row 0 taken from every block
print(first_rows, first_rows.shape)
print("---")
second_row_tail = arr_3d[:, 1, 1:]  # row 1 of every block, columns 1 onward
print(second_row_tail, second_row_tail.shape)

[[ 1  2 12]
 [ 3  4 34]
 [ 3  4  4]] (3, 3)
---
[[ 1  2 12]
 [ 5  6 56]] (2, 3)
---
[[ 4 34]
 [ 8 78]] (2, 2)


In [46]:
# Integer index on the middle axis: that axis is dropped, leaving a 2-D result.
row_by_index = arr_3d[:, 1, 1:]
print(row_by_index, row_by_index.shape)
# Slice on the middle axis: the axis is kept (with length 1), so the result
# stays 3-D.
row_by_slice = arr_3d[:, 2:, 1:]
print(row_by_slice, row_by_slice.shape)

[[ 4 34]
 [ 8 78]] (2, 2)
[[[ 4  4]]

 [[ 8 78]]] (2, 1, 2)


In [48]:
# Four blocks, each holding two single-element rows: shape (4, 2, 1).
a = np.array(
    [
        [[1], [2]],
        [[2], [3]],
        [[3], [4]],
        [[4], [5]],
    ]
)
print(a, a.shape)

[[[1]
  [2]]

 [[2]
  [3]]

 [[3]
  [4]]

 [[4]
  [5]]] (4, 2, 1)


In [55]:
# np.longdouble is the canonical name of the extended-precision float type.
# The np.longfloat alias used here before is removed in NumPy 2.0, whereas
# np.longdouble works on old and new NumPy alike.
my_ones = np.ones((2, 3), dtype=np.longdouble)
print(my_ones, my_ones.shape)

# With no dtype given, np.zeros produces float64 values.
my_zeros = np.zeros((4, 3))
print(my_zeros, my_zeros.shape)

[[1. 1. 1.]
 [1. 1. 1.]] (2, 3)
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]] (4, 3)


In [81]:
# Random generation: seed NumPy's global generator first so that both draws
# below come out the same on every run.
np.random.seed(2)

# Two consecutive 2x4 draws from the same stream; the second continues where
# the first stopped, which is why the two arrays differ.
my_rand, my_rand2 = np.random.rand(2, 4), np.random.rand(2, 4)
for sample in (my_rand, my_rand2):
    print(sample, sample.shape)

[[0.4359949  0.02592623 0.54966248 0.43532239]
 [0.4203678  0.33033482 0.20464863 0.61927097]] (2, 4)
[[0.29965467 0.26682728 0.62113383 0.52914209]
 [0.13457995 0.51357812 0.18443987 0.78533515]] (2, 4)


In [88]:
# np.arange(start, stop, step): integer steps from 2, stopping before 20.
my_arange = np.arange(2, 20, 2)
# Float steps: the stop value (1) is excluded here as well.
# NOTE: NumPy's documentation suggests np.linspace for non-integer steps.
my_arange2 = np.arange(0, 1, 0.05)
for sequence in (my_arange, my_arange2):
    print(sequence)

[ 2  4  6  8 10 12 14 16 18]
[0.   0.05 0.1  0.15 0.2  0.25 0.3  0.35 0.4  0.45 0.5  0.55 0.6  0.65
 0.7  0.75 0.8  0.85 0.9  0.95]


## Pandas

In [8]:
# Load the CSV into a DataFrame; the relative path is resolved against the
# notebook's working directory.
data = pd.read_csv("pseudo_facebook.csv")

In [9]:
# Preview the first five rows.
data.head(5)

Unnamed: 0,userid,age,dob_day,dob_year,dob_month,gender,tenure,friend_count,friendships_initiated,likes,likes_received,mobile_likes,mobile_likes_received,www_likes,www_likes_received
0,2094382,14,19,1999,11,male,266.0,0,0,0,0,0,0,0,0
1,1192601,14,2,1999,11,female,6.0,0,0,0,0,0,0,0,0
2,2083884,14,16,1999,11,male,13.0,0,0,0,0,0,0,0,0
3,1203168,14,25,1999,12,female,93.0,0,0,0,0,0,0,0,0
4,1733186,14,4,1999,12,male,82.0,0,0,0,0,0,0,0,0


In [10]:
# Preview the last rows (tail() returns five rows when called without arguments).
data.tail()

Unnamed: 0,userid,age,dob_day,dob_year,dob_month,gender,tenure,friend_count,friendships_initiated,likes,likes_received,mobile_likes,mobile_likes_received,www_likes,www_likes_received
98998,1268299,68,4,1945,4,female,541.0,2118,341,3996,18089,3505,11887,491,6202
98999,1256153,18,12,1995,3,female,21.0,1968,1720,4401,13412,4399,10592,2,2820
99000,1195943,15,10,1998,5,female,111.0,2002,1524,11959,12554,11959,11462,0,1092
99001,1468023,23,11,1990,4,female,416.0,2560,185,4506,6516,4506,5760,0,756
99002,1397896,39,15,1974,5,female,397.0,2049,768,9410,12443,9410,9530,0,2913


In [11]:
# (number of rows, number of columns)
data.shape

(99003, 15)

In [12]:
# Column labels as a plain Python list, so they can be sliced by position.
cols = data.columns.tolist()
cols

['userid',
 'age',
 'dob_day',
 'dob_year',
 'dob_month',
 'gender',
 'tenure',
 'friend_count',
 'friendships_initiated',
 'likes',
 'likes_received',
 'mobile_likes',
 'mobile_likes_received',
 'www_likes',
 'www_likes_received']

In [13]:
# Lookup tables in both directions between a column's position and its name.
cols2id = {name: position for position, name in enumerate(cols)}
id2cols = dict(enumerate(cols))
print(cols2id)
print(id2cols)

{'userid': 0, 'age': 1, 'dob_day': 2, 'dob_year': 3, 'dob_month': 4, 'gender': 5, 'tenure': 6, 'friend_count': 7, 'friendships_initiated': 8, 'likes': 9, 'likes_received': 10, 'mobile_likes': 11, 'mobile_likes_received': 12, 'www_likes': 13, 'www_likes_received': 14}
{0: 'userid', 1: 'age', 2: 'dob_day', 3: 'dob_year', 4: 'dob_month', 5: 'gender', 6: 'tenure', 7: 'friend_count', 8: 'friendships_initiated', 9: 'likes', 10: 'likes_received', 11: 'mobile_likes', 12: 'mobile_likes_received', 13: 'www_likes', 14: 'www_likes_received'}


In [14]:
# Keep every row and only the columns at positions 1 through 6 of `cols`.
data.loc[:, cols[1:7]]

Unnamed: 0,age,dob_day,dob_year,dob_month,gender,tenure
0,14,19,1999,11,male,266.0
1,14,2,1999,11,female,6.0
2,14,16,1999,11,male,13.0
3,14,25,1999,12,female,93.0
4,14,4,1999,12,male,82.0
...,...,...,...,...,...,...
98998,68,4,1945,4,female,541.0
98999,18,12,1995,3,female,21.0
99000,15,10,1998,5,female,111.0
99001,23,11,1990,4,female,416.0


In [111]:
# NOTE(review): none of these columns appear in the `cols` list printed for
# pseudo_facebook.csv, so `data` presumably referred to a different (housing)
# dataset when this cell was run -- TODO restore the cell that loads it.
# .loc slices by label and includes both end labels (5 and 100).
data.loc[5:100, ["bedrooms", "bathrooms", "price", "lat", "long", "condition"]]

Unnamed: 0,bedrooms,bathrooms,price,lat,long,condition
5,4,4.50,1225000.0,47.6561,-122.005,3
6,3,2.25,257500.0,47.3097,-122.327,3
7,3,1.50,291850.0,47.4095,-122.315,3
8,3,1.00,229500.0,47.5123,-122.337,3
9,3,2.50,323000.0,47.3684,-122.031,3
...,...,...,...,...,...,...
96,3,1.75,247500.0,47.3576,-122.277,3
97,4,1.50,199000.0,47.3036,-122.378,4
98,3,1.75,314000.0,47.4109,-121.958,5
99,3,2.50,437500.0,47.4838,-121.714,3


In [6]:
# Print each column's non-null count and dtype, plus the frame's memory usage.
# NOTE(review): the recorded output lists 21 housing columns (price, bedrooms,
# ...), not the pseudo_facebook.csv columns -- `data` was presumably re-bound
# in a cell that is no longer in the notebook; TODO confirm.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
id               21613 non-null int64
date             21613 non-null object
price            21613 non-null float64
bedrooms         21613 non-null int64
bathrooms        21613 non-null float64
sqft_living      21613 non-null int64
sqft_lot         21613 non-null int64
floors           21613 non-null float64
waterfront       21613 non-null int64
view             21613 non-null int64
condition        21613 non-null int64
grade            21613 non-null int64
sqft_above       21613 non-null int64
sqft_basement    21613 non-null int64
yr_built         21613 non-null int64
yr_renovated     21613 non-null int64
zipcode          21613 non-null int64
lat              21613 non-null float64
long             21613 non-null float64
sqft_living15    21613 non-null int64
sqft_lot15       21613 non-null int64
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB


In [114]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
# NOTE(review): the recorded output describes housing columns, so this cell
# depends on a dataset that no visible cell loads -- TODO confirm.
data.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,4580302000.0,540088.1,3.370842,2.114757,2079.899736,15106.97,1.494309,0.007542,0.234303,3.40943,7.656873,1788.390691,291.509045,1971.005136,84.402258,98077.939805,47.560053,-122.213896,1986.552492,12768.455652
std,2876566000.0,367127.2,0.930062,0.770163,918.440897,41420.51,0.539989,0.086517,0.766318,0.650743,1.175459,828.090978,442.575043,29.373411,401.67924,53.505026,0.138564,0.140828,685.391304,27304.179631
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0
25%,2123049000.0,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.471,-122.328,1490.0,5100.0
50%,3904930000.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.5718,-122.23,1840.0,7620.0
75%,7308900000.0,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0
max,9900000000.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,98199.0,47.7776,-121.315,6210.0,871200.0


In [116]:
# One dict per record; pandas turns the shared keys into the columns.
people = [("john", 20, 171), ("josh", 30, 166)]
my_data = [
    {"name": name, "age": age, "height": height_value}
    for name, age, height_value in people
]
df = pd.DataFrame(my_data)

In [121]:
# Write the frame to disk using ";" as the field separator; index=False leaves
# the row index out of the file.
df.to_csv("mydata.csv", index=False, sep=";")

In [140]:
# Two small frames with identical column labels ('a' and 'b'), built column by
# column.
mydf = pd.DataFrame({'a': [1, 3], 'b': [2, 4]})
mydf2 = pd.DataFrame({'a': [11, 13], 'b': [12, 14]})

In [147]:
# Stack the two frames row-wise. DataFrame.append was deprecated in pandas 1.4
# and removed in 2.0; pd.concat produces the same result on every version.
# ignore_index=True renumbers the rows 0..n-1 instead of repeating each
# frame's own labels.
pd.concat([mydf, mydf2], ignore_index=True, sort=False)

Unnamed: 0,a,b
0,1,2
1,3,4
2,11,12
3,13,14


In [143]:
# Place the frames side by side. Both contribute columns named 'a' and 'b', so
# the result carries duplicate column labels.
newdf = pd.concat([mydf, mydf2], axis="columns")
newdf

Unnamed: 0,a,b,a.1,b.1
0,1,2,11,12
1,3,4,13,14


In [146]:
# Two columns of newdf are labelled "a", so this selection returns both of
# them as a DataFrame rather than a single Series.
newdf.loc[:, "a"]

Unnamed: 0,a,a.1
0,1,11
1,3,13
