In [1]:
print('Hello Notebook')

Hello Notebook


## numpy

In [2]:
# loading numpy library
import numpy as np

In [3]:
# generate a 3x3 array of random samples
# NOTE: no random seed is set before this call, so the values
# (and every output derived from them) differ on each run
data = np.random.randn(3, 3)

In [4]:
data

array([[-0.13864309,  0.88135215, -1.13428807],
       [-0.38385624,  1.78930114, -0.74583598],
       [ 0.2943901 ,  0.04100402,  2.51669631]])

In [5]:
# shape is a tuple of integers giving the size
# of the array along each dimension
# for a matrix with n rows and m columns, the shape
# is (n, m)
data.shape

(3, 3)

In [6]:
# element-wise addition
# the result is a new array; data itself is not modified
data + data

array([[-0.27728617,  1.76270431, -2.26857614],
       [-0.76771247,  3.57860227, -1.49167196],
       [ 0.58878019,  0.08200805,  5.03339263]])

In [7]:
# element-wise product (each element multiplied by itself),
# not the matrix product
# the result is a new array; data itself is not modified
data * data

array([[1.92219056e-02, 7.76781621e-01, 1.28660942e+00],
       [1.47345610e-01, 3.20159855e+00, 5.56271306e-01],
       [8.66655294e-02, 1.68133002e-03, 6.33376033e+00]])

In [8]:
# matrix product
# a new array is created
# compare with the element-wise product: the results are different
np.dot(data, data)

array([[-0.65301379,  1.40830065, -3.35474155],
       [-0.85318211,  2.83270376, -2.77616427],
       [ 0.68433567,  0.43602457,  5.96925488]])

In [9]:
# element-wise division: the scalar 1 is divided by every element
# (the element-wise reciprocal, not the matrix inverse)
# a new array is created
1 / data

array([[-7.21276496,  1.13462025, -0.88161026],
       [-2.60514199,  0.55887742, -1.34077737],
       [ 3.39685339, 24.38785003,  0.39734631]])

In [10]:
# a convenient notation for matrix transpose
# NumPy returns a transposed view of the same data rather than a copy;
# data itself keeps its original layout
data.T

array([[-0.13864309, -0.38385624,  0.2943901 ],
       [ 0.88135215,  1.78930114,  0.04100402],
       [-1.13428807, -0.74583598,  2.51669631]])

In [11]:
# verify that the original data is unchanged after the transpose
data

array([[-0.13864309,  0.88135215, -1.13428807],
       [-0.38385624,  1.78930114, -0.74583598],
       [ 0.2943901 ,  0.04100402,  2.51669631]])

In [12]:
# another notation for matrix transpose
# the result is the same as data.T
np.transpose(data)

array([[-0.13864309, -0.38385624,  0.2943901 ],
       [ 0.88135215,  1.78930114,  0.04100402],
       [-1.13428807, -0.74583598,  2.51669631]])

In [13]:
# more linear algebra
# inverse of a square matrix
# a new array is created
# NOTE: np.linalg.inv raises LinAlgError when the matrix is singular
np.linalg.inv(data)

array([[ 7.03239668, -3.5127101 ,  2.1285278 ],
       [ 1.15789578, -0.02326605,  0.51497461],
       [-0.84147869,  0.4112777 ,  0.13997178]])

In [14]:
# math functions such as np.sin, np.cos and np.exp
# operate element-wise on the array
# a new array is created
np.exp(data)

array([[ 0.87053868,  2.41416182,  0.32165103],
       [ 0.68122936,  5.98526811,  0.4743376 ],
       [ 1.34230743,  1.0418563 , 12.38760423]])

In [15]:
data

array([[-0.13864309,  0.88135215, -1.13428807],
       [-0.38385624,  1.78930114, -0.74583598],
       [ 0.2943901 ,  0.04100402,  2.51669631]])

In [16]:
# basic indexing
# indices start from 0
# element at the second row and third column
# one integer per axis selects a single scalar value, not an array
data[1, 2]

-0.7458359778075954

In [18]:
# slicing
# obtain a subset of the array, similar to
# slicing a Python list
# rows from index 1 (included) to index 3 (excluded),
# columns from index 0 (included) to index 3 (excluded)
# unlike list slicing, the result is a view on data, not a copy
data[1:3, 0:3]

array([[-0.38385624,  1.78930114, -0.74583598],
       [ 0.2943901 ,  0.04100402,  2.51669631]])

In [20]:
# advanced indexing
# indexed by an integer array
idx = np.array([0, 1])
print(idx)
# select the first and second rows of the array
# (advanced indexing returns a copy, not a view)
data[idx]

[0 1]


array([[-0.13864309,  0.88135215, -1.13428807],
       [-0.38385624,  1.78930114, -0.74583598]])

In [21]:
# select the first and second columns of the array, keeping all rows
data[:, idx]

array([[-0.13864309,  0.88135215],
       [-0.38385624,  1.78930114],
       [ 0.2943901 ,  0.04100402]])

In [22]:
# advanced indexing
# indexed by a boolean array
# set all the negative values in data to 0
# NOTE: this assignment modifies data in place, so every later
# cell sees the changed values
data[data < 0] = 0

In [23]:
# check that data was changed:
# all values are now greater than or equal to 0
data

array([[0.        , 0.88135215, 0.        ],
       [0.        , 1.78930114, 0.        ],
       [0.2943901 , 0.04100402, 2.51669631]])

In [21]:
# functions for descriptive statistics,
# such as sum, mean, median, std, max, min
# by default, np.mean computes the mean of
# the flattened array, i.e. over all elements
np.mean(data)

0.61363819

In [24]:
# axis along which the means are computed
# for a 2-dimensional array, there are two axes:
# axis 0: running vertically downwards across rows,
# axis 1: running horizontally across columns
# axis=1 therefore gives one mean per row
np.mean(data, axis=1)

array([0.29378405, 0.59643371, 0.95069681])

In [25]:
# reading text data

# StringIO wraps a string so that it behaves like a file object
from io import StringIO
c = StringIO("0 1\n2 3")

# np.loadtxt(fname, ...)
# fname can be a file object or a filename
# here each text line becomes one row of floating-point values
np.loadtxt(c)

array([[0., 1.],
       [2., 3.]])

## pandas

In [4]:
# loading pandas library
import pandas as pd
import numpy as np

In [5]:
# create a Series object from a Python list
# no index is passed, so a default integer index starting at 0 is used
obj1 = pd.Series([4, 7, -5, 3])

In [6]:
obj1

0    4
1    7
2   -5
3    3
dtype: int64

In [7]:
obj1.values

array([ 4,  7, -5,  3], dtype=int64)

In [8]:
# an Index object is immutable and is responsible for
# holding the axis labels and other metadata
obj1.index

RangeIndex(start=0, stop=4, step=1)

In [9]:
# a Series with an explicit index of string labels,
# one label per value
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])

In [10]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [11]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [12]:
# indexing: select a single value by its label
obj2['a']

-5

In [13]:
# selection by a list of labels (this is not a slice)
# the result follows the order of the requested labels
obj2[['a', 'b', 'd']]

a   -5
b    7
d    4
dtype: int64

In [14]:
# boolean selection: keep only the entries whose value is less than 5
obj2[obj2 < 5]

d    4
a   -5
c    3
dtype: int64

In [15]:
# build a DataFrame from a Python dictionary:
# every key becomes a column name and its list holds the column values
# NOTE(review): 'KARACHI' is not a city in China, yet a later cell labels
# every row with country 'China' -- confirm the intended value
data = dict(
    city=['Beijing', 'Beijing', 'KARACHI',
          'Shanghai', 'Shanghai', 'Shanghai'],
    year=[1990, 2000, 2010, 1990, 2000, 2010],
    population=[10.8, 13.6, 19.6, 13.3, 16.4, 23.0],
)
frame = pd.DataFrame(data=data)

In [16]:
# show the first 5 rows
frame.head()

Unnamed: 0,city,year,population
0,Beijing,1990,10.8
1,Beijing,2000,13.6
2,KARACHI,2010,19.6
3,Shanghai,1990,13.3
4,Shanghai,2000,16.4


In [17]:
frame['population']

0    10.8
1    13.6
2    19.6
3    13.3
4    16.4
5    23.0
Name: population, dtype: float64

In [18]:
frame.year

0    1990
1    2000
2    2010
3    1990
4    2000
5    2010
Name: year, dtype: int64

In [19]:
frame.columns

Index(['city', 'year', 'population'], dtype='object')

In [20]:
# add a new column in place; the single string is repeated for every row
frame['country'] = 'China'

In [21]:
frame

Unnamed: 0,city,year,population,country
0,Beijing,1990,10.8,China
1,Beijing,2000,13.6,China
2,KARACHI,2010,19.6,China
3,Shanghai,1990,13.3,China
4,Shanghai,2000,16.4,China
5,Shanghai,2010,23.0,China


In [22]:
frame.values

array([['Beijing', 1990, 10.8, 'China'],
       ['Beijing', 2000, 13.6, 'China'],
       ['KARACHI', 2010, 19.6, 'China'],
       ['Shanghai', 1990, 13.3, 'China'],
       ['Shanghai', 2000, 16.4, 'China'],
       ['Shanghai', 2010, 23.0, 'China']], dtype=object)

In [23]:
# sort the rows by the values of a column (ascending by default)
# a new, sorted DataFrame is returned; frame itself is not reordered
frame.sort_values(by='population')

Unnamed: 0,city,year,population,country
0,Beijing,1990,10.8,China
3,Shanghai,1990,13.3,China
1,Beijing,2000,13.6,China
4,Shanghai,2000,16.4,China
2,KARACHI,2010,19.6,China
5,Shanghai,2010,23.0,China


In [24]:
frame['population'].describe()

count     6.000000
mean     16.116667
std       4.519476
min      10.800000
25%      13.375000
50%      15.000000
75%      18.800000
max      23.000000
Name: population, dtype: float64

In [25]:
frame['population'].sum()

96.69999999999999

In [26]:
frame['population'].mean()

16.116666666666664

In [27]:
frame['population'].std()

4.519476370849467

In [28]:
obj3 = pd.Series([4, np.nan, -5, 3])

In [29]:
obj3

0    4.0
1    NaN
2   -5.0
3    3.0
dtype: float64

In [30]:
# fill the missing values (NaN) with the mean of the Series
# mean() skips NaN, so it is the mean of the observed values only
# fillna returns a new Series,
# so assign it back to obj3
obj3 = obj3.fillna(obj3.mean())

In [56]:
obj3

0    4.000000
1    0.666667
2   -5.000000
3    3.000000
dtype: float64

In [57]:
obj4 = pd.Series([1, 2, np.nan, 4])

In [58]:
# fill the missing values by linear interpolation between neighbours
# interpolate returns a new Series in which only the NaN entries are
# replaced, so wrapping it in fillna is unnecessary
# assign the result back to obj4
obj4 = obj4.interpolate(method='linear')

In [59]:
obj4

0    1.0
1    2.0
2    3.0
3    4.0
dtype: float64

In [60]:
# remove the rows whose 'city' value duplicates an earlier row
# the first observed row of each city is kept by default
# a new DataFrame is returned
frame.drop_duplicates('city')

Unnamed: 0,city,year,population,country
0,Beijing,1990,10.8,China
2,KARACHI,2010,19.6,China
3,Shanghai,1990,13.3,China


In [61]:
# add a derived column: the natural logarithm of population,
# computed element-wise
frame['log_population'] = np.log(frame['population'])

In [62]:
frame

Unnamed: 0,city,year,population,country,log_population
0,Beijing,1990,10.8,China,2.379546
1,Beijing,2000,13.6,China,2.61007
2,KARACHI,2010,19.6,China,2.97553
3,Shanghai,1990,13.3,China,2.587764
4,Shanghai,2000,16.4,China,2.797281
5,Shanghai,2010,23.0,China,3.135494


In [58]:
# normalise the city names to lower case
# str.lower() returns a new Series,
# so assign it back to the column of the original DataFrame
frame['city'] = frame['city'].str.lower()

In [59]:
frame

Unnamed: 0,city,year,population,country,log_population
0,beijing,1990,10.8,China,2.379546
1,beijing,2000,13.6,China,2.61007
2,karachi,2010,19.6,China,2.97553
3,shanghai,1990,13.3,China,2.587764
4,shanghai,2000,16.4,China,2.797281
5,shanghai,2010,23.0,China,3.135494


In [63]:
# write the data to a text file
# only the selected columns are written, in the order listed
# sep=' '     : a single space separates the values in the output file
# index=False : the row names (index) are not written
frame.loc[:, ['city', 'year', 'population']].to_csv(
    'frame.txt', sep=' ', index=False)

## matplotlib

In [35]:
# select the interactive notebook backend for matplotlib figures
# the explicit % prefix makes this a line magic regardless of the
# automagic setting or of any variable named matplotlib
%matplotlib notebook

In [36]:
# a random walk: the cumulative sum of 10 random steps,
# labelled 0, 10, ..., 90
s = pd.Series(np.cumsum(np.random.randn(10)),
              index=10 * np.arange(10))

In [44]:
# pyplot is imported in this cell because, when the notebook is run
# from top to bottom, this cell executes before the later
# "import matplotlib.pyplot as plt" cell and plt would be undefined
import matplotlib.pyplot as plt

# open a fresh figure and draw the series on it
s.plot(figure=plt.figure())


<IPython.core.display.Javascript object>

<AxesSubplot:>

In [38]:
s.plot()

<IPython.core.display.Javascript object>

<AxesSubplot:>

In [42]:
# loading matplotlib library
import matplotlib.pyplot as plt

In [43]:
# fig.add_subplot(x, y, i)
#   x: number of rows in the grid
#   y: number of columns in the grid
#   i: index of the subplot (counting starts at 1)
# e.g. fig.add_subplot(1, 1, 1) gives a figure with a single plot
# see the official documentation to learn more
fig = plt.figure()

# four panels on a 2 x 2 grid
ax1, ax2, ax3, ax4 = (fig.add_subplot(2, 2, i) for i in (1, 2, 3, 4))

# panel 1: histogram of 100 random draws
ax1.hist(np.random.randn(100), bins=20, color='k', alpha=0.3)

# panel 2: noisy linear trend
ax2.scatter(np.arange(30), np.arange(30) + np.random.randn(30) * 3)

# panel 3: random walk, dash-dot line
ax3.plot(np.cumsum(np.random.randn(50)), 'k-.')

# panel 4: sine of a random walk, dashed line
ax4.plot(np.sin(np.cumsum(np.random.randn(100))), 'k--')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x28783318a90>]

## statsmodels

In [67]:
# ignore some warning messages before importing
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# codes modified from Python for Data Analysis
# by Wes McKinney
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [68]:
def dnorm(mean, variance, size=1):
    """Draw random samples from a normal distribution.

    Parameters
    ----------
    mean : float
        Mean of the distribution.
    variance : float
        Variance (not the standard deviation) of the distribution.
        Must be non-negative.
    size : int or sequence of int, optional
        Shape of the output. A single integer n, either a Python int or
        a NumPy integer, gives a 1-D array of length n. Default is 1.

    Returns
    -------
    numpy.ndarray
        Array of the requested shape.

    Raises
    ------
    ValueError
        If variance is negative (the square root would be NaN).
    """
    if np.any(np.asarray(variance) < 0):
        raise ValueError("variance must be non-negative")
    # accept NumPy integers as well as Python ints as a scalar size;
    # a bare np.int64 cannot be unpacked with * below
    if isinstance(size, (int, np.integer)):
        size = (size,)
    # scale standard-normal draws by the standard deviation, then shift
    return mean + np.sqrt(variance) * np.random.randn(*size)

In [69]:
# simulate data for a linear regression

# number of observations
N = 100

# independent variables: three columns with mean 0 and
# variances 0.4, 0.6 and 0.2, drawn in that order
X = np.column_stack([dnorm(0, v, size=N) for v in (0.4, 0.6, 0.2)])

# random noise with mean 0 and variance 0.1
eps = dnorm(0, 0.1, size=N)

# true regression coefficients, one per column of X
beta = [0.1, 0.3, 0.5]

# dependent variable: linear combination of the columns plus noise
y = X.dot(beta) + eps

In [70]:
# collect the regressors and the response in one DataFrame:
# columns x1, x2, x3 come from X, and y is appended as the last column
data = pd.DataFrame(X, columns=['x1', 'x2', 'x3']).assign(y=y)

In [71]:
data.head()

Unnamed: 0,x1,x2,x3,y
0,0.113386,0.328887,0.176896,0.281413
1,1.5317,0.744785,0.030383,0.574844
2,-0.343157,1.560617,-0.09971,0.964428
3,1.166322,-0.420031,0.495947,-0.386127
4,0.639385,0.019169,0.264958,0.21334


In [72]:
# fit a regression model to the data
# the formula 'y ~ x1 + x2 + x3' means: y is explained by x1, x2 and x3
# (an Intercept term also appears in the fitted parameters)
# estimation method: OLS (ordinary least squares)
model = smf.ols('y ~ x1 + x2 + x3', data=data)
results = model.fit()

In [73]:
# estimated regression coefficients
# Intercept is beta_0; x1, x2, x3 are the estimated slopes
# compare with the true values beta = [0.1, 0.3, 0.5] used to generate y
results.params

Intercept   -0.018570
x1           0.134576
x2           0.318066
x3           0.389104
dtype: float64

In [74]:
# visualization: plot the fitted regression results against
# the regressor x1; the returned figure is kept in regression_fig
regression_fig = sm.graphics.plot_regress_exog(results, "x1")

<IPython.core.display.Javascript object>