# We have been working with two of Python's libraries that were developed for numerical arrays and for data manipulation.  These are *numpy* and *pandas*, respectively.

# *numpy*, short for Numerical Python, introduces the ndarray (an n-dimensional array) and rich functionality for working with arrays of numbers.  *pandas* builds on *numpy* and provides the DataFrame -- a flexible object for storing data.  *pandas* includes many methods to manipulate DataFrames.  It also provides the Series object, which is a single column (or vector) of data.

In [1]:
# import the modules we need
# using 'as' binds each module to a short alias used throughout the notebook
import numpy as np
import pandas as pd
import scipy as sp
# the stats submodule is imported explicitly; later cells reach it as sp.stats
from scipy import stats
import statistics as s
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): presumably switches on seaborn's default plot styling -- confirm against the seaborn docs
sns.set()

In [2]:
# build a one-dimensional ndarray (numpy's n-dimensional array type);
# with a single axis it behaves like a vector
x = np.array([
    1, 2, 3, 7, 4, 5, 4, 6,
    8, 10, 3, 4, 4, 3, 4, 5,
])
# number of observations along the only axis
x.shape[0]

16

In [3]:
np.var(x) # population variance (the default divides by n)

4.87109375

In [4]:
s.var(x)  ## Error -- no var() in statistics module; it names these variance() (sample) and pvariance() (population)

AttributeError: module 'statistics' has no attribute 'var'

In [5]:
# display the mean
# either call the mean() method on the array instance x,
# or call np.mean() explicitly, passing x as a parameter,
# or use the mean() function from the statistics module
x.mean()

4.5625

In [6]:
# the same mean via the module-level function, passing x as a parameter
np.mean(x)

4.5625

In [8]:
# convert to plain Python ints first: given the int64 ndarray directly,
# statistics.mean() came back truncated to 4 instead of 4.5625
s.mean(x.tolist())

4.5625

In [10]:
# weighted mean
# one weight per element of x; the weights are meant to sum to 1
weights = [0.0625, 0.0625, 0.1, 0.0025, 0.0625, 0.1225, 0.025, 0.0625,
           0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625]
print(sum(weights))
# reuse the list defined above rather than retyping the values, so the
# average always uses exactly the weights whose sum was just checked
print('%.4f' % np.average(x, weights=weights))

1.0
4.4050

In [11]:
# standard deviation
# we can use print to format our output
# x.std() and np.std(x) divide by n (population standard deviation)
print('%.4f' % x.std())
print('%.4f' % np.std(x))
# statistics.stdev() is the sample standard deviation (divides by n-1).
# Hand it plain Python ints: given the int64 ndarray directly it printed
# 2.2361, which is neither the population nor the sample value.
print('%.4f' % s.stdev(x.tolist()))

2.2071
2.2071
2.2794


In [12]:
# cross-check: compute the population standard deviation explicitly
# y holds the squared deviations from the mean (reused by the n-1 check further down)
y = np.square(x - x.mean())
# mean squared deviation (divide by n), then take the square root
z = (np.sum(y) / x.size) ** 0.5
print('%.4f' % z)

2.2071


In [13]:
# notice np.std() divides by n by default which gives us the population standard deviation
# change the parameter ddof=1 to divide by n-1 for the sample standard deviation
# (the one built on the unbiased variance estimate)
print('%.4f' % x.std(ddof=1))
print('%.4f' % np.std(x, ddof=1))

2.2794
2.2794


In [14]:
# the same check 'by hand': divide the summed squared deviations in y by n - 1
w = (np.sum(y) / (x.size - 1)) ** 0.5
print('%.4f' % w)

2.2794


In [15]:
# find the median of the data -- the middle value
# np.median() is numpy's function; s.median() comes from the statistics module
print(np.median(x))
print(s.median(x))

4.0
4.0


In [16]:
# Note that x.median() does not work: ndarray has no median() method,
# so call np.median(x) instead
x.median()  ## Throws an error

AttributeError: 'numpy.ndarray' object has no attribute 'median'

In [18]:
# and the mode -- the most frequently occurring value
# (statistics.mode(), used further down, does provide it)
np.mode(x)  ## Error!  NumPy does not have a mode()

AttributeError: module 'numpy' has no attribute 'mode'

In [19]:
x.mode() ## This doesn't work either -- ndarray has no mode() method

AttributeError: 'numpy.ndarray' object has no attribute 'mode'

In [20]:
# the statistics module does provide mode(): the most frequent value in x
s.mode(x)

4

In [21]:
# limit the number of decimals numpy shows when it prints arrays
np.set_printoptions(precision=4)

# a two dimensional array: two rows, five columns
x2 = np.array([
    [1, 2, 3, 4, 6],
    [7, 8, 10, 9, 12],
])

# with no axis given, each statistic is computed over all ten values
print(x2.mean(), np.median(x2), '%.4f' % x2.std(ddof=1), sep='\t')

6.2	6.5	3.6454


In [22]:
# axis=0 collapses the rows, giving one statistic per column;
# axis=1 collapses the columns, giving one statistic per row.
# The precision set with np.set_printoptions still applies to the printed arrays.
for axis in (0, 1):
    print(np.mean(x2, axis=axis), np.median(x2, axis=axis),
          np.std(x2, ddof=1, axis=axis), sep='\t')

[4.  5.  6.5 6.5 9. ]	[4.  5.  6.5 6.5 9. ]	[4.2426 4.2426 4.9497 3.5355 4.2426]
[3.2 9.2]	[3. 9.]	[1.9235 1.9235]


In [23]:
# Wrap x in a pandas Series -- a one dimensional, vector-like container.
# NOTE: this rebinds y, which earlier held the squared deviations.
y = pd.Series(data=x)
# summary statistics: count, mean, std, min, quartiles and max
y.describe()

count    16.000000
mean      4.562500
std       2.279437
min       1.000000
25%       3.000000
50%       4.000000
75%       5.250000
max      10.000000
dtype: float64

In [24]:
# scipy.stats offers a describe() function as well; its result reports the
# number of observations, min/max, mean, variance, skewness and kurtosis
stats.describe(x)

DescribeResult(nobs=16, minmax=(1, 10), mean=4.5625, variance=5.195833333333334, skewness=0.8408279053105321, kurtosis=0.3611104501645972)

In [25]:
# rebuild the two dimensional array (same values as before) and print the
# mean, median and sample standard deviation taken over all of its values
import numpy as np
x2 = np.array([[1, 2, 3, 4, 6],
               [7, 8, 10, 9, 12]])
print(x2.mean(), np.median(x2), '%.4f' % x2.std(ddof=1), sep='\t')

6.2	6.5	3.6454


In [26]:
# we can calculate the correlation in several ways as well
# np.corrcoef returns an np array: here a 2 x 2 matrix correlating the two rows of x2
np.corrcoef(x2)

array([[1.    , 0.9324],
       [0.9324, 1.    ]])

In [27]:
# calculate Pearson's correlation coefficient (stored in rho) and its p-value
# for the two rows of x2
# the p-value suggests the correlation is statistically significant
# import the stats submodule explicitly: `import scipy as sp` on its own does
# not guarantee that sp.stats has been loaded
import scipy as sp
from scipy import stats
rho, pstat = stats.pearsonr(x2[0], x2[1])
print('rho = %.4f' % rho, 'p-val = %.4f' % pstat)

rho = 0.9324 p-val = 0.0209


In [28]:
# compare the sample means of the two rows of x2 with an independent two-sample t-test
tstat, pval = stats.ttest_ind(*x2)  # unpacking x2 passes row 0 and row 1 as the two samples
print('t-stat = %.3f ' % tstat, 'p-value = %.3f' % pval)

t-stat = -4.932  p-value = 0.001


In [29]:
# linear regression -- we didn't cover it in the lectures
# But here's how
import statsmodels as sm
import statsmodels.regression.linear_model as lm
# import add_constant from its own module instead of relying on
# sm.tools.tools having been loaded as a side effect of another import
from statsmodels.tools.tools import add_constant

# response (Y) and single predictor (X): 20 observations each
Y = np.array([1, 3, 4, 5, 2, 3, 4, 8, 12, 14, 11, 12, 23, 17, 14, 17, 18, 21, 25, 30])
X = np.array([1, 2, 3, 4, 8, 6, 7, 9, 4, 17, 14, 15, 20, 31, 23, 25, 17, 20, 29, 30])
# prepend a column of ones so the model can estimate an intercept
# (reported as 'const' in the summary)
X = add_constant(X)

# fit ordinary least squares of Y on X and display the results
model = lm.OLS(Y, X)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.769
Model:,OLS,Adj. R-squared:,0.756
Method:,Least Squares,F-statistic:,59.9
Date:,"Mon, 08 Jul 2019",Prob (F-statistic):,3.91e-07
Time:,14:22:05,Log-Likelihood:,-55.961
No. Observations:,20,AIC:,115.9
Df Residuals:,18,BIC:,117.9
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.5225,1.667,0.913,0.373,-1.980,5.025
x1,0.7493,0.097,7.740,0.000,0.546,0.953

0,1,2,3
Omnibus:,0.209,Durbin-Watson:,1.625
Prob(Omnibus):,0.901,Jarque-Bera (JB):,0.376
Skew:,0.182,Prob(JB):,0.829
Kurtosis:,2.436,Cond. No.,30.7
