# Python Lab 04a: Introduction to Scikit-Learn (PCA, Kmeans, etc.) and to Pandas

## Francesco Della Santa, Computational Linear Algebra for Large Scale Problems, Politecnico di Torino

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


## Initialize PCA objects

In [2]:
pca_general = PCA()
pca_perc = PCA(n_components=0.5, svd_solver='full')
pca_ncomp = PCA(n_components=7)

### And let's use them...

In [3]:
N, n = 1000, 100
S = np.random.rand(N, n)

# Start using the pca object
pca_ncomp.fit(S)
pca_perc.fit(S)

# Representation of S in the m-dim space of PCs
Qm_ncomp = pca_ncomp.transform(S)
Qm_perc = pca_perc.transform(S)

# Approximation of S obtained using m PCs
Stilde_ncomp = pca_ncomp.inverse_transform(Qm_ncomp)
Stilde_perc = pca_perc.inverse_transform(Qm_perc)

print('*********************** DATASET S ***********************')
display(S)
print('*********************************************************')
print('')
print(f'*********************** DATASET PROJECTED (ncomp: {pca_ncomp.n_components_} PCs) ***********************')
display(Qm_ncomp)
print('*********************************************************************************************************')
print('')
print(f'*********************** DATASET PROJECTED (perc: {pca_perc.n_components_} PCs) ************************')
display(Qm_perc)
print('********************************************************************************************************')
print('')
print('*********************** RECOVERED DATASET S (ncomp) ***********************')
display(Stilde_ncomp)
print('***************************************************************************')
print('')
print('*********************** RECOVERED DATASET S (perc) ***********************')
display(Stilde_perc)
print('**************************************************************************')

*********************** DATASET S ***********************


array([[0.53566651, 0.78666213, 0.06997724, ..., 0.72667312, 0.07013775,
        0.92757256],
       [0.71797023, 0.47159458, 0.40845786, ..., 0.27832682, 0.78271398,
        0.06296754],
       [0.28784978, 0.07138505, 0.07683803, ..., 0.44087983, 0.75609448,
        0.71134203],
       ...,
       [0.80967678, 0.56235797, 0.14676289, ..., 0.12305935, 0.49770953,
        0.38172406],
       [0.50146738, 0.61574698, 0.21042342, ..., 0.13583939, 0.03836363,
        0.39566115],
       [0.12405243, 0.05427085, 0.60666235, ..., 0.75171678, 0.76659906,
        0.33587548]])

*********************************************************

*********************** DATASET PROJECTED (ncomp: 7 PCs) ***********************


array([[-0.14289706,  0.16145916, -0.84844963, ...,  0.51520347,
        -0.01483179,  0.43802519],
       [ 0.02653124, -0.12398001, -0.17728013, ...,  0.14672251,
         0.24428667, -0.52313782],
       [ 0.42162141,  0.5333826 , -0.22420605, ...,  0.40871121,
        -0.02255963,  0.6924166 ],
       ...,
       [-0.11094688,  0.3703767 ,  0.37369097, ...,  0.34534167,
        -0.23211099, -0.59320292],
       [-0.21989821,  0.41011648,  0.21521325, ...,  0.40651114,
         1.1135747 ,  0.20400469],
       [ 0.15847758,  0.17459264, -0.24508869, ...,  0.04291636,
        -0.18021909, -0.22168225]])

*********************************************************************************************************

*********************** DATASET PROJECTED (perc: 37 PCs) ************************


array([[ 0.03556666,  0.12493278,  0.89323848, ...,  0.1597615 ,
        -0.03290411, -0.12494172],
       [ 0.02332641, -0.09190828,  0.15753782, ..., -0.07505402,
         0.75431658, -0.08961059],
       [-0.40935314,  0.50644458,  0.18887729, ..., -0.00095823,
        -0.20693717, -0.05114975],
       ...,
       [ 0.1391469 ,  0.48924868, -0.37393527, ...,  0.07700956,
         0.20511396,  0.3614123 ],
       [ 0.15967762,  0.32086041, -0.21747668, ..., -0.00521971,
         0.12475271,  0.25621681],
       [-0.20192801,  0.18634647,  0.22629727, ...,  0.30002269,
         0.0399945 ,  0.31695857]])

********************************************************************************************************

*********************** RECOVERED DATASET S (ncomp) ***********************


array([[0.61358859, 0.4316882 , 0.54299823, ..., 0.52524242, 0.47350967,
        0.511401  ],
       [0.562302  , 0.46798884, 0.60888123, ..., 0.39152411, 0.46249841,
        0.48928136],
       [0.50672636, 0.47403026, 0.38207415, ..., 0.4850853 , 0.50675339,
        0.62828689],
       ...,
       [0.5035061 , 0.46614824, 0.47908756, ..., 0.38232789, 0.61763855,
        0.45520228],
       [0.44209606, 0.4948153 , 0.54305934, ..., 0.39616927, 0.43362802,
        0.45153465],
       [0.50162723, 0.44481298, 0.55385613, ..., 0.43799158, 0.50976928,
        0.51455901]])

***************************************************************************

*********************** RECOVERED DATASET S (perc) ***********************


array([[0.60448145, 0.55869882, 0.39095024, ..., 0.63687854, 0.14844551,
        0.45077149],
       [0.69350451, 0.40075847, 0.44713701, ..., 0.33243674, 0.59487304,
        0.28993506],
       [0.42478324, 0.20345124, 0.29112987, ..., 0.42023984, 0.59543451,
        0.78616123],
       ...,
       [0.45260893, 0.40846886, 0.28562566, ..., 0.04969052, 0.68850066,
        0.54914575],
       [0.39629723, 0.58712409, 0.36026349, ..., 0.27399475, 0.50450972,
        0.63227412],
       [0.46191472, 0.24076361, 0.64601827, ..., 0.42947759, 0.48228334,
        0.40291796]])

**************************************************************************


In [4]:
display(pca_ncomp.explained_variance_ratio_)

array([0.01700256, 0.01649356, 0.01608098, 0.01586973, 0.01544048,
       0.01522833, 0.01506221])

## Initialize the Standard Scaler

In [5]:
scaler_recent = StandardScaler(with_std=False)
scaler_znorm = StandardScaler()
# Start using the scaler objects
scaler_recent.fit(S)
scaler_znorm.fit(S)
# recentered S
Sbar = scaler_recent.transform(S)
# standardized S
Shat = scaler_znorm.transform(S)

print(f'*********************** DATASET RECENTERED ***********************')
print('SAMPLE MEAN OF RECENTERED DATA:')
display(Sbar.mean(axis=1)[:10])
print('SAMPLE ST.DEV. OF RECENTERED DATA:')
display(Sbar.std(axis=1)[:10])
print('')
display(Sbar)
print('*******************************************************************')
print('')
print(f'*********************** DATASET STANDARDIZED ***********************')
print('SAMPLE MEAN OF STANDARDIZED DATA:')
display(Shat.mean(axis=1)[:10])
print('SAMPLE ST.DEV. OF STANDARDIZED DATA:')
display(Shat.std(axis=1)[:10])
print('')
display(Shat)
print('**********************************************************************')

*********************** DATASET RECENTERED ***********************
SAMPLE MEAN OF RECENTERED DATA:


array([ 0.02196296, -0.0413923 , -0.00980249,  0.01367911, -0.0039938 ,
       -0.02364885,  0.00982203, -0.00102131,  0.01747829,  0.00218793])

SAMPLE ST.DEV. OF RECENTERED DATA:


array([0.29706084, 0.27065461, 0.25224881, 0.29997916, 0.3091373 ,
       0.27965403, 0.27590779, 0.28188359, 0.28754857, 0.30676618])




array([[ 0.04175911,  0.28730255, -0.4328072 , ...,  0.23885407,
        -0.4314514 ,  0.41585525],
       [ 0.22406283, -0.027765  , -0.09432658, ..., -0.20949223,
         0.28112483, -0.44874976],
       [-0.20605762, -0.42797453, -0.42594642, ..., -0.04693922,
         0.25450533,  0.19962473],
       ...,
       [ 0.31576938,  0.06299839, -0.35602155, ..., -0.36475971,
        -0.00387962, -0.12999324],
       [ 0.00755998,  0.1163874 , -0.29236102, ..., -0.35197966,
        -0.46322552, -0.11605615],
       [-0.36985497, -0.44508873,  0.10387791, ...,  0.26389773,
         0.26500991, -0.17584182]])

*******************************************************************

*********************** DATASET STANDARDIZED ***********************
SAMPLE MEAN OF STANDARDIZED DATA:


array([ 0.07656791, -0.14349524, -0.03458505,  0.04911477, -0.0110577 ,
       -0.08131313,  0.03384638, -0.00492534,  0.05971915,  0.00536542])

SAMPLE ST.DEV. OF STANDARDIZED DATA:


array([1.03318398, 0.94336432, 0.87501733, 1.04171113, 1.07223956,
       0.97135159, 0.95742583, 0.9796014 , 0.99778061, 1.06621364])




array([[ 0.1453024 ,  0.99031009, -1.51786536, ...,  0.83373299,
        -1.49827753,  1.47724961],
       [ 0.779635  , -0.09570385, -0.3308056 , ..., -0.73124392,
         0.97624674, -1.59410133],
       [-0.71698519, -1.47519575, -1.49380442, ..., -0.16384389,
         0.88380665,  0.70913027],
       ...,
       [ 1.09873139,  0.21715067, -1.24857622, ..., -1.27321341,
        -0.01347254, -0.46177718],
       [ 0.02630523,  0.40117855, -1.02531721, ..., -1.22860398,
        -1.60861776, -0.41226822],
       [-1.28692421, -1.53418711,  0.36430234, ...,  0.92114925,
         0.92028532, -0.62464587]])

**********************************************************************


### Let's apply the PCA to standardized data

In [6]:
pca = PCA(n_components=7)

# Start with PCA
pca.fit(Shat)
Qm = pca.transform(Shat)

# Recovering of Shat_tilde
Shat_tilde = pca.inverse_transform(Qm)

# Recovering of S_tilde
S_tilde = scaler_znorm.inverse_transform(Shat_tilde)

print('*********************** RECOVERED DATASET Shat ***********************')
display(Shat_tilde)
print('**********************************************************************')
print('*********************** RECOVERED DATASET S ***********************')
display(Shat)
print('*******************************************************************')

*********************** RECOVERED DATASET Shat ***********************


array([[ 0.82611984, -0.01674037, -0.15546842, ...,  0.20784152,
        -0.14050764,  0.31025454],
       [-0.04759922, -0.1485822 ,  0.47405147, ...,  0.12506492,
         0.1434809 , -0.32558324],
       [-0.02906813, -0.14694074, -0.42900768, ..., -0.0357213 ,
         0.07902969,  0.40714213],
       ...,
       [-0.40669749, -0.0565555 ,  0.11253086, ..., -0.30305186,
         0.17284155, -0.17963792],
       [-0.04052932, -0.16693124,  0.12179542, ...,  0.32678724,
         0.56230943, -0.24245394],
       [ 0.01349107, -0.12300233,  0.24586766, ..., -0.39025335,
        -0.34362597,  0.18991251]])

**********************************************************************
*********************** RECOVERED DATASET S ***********************


array([[ 0.1453024 ,  0.99031009, -1.51786536, ...,  0.83373299,
        -1.49827753,  1.47724961],
       [ 0.779635  , -0.09570385, -0.3308056 , ..., -0.73124392,
         0.97624674, -1.59410133],
       [-0.71698519, -1.47519575, -1.49380442, ..., -0.16384389,
         0.88380665,  0.70913027],
       ...,
       [ 1.09873139,  0.21715067, -1.24857622, ..., -1.27321341,
        -0.01347254, -0.46177718],
       [ 0.02630523,  0.40117855, -1.02531721, ..., -1.22860398,
        -1.60861776, -0.41226822],
       [-1.28692421, -1.53418711,  0.36430234, ...,  0.92114925,
         0.92028532, -0.62464587]])

*******************************************************************


## Initialize $k$-Means

In [7]:
kmeans_default = KMeans()
kmeans_3c = KMeans(n_clusters=3, init='random', algorithm='full')

W0 = np.random.rand(3, 10)
kmeans_3cW0 = KMeans(n_clusters=3, init=W0, algorithm='full')

### And let's use one of them...

In [8]:
Snew = np.random.rand(N, n)

km = kmeans_3c  # change the KMeans object here if you want to try another one

# Start using the km object, fitting it on the data S
km.fit(S)

# Prediction of cluster belonging w.r.t. S
S_labels = km.labels_

# Prediction of cluster belonging w.r.t. Snew
Snew_labels = km.predict(Snew)

print(f'*********************** S labels ***********************')
display(S_labels[:10])
print('*********************************************************')
print('')
print(f'*********************** Snew labels ***********************')
display(Snew_labels[:10])
print('************************************************************')



*********************** S labels ***********************


array([1, 1, 2, 2, 2, 1, 2, 0, 1, 1])

*********************************************************

*********************** Snew labels ***********************


array([0, 2, 0, 1, 1, 0, 2, 1, 2, 0])

************************************************************


## Initialize Serieses

### Using Arrays

In [9]:
x = np.random.rand(10)
x

array([0.64461643, 0.04827068, 0.36600185, 0.71668562, 0.08284247,
       0.9886752 , 0.60809559, 0.61479261, 0.13734408, 0.02835941])

In [10]:
s1 = pd.Series(x, index=[f'index{i}' for i in range(1,11)], name='my_series1')
s2 = pd.Series(x, name='my_series2')

In [11]:
s1 

index1     0.644616
index2     0.048271
index3     0.366002
index4     0.716686
index5     0.082842
index6     0.988675
index7     0.608096
index8     0.614793
index9     0.137344
index10    0.028359
Name: my_series1, dtype: float64

In [12]:
s2

0    0.644616
1    0.048271
2    0.366002
3    0.716686
4    0.082842
5    0.988675
6    0.608096
7    0.614793
8    0.137344
9    0.028359
Name: my_series2, dtype: float64

### Using Dictionaries

In [13]:
d = {'Age':30, 'Height':185, 'Weight':90}
d

{'Age': 30, 'Height': 185, 'Weight': 90}

In [14]:
s1d = pd.Series(d)

In [15]:
s1d

Age        30
Height    185
Weight     90
dtype: int64

## Initialize DataFrames

### Using Dictionaries

In [16]:
D = {'Float_random':np.random.rand(10), 'Integer_random':np.random.permutation(10)}
D

{'Float_random': array([0.2584962 , 0.50666653, 0.35963522, 0.90111776, 0.76662491,
        0.52911099, 0.567562  , 0.27904151, 0.55966822, 0.85745155]),
 'Integer_random': array([2, 0, 5, 6, 7, 9, 8, 1, 4, 3])}

In [17]:
df1d = pd.DataFrame(D)
df1d

Unnamed: 0,Float_random,Integer_random
0,0.258496,2
1,0.506667,0
2,0.359635,5
3,0.901118,6
4,0.766625,7
5,0.529111,9
6,0.567562,8
7,0.279042,1
8,0.559668,4
9,0.857452,3


In [18]:
df1d.dtypes

Float_random      float64
Integer_random      int32
dtype: object

### Using Arrays

In [19]:
X = np.random.rand(10,5)
X

array([[0.93274847, 0.14001167, 0.53123361, 0.1653525 , 0.08870943],
       [0.60987757, 0.41671405, 0.69922765, 0.62588375, 0.81265881],
       [0.04269385, 0.23203311, 0.92603185, 0.4685763 , 0.86329175],
       [0.72124404, 0.31277022, 0.04894935, 0.32642883, 0.88958255],
       [0.89997147, 0.32078097, 0.53449584, 0.05232064, 0.00312293],
       [0.06155894, 0.66539908, 0.21617857, 0.5990816 , 0.1871644 ],
       [0.54687115, 0.75417703, 0.60019556, 0.72022973, 0.24198225],
       [0.48493874, 0.82488343, 0.62204434, 0.83415689, 0.51602075],
       [0.5193024 , 0.34619589, 0.21313909, 0.08398871, 0.85059098],
       [0.9599845 , 0.50666108, 0.44267963, 0.08017813, 0.89235792]])

In [20]:
df1 = pd.DataFrame(X, index=range(1, X.shape[0] + 1), columns=[f'column_{i}' for i in range(1, X.shape[1] + 1)])
df1

Unnamed: 0,column_1,column_2,column_3,column_4,column_5
1,0.932748,0.140012,0.531234,0.165353,0.088709
2,0.609878,0.416714,0.699228,0.625884,0.812659
3,0.042694,0.232033,0.926032,0.468576,0.863292
4,0.721244,0.31277,0.048949,0.326429,0.889583
5,0.899971,0.320781,0.534496,0.052321,0.003123
6,0.061559,0.665399,0.216179,0.599082,0.187164
7,0.546871,0.754177,0.600196,0.72023,0.241982
8,0.484939,0.824883,0.622044,0.834157,0.516021
9,0.519302,0.346196,0.213139,0.083989,0.850591
10,0.959985,0.506661,0.44268,0.080178,0.892358


## Extract/Add Column

In [21]:
df1['column_2']

1     0.140012
2     0.416714
3     0.232033
4     0.312770
5     0.320781
6     0.665399
7     0.754177
8     0.824883
9     0.346196
10    0.506661
Name: column_2, dtype: float64

In [22]:
df1['column_6'] = np.random.rand(10)
df1

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,0.932748,0.140012,0.531234,0.165353,0.088709,0.683145
2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


## DataFrame Attributes

We use the DataFrame df1 defined above.

In [23]:
df1.at[6, 'column_2']

0.6653990763886776

In [24]:
df1.iat[5, 1]

0.6653990763886776

In [25]:
df1.index

RangeIndex(start=1, stop=11, step=1)

In [26]:
df1.columns

Index(['column_1', 'column_2', 'column_3', 'column_4', 'column_5', 'column_6'], dtype='object')

In [27]:
df1.axes

[RangeIndex(start=1, stop=11, step=1),
 Index(['column_1', 'column_2', 'column_3', 'column_4', 'column_5', 'column_6'], dtype='object')]

In [28]:
df1.loc[[1,7,10], :]

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,0.932748,0.140012,0.531234,0.165353,0.088709,0.683145
7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


In [29]:
df1.loc[[1,7,10], ['column_1', 'column_3']]

Unnamed: 0,column_1,column_3
1,0.932748,0.531234
7,0.546871,0.600196
10,0.959985,0.44268


In [30]:
df1.iloc[[0,6,9],[0,2]]

Unnamed: 0,column_1,column_3
1,0.932748,0.531234
7,0.546871,0.600196
10,0.959985,0.44268


In [31]:
df1.loc[(df1.index > 3) & (df1.index <= 7), df1.columns != 'column_3']

Unnamed: 0,column_1,column_2,column_4,column_5,column_6
4,0.721244,0.31277,0.326429,0.889583,0.275205
5,0.899971,0.320781,0.052321,0.003123,0.933706
6,0.061559,0.665399,0.599082,0.187164,0.680909
7,0.546871,0.754177,0.72023,0.241982,0.607568


In [32]:
df1.shape

(10, 6)

In [33]:
df1.ndim

2

In [34]:
df1.size

60

In [35]:
df1.values

array([[0.93274847, 0.14001167, 0.53123361, 0.1653525 , 0.08870943,
        0.68314462],
       [0.60987757, 0.41671405, 0.69922765, 0.62588375, 0.81265881,
        0.76736354],
       [0.04269385, 0.23203311, 0.92603185, 0.4685763 , 0.86329175,
        0.28064674],
       [0.72124404, 0.31277022, 0.04894935, 0.32642883, 0.88958255,
        0.27520474],
       [0.89997147, 0.32078097, 0.53449584, 0.05232064, 0.00312293,
        0.93370552],
       [0.06155894, 0.66539908, 0.21617857, 0.5990816 , 0.1871644 ,
        0.68090852],
       [0.54687115, 0.75417703, 0.60019556, 0.72022973, 0.24198225,
        0.60756751],
       [0.48493874, 0.82488343, 0.62204434, 0.83415689, 0.51602075,
        0.66747159],
       [0.5193024 , 0.34619589, 0.21313909, 0.08398871, 0.85059098,
        0.63063153],
       [0.9599845 , 0.50666108, 0.44267963, 0.08017813, 0.89235792,
        0.21036152]])

## DataFrame Methods

We use the DataFrame df1 defined above.

### Exploration Methods

In [36]:
df1.head(3)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,0.932748,0.140012,0.531234,0.165353,0.088709,0.683145
2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647


In [37]:
df1.tail(2)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


In [38]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 1 to 10
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   column_1  10 non-null     float64
 1   column_2  10 non-null     float64
 2   column_3  10 non-null     float64
 3   column_4  10 non-null     float64
 4   column_5  10 non-null     float64
 5   column_6  10 non-null     float64
dtypes: float64(6)
memory usage: 612.0 bytes


In [39]:
df1.nunique()

column_1    10
column_2    10
column_3    10
column_4    10
column_5    10
column_6    10
dtype: int64

In [40]:
df1.nunique(axis=1)

1     6
2     6
3     6
4     6
5     6
6     6
7     6
8     6
9     6
10    6
dtype: int64

In [41]:
df1.isna()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False
10,False,False,False,False,False,False


In [42]:
df1.count()

column_1    10
column_2    10
column_3    10
column_4    10
column_5    10
column_6    10
dtype: int64

In [43]:
df1.value_counts()

column_1  column_2  column_3  column_4  column_5  column_6
0.042694  0.232033  0.926032  0.468576  0.863292  0.280647    1
0.061559  0.665399  0.216179  0.599082  0.187164  0.680909    1
0.484939  0.824883  0.622044  0.834157  0.516021  0.667472    1
0.519302  0.346196  0.213139  0.083989  0.850591  0.630632    1
0.546871  0.754177  0.600196  0.720230  0.241982  0.607568    1
0.609878  0.416714  0.699228  0.625884  0.812659  0.767364    1
0.721244  0.312770  0.048949  0.326429  0.889583  0.275205    1
0.899971  0.320781  0.534496  0.052321  0.003123  0.933706    1
0.932748  0.140012  0.531234  0.165353  0.088709  0.683145    1
0.959985  0.506661  0.442680  0.080178  0.892358  0.210362    1
dtype: int64

In [44]:
df1.value_counts(normalize=True)

column_1  column_2  column_3  column_4  column_5  column_6
0.042694  0.232033  0.926032  0.468576  0.863292  0.280647    0.1
0.061559  0.665399  0.216179  0.599082  0.187164  0.680909    0.1
0.484939  0.824883  0.622044  0.834157  0.516021  0.667472    0.1
0.519302  0.346196  0.213139  0.083989  0.850591  0.630632    0.1
0.546871  0.754177  0.600196  0.720230  0.241982  0.607568    0.1
0.609878  0.416714  0.699228  0.625884  0.812659  0.767364    0.1
0.721244  0.312770  0.048949  0.326429  0.889583  0.275205    0.1
0.899971  0.320781  0.534496  0.052321  0.003123  0.933706    0.1
0.932748  0.140012  0.531234  0.165353  0.088709  0.683145    0.1
0.959985  0.506661  0.442680  0.080178  0.892358  0.210362    0.1
dtype: float64

### Statistical Analysis (Basic) and Operations

In [45]:
df1.describe()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.577919,0.451963,0.483418,0.39562,0.534548,0.573701
std,0.327159,0.229406,0.261432,0.292444,0.36926,0.238224
min,0.042694,0.140012,0.048949,0.052321,0.003123,0.210362
25%,0.49353,0.314773,0.272804,0.10433,0.200869,0.362377
50%,0.578374,0.381455,0.532865,0.397503,0.66434,0.649052
75%,0.85529,0.625715,0.616582,0.619183,0.860117,0.682586
max,0.959985,0.824883,0.926032,0.834157,0.892358,0.933706


In [46]:
df1.describe(percentiles=[0.13, 0.87, 0.99])

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.577919,0.451963,0.483418,0.39562,0.534548,0.573701
std,0.327159,0.229406,0.261432,0.292444,0.36926,0.238224
min,0.042694,0.140012,0.048949,0.052321,0.003123,0.210362
13%,0.133534,0.245758,0.213656,0.080826,0.105447,0.27613
50%,0.578374,0.381455,0.532865,0.397503,0.66434,0.649052
87%,0.927176,0.739085,0.686106,0.704191,0.885113,0.753046
99%,0.957533,0.81852,0.905619,0.823903,0.892108,0.918735
max,0.959985,0.824883,0.926032,0.834157,0.892358,0.933706


In [47]:
df1.mean()

column_1    0.577919
column_2    0.451963
column_3    0.483418
column_4    0.395620
column_5    0.534548
column_6    0.573701
dtype: float64

In [48]:
df1.mean(axis=1)

1     0.423533
2     0.655288
3     0.468879
4     0.429030
5     0.457400
6     0.401715
7     0.578504
8     0.658253
9     0.440641
10    0.515370
dtype: float64

In [49]:
df1.corr()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
column_1,1.0,-0.271619,-0.178106,-0.571419,-0.126151,0.096583
column_2,-0.271619,1.0,-0.018128,0.700749,-0.139108,0.121982
column_3,-0.178106,-0.018128,1.0,0.326856,-0.047286,0.094755
column_4,-0.571419,0.700749,0.326856,1.0,-0.037631,0.105063
column_5,-0.126151,-0.139108,-0.047286,-0.037631,1.0,-0.694035
column_6,0.096583,0.121982,0.094755,0.105063,-0.694035,1.0


In [50]:
df1.cov()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
column_1,0.107033,-0.020386,-0.015233,-0.054671,-0.01524,0.007527
column_2,-0.020386,0.052627,-0.001087,0.047012,-0.011784,0.006666
column_3,-0.015233,-0.001087,0.068347,0.02499,-0.004565,0.005901
column_4,-0.054671,0.047012,0.02499,0.085523,-0.004064,0.007319
column_5,-0.01524,-0.011784,-0.004565,-0.004064,0.136353,-0.061052
column_6,0.007527,0.006666,0.005901,0.007319,-0.061052,0.056751


In [51]:
df1.sample(3, random_state=10)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909


### Attributes, Operation Methods, and Transformation Methods

In [52]:
df1_copy = df1.copy()
df1_fakecopy = df1

In [53]:
df1_fakecopy.at[1, 'column_1'] = 10

In [54]:
df1_copy.at[1, 'column_1'] = np.nan

In [55]:
df1_copy

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,,0.140012,0.531234,0.165353,0.088709,0.683145
2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


In [56]:
df1

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,10.0,0.140012,0.531234,0.165353,0.088709,0.683145
2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


(Notare che la modifica fatta a df1_fakecopy ha modificato anche df1!)

In [57]:
df1.append(df1_copy)

  df1.append(df1_copy)


Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,10.0,0.140012,0.531234,0.165353,0.088709,0.683145
2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


In [58]:
df1.drop([1, 3], axis=0)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


In [59]:
df1_copy.dropna()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


In [60]:
df1_copy.fillna(1000)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,1000.0,0.140012,0.531234,0.165353,0.088709,0.683145
2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


In [61]:
df1_copy.rename(index={1:'nuovo_index'}, columns={'column_1':'COLONNA_1'})

Unnamed: 0,COLONNA_1,column_2,column_3,column_4,column_5,column_6
nuovo_index,,0.140012,0.531234,0.165353,0.088709,0.683145
2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


In [62]:
df1_copy.reset_index()

Unnamed: 0,index,column_1,column_2,column_3,column_4,column_5,column_6
0,1,,0.140012,0.531234,0.165353,0.088709,0.683145
1,2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
2,3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
3,4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
4,5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
5,6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
6,7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
7,8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
8,9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
9,10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


In [63]:
df1_copy.sort_values('column_1')

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362
1,,0.140012,0.531234,0.165353,0.088709,0.683145


(Notare il NaN messo in fondo; per maggiori informazioni, guardare la documentazione ufficiale)

### Exportation Methods

In [64]:
df1.to_csv('df1.csv', columns=['column_1', 'column_5', 'column_2'], index_label='ID')

In [65]:
df1.to_pickle('df1.pkl')

## Loading a DataFrame

In [66]:
pd.read_csv('df1.csv')

Unnamed: 0,ID,column_1,column_5,column_2
0,1,10.0,0.088709,0.140012
1,2,0.609878,0.812659,0.416714
2,3,0.042694,0.863292,0.232033
3,4,0.721244,0.889583,0.31277
4,5,0.899971,0.003123,0.320781
5,6,0.061559,0.187164,0.665399
6,7,0.546871,0.241982,0.754177
7,8,0.484939,0.516021,0.824883
8,9,0.519302,0.850591,0.346196
9,10,0.959985,0.892358,0.506661


In [67]:
pd.read_csv('df1.csv', usecols=['ID', 'column_1', 'column_2'], index_col='ID')

Unnamed: 0_level_0,column_1,column_2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10.0,0.140012
2,0.609878,0.416714
3,0.042694,0.232033
4,0.721244,0.31277
5,0.899971,0.320781
6,0.061559,0.665399
7,0.546871,0.754177
8,0.484939,0.824883
9,0.519302,0.346196
10,0.959985,0.506661


In [68]:
pd.read_pickle('df1.pkl')

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,10.0,0.140012,0.531234,0.165353,0.088709,0.683145
2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


## Concatenation of DataFrames

In [69]:
pd.concat([df1, df1.reset_index()], axis=1)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6,index,column_1.1,column_2.1,column_3.1,column_4.1,column_5.1,column_6.1
1,10.0,0.140012,0.531234,0.165353,0.088709,0.683145,2.0,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364,3.0,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647,4.0,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205,5.0,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706,6.0,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909,7.0,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568,8.0,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472,9.0,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632,10.0,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362,,,,,,,
