# Python Lab 04a: Introduction to Scikit-Learn (PCA, k-Means, etc.) and to Pandas

## Francesco Della Santa, Computational Linear Algebra for Large Scale Problems, Politecnico di Torino

## Initialize PCA objects

In [None]:
# Three PCA configurations (parameter semantics as described in the scikit-learn PCA docs):
# - no arguments: library defaults, no explicit limit on the number of components
pca_general = PCA()
# - float n_components in (0, 1) with svd_solver='full': keep as many PCs as needed
#   to explain that fraction (here 50%) of the total variance
pca_perc = PCA(n_components=0.5, svd_solver='full')
# - integer n_components: keep exactly 7 PCs
pca_ncomp = PCA(n_components=7)

In [None]:
# Synthetic dataset: N rows (samples) by n columns (features), random entries
N, n = 1000, 100
S = np.random.rand(N, n)

# Fit both PCA objects on the same data
pca_ncomp.fit(S)
pca_perc.fit(S)

# Coordinates of S in the space spanned by the retained PCs
Qm_ncomp, Qm_perc = pca_ncomp.transform(S), pca_perc.transform(S)

# Back-projection to the original space: approximation of S built from the retained PCs only
Stilde_ncomp, Stilde_perc = (
    pca_ncomp.inverse_transform(Qm_ncomp),
    pca_perc.inverse_transform(Qm_perc),
)

# Each report section is (header line, object to display, closing line)
report_sections = [
    (
        '*********************** DATASET S ***********************',
        S,
        '*********************************************************',
    ),
    (
        f'*********************** DATASET PROJECTED (ncomp: {pca_ncomp.n_components_} PCs) ***********************',
        Qm_ncomp,
        '*********************************************************************************************************',
    ),
    (
        f'*********************** DATASET PROJECTED (perc: {pca_perc.n_components_} PCs) ************************',
        Qm_perc,
        '********************************************************************************************************',
    ),
    (
        '*********************** RECOVERED DATASET S (ncomp) ***********************',
        Stilde_ncomp,
        '***************************************************************************',
    ),
    (
        '*********************** RECOVERED DATASET S (perc) ***********************',
        Stilde_perc,
        '**************************************************************************',
    ),
]

for position, (header, payload, footer) in enumerate(report_sections):
    # An empty line separates consecutive sections (none before the first one)
    if position > 0:
        print('')
    print(header)
    display(payload)
    print(footer)

In [None]:
# Fraction of the total variance explained by each PC retained by pca_ncomp
display(pca_ncomp.explained_variance_ratio_)

In [None]:
# StandardScaler works feature-wise (i.e. column by column):
# - with_std=False: only subtract each column's mean (recentering)
# - default: subtract the mean and divide by the column's standard deviation (z-normalization)
scaler_recent = StandardScaler(with_std=False)
scaler_znorm = StandardScaler()
# Start using the scaler objects
scaler_recent.fit(S)
scaler_znorm.fit(S)
# recentered S
Sbar = scaler_recent.transform(S)
# standardized S
Shat = scaler_znorm.transform(S)

# The statistics below are computed along axis=0 (over the samples, one value per
# feature), the same axis the scalers operate on: recentered data must show means ~0,
# standardized data means ~0 and st.dev. ~1. Along axis=1 (per sample) neither holds.
# Only the first 10 features are displayed.
print('*********************** DATASET RECENTERED ***********************')
print('SAMPLE MEAN OF RECENTERED DATA:')
display(Sbar.mean(axis=0)[:10])
print('SAMPLE ST.DEV. OF RECENTERED DATA:')
display(Sbar.std(axis=0)[:10])
print('')
display(Sbar)
print('*******************************************************************')
print('')
print('*********************** DATASET STANDARDIZED ***********************')
print('SAMPLE MEAN OF STANDARDIZED DATA:')
display(Shat.mean(axis=0)[:10])
print('SAMPLE ST.DEV. OF STANDARDIZED DATA:')
display(Shat.std(axis=0)[:10])
print('')
display(Shat)
print('**********************************************************************')

### Let's apply the PCA to standardized data

## Initialize $k$-Means

In [None]:
# k-Means with every parameter left at the library default
kmeans_default = KMeans()
# 3 clusters, centroids initialised at random, classical Lloyd iterations.
# NOTE: algorithm='full' was deprecated in scikit-learn 1.1 and removed in 1.3;
# 'lloyd' is the current name of the same algorithm.
kmeans_3c = KMeans(n_clusters=3, init='random', algorithm='lloyd')

# User-supplied initial centroids: the array must have shape (n_clusters, n_features).
# The datasets used in this notebook have n features, hence (3, n); a different number
# of columns makes fit() fail with a shape-mismatch error.
W0 = np.random.rand(3, n)
# n_init=1: with explicit initial centroids a single run is the only meaningful choice
kmeans_3cW0 = KMeans(n_clusters=3, init=W0, n_init=1, algorithm='lloyd')

In [None]:
# Fresh random dataset with the same shape as S, used only for prediction
Snew = np.random.rand(N, n)

# Model under test: swap in another KMeans object here to try a different configuration
km = kmeans_3c

# Fit on S, then read the cluster assigned to each training sample
km.fit(S)
S_labels = km.labels_

# Cluster assigned to each sample of Snew by the model fitted on S
Snew_labels = km.predict(Snew)

# Each section is (header line, labels to show, closing line); only the first 10 labels are displayed
label_sections = [
    (
        '*********************** S labels ***********************',
        S_labels,
        '*********************************************************',
    ),
    (
        '*********************** Snew labels ***********************',
        Snew_labels,
        '************************************************************',
    ),
]

for position, (header, labels, footer) in enumerate(label_sections):
    # An empty line separates the two sections
    if position > 0:
        print('')
    print(header)
    display(labels[:10])
    print(footer)

## Initialize Series

### Using Arrays

In [None]:
# Custom row labels 'index1' ... 'index10' for the first Series
# NOTE(review): x is not defined in the visible cells; it must hold exactly 10 values
# to match these 10 labels. Confirm it is created before this cell.
series_labels = ['index' + str(k) for k in range(1, 11)]
s1 = pd.Series(data=x, index=series_labels, name='my_series1')
# Same data, but with the default integer index
s2 = pd.Series(data=x, name='my_series2')

In [None]:
# Show the Series with the custom string index
s1 

### Using Dictionaries

In [None]:
# Example dictionary: three string keys mapped to integer values
d = dict(Age=30, Height=185, Weight=90)
d

In [None]:
# Show the Series s1d.
# NOTE(review): s1d is not defined in the visible cells; presumably it is a Series
# built from the dictionary d in a cell not shown here. Confirm it exists on a fresh kernel.
s1d

## Initialize DataFrames

### Using Dictionaries

In [None]:
# Build a DataFrame from D and show it.
# NOTE(review): D is not defined in the visible cells; presumably it is a dictionary of
# columns created in a cell not shown here. Confirm it exists on a fresh kernel.
df1d = pd.DataFrame(D)
df1d

In [None]:
# Data type of each column of df1d
df1d.dtypes

In [None]:
# 10x5 array of random values in [0, 1); the values change between runs unless a seed was set earlier
X = np.random.rand(10,5)
X

In [None]:
# Row labels 1..number_of_rows and column names 'column_1'..'column_<number_of_columns>'
row_labels = range(1, X.shape[0] + 1)
col_labels = ['column_%d' % j for j in range(1, X.shape[1] + 1)]

# Wrap the array X in a DataFrame carrying those labels
df1 = pd.DataFrame(data=X, index=row_labels, columns=col_labels)
df1

In [None]:
# Select a single column by name (the result is a pandas Series)
df1['column_2']

In [None]:
# Add a new column to df1 in place; the array length (10) must equal the number of rows of df1
df1['column_6'] = np.random.rand(10)
df1

In [None]:
# Label-based access to a single value: row label 6, column 'column_2'
df1.at[6, 'column_2']

In [None]:
# Row labels of df1
df1.index

In [None]:
# Both axes of df1: the row index followed by the column index
df1.axes

In [None]:
# Label-based selection: rows labelled 1, 7 and 10, all columns
df1.loc[[1,7,10], :]

In [None]:
# Position-based selection (0-based): rows at positions 0, 6, 9 and columns at positions 0, 2
df1.iloc[[0,6,9],[0,2]]

In [None]:
# Boolean-mask selection: rows whose label is > 3 and <= 7, every column except 'column_3'
df1.loc[(df1.index > 3) & (df1.index <= 7), df1.columns != 'column_3']

In [None]:
# Number of axes (dimensions) of df1
df1.ndim

In [None]:
# Total number of elements of df1 (rows x columns)
df1.size

## DataFrame Methods

We use the DataFrame df1 defined above.

### Exploration Methods

In [None]:
# Last 2 rows of df1
df1.tail(2)

In [None]:
# Print a concise summary of df1 (index, columns, non-null counts, dtypes, memory usage)
df1.info()

In [None]:
# Boolean DataFrame with the same shape as df1: True where the entry is missing (NaN)
df1.isna()

In [None]:
# Frequency of each distinct row of df1
# (df1 holds random floats, so presumably every row is distinct and counted once)
df1.value_counts()

### Statistical Analysis (Basic) and Operations

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of each numeric column
df1.describe()

In [None]:
# Mean of each column (no axis given, so the default axis is used)
df1.mean()

In [None]:
# Mean of each row (axis=1: the average is taken across the columns)
df1.mean(axis=1)

In [None]:
# Pairwise covariance matrix of the columns of df1
df1.cov()

In [None]:
# 3 rows of df1 drawn at random; the fixed random_state makes the draw reproducible
df1.sample(3, random_state=10)

In [None]:
# Real copy: a new DataFrame object, so changes made to df1_copy do not affect df1
df1_copy = df1.copy()
# NOT a copy: just a second name bound to the very same DataFrame object as df1,
# so any change made through df1_fakecopy is visible through df1 as well
df1_fakecopy = df1

In [54]:
# Insert a missing value (NaN) in the copy: row label 1, column 'column_1'
df1_copy.at[1, 'column_1'] = np.nan

In [55]:
# Show the copy, now containing the NaN inserted above
df1_copy

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,,0.140012,0.531234,0.165353,0.088709,0.683145
2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


(Note that the modification made to df1_fakecopy also modified df1, because both names refer to the same DataFrame object!)

In [59]:
# Drop every row containing at least one NaN; the result is a new DataFrame
# (it is not assigned here, so df1_copy itself is left unchanged)
df1_copy.dropna()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


In [62]:
# Move the current row labels into a regular column named 'index' and use a fresh
# 0-based integer index; the result is a new DataFrame (df1_copy is left unchanged)
df1_copy.reset_index()

Unnamed: 0,index,column_1,column_2,column_3,column_4,column_5,column_6
0,1,,0.140012,0.531234,0.165353,0.088709,0.683145
1,2,0.609878,0.416714,0.699228,0.625884,0.812659,0.767364
2,3,0.042694,0.232033,0.926032,0.468576,0.863292,0.280647
3,4,0.721244,0.31277,0.048949,0.326429,0.889583,0.275205
4,5,0.899971,0.320781,0.534496,0.052321,0.003123,0.933706
5,6,0.061559,0.665399,0.216179,0.599082,0.187164,0.680909
6,7,0.546871,0.754177,0.600196,0.72023,0.241982,0.607568
7,8,0.484939,0.824883,0.622044,0.834157,0.516021,0.667472
8,9,0.519302,0.346196,0.213139,0.083989,0.850591,0.630632
9,10,0.959985,0.506661,0.44268,0.080178,0.892358,0.210362


### Exportation Methods

In [64]:
# Export df1 to 'df1.csv' (path relative to the current working directory):
# only the three listed columns are written, in the listed order, and the
# column holding the row labels gets the header 'ID'
df1.to_csv('df1.csv', columns=['column_1', 'column_5', 'column_2'], index_label='ID')

## Loading a DataFrame