# Latihan 3 | Data Reduction

In [1]:
import pandas as pd
from sklearn.decomposition import PCA

# Read the input table; the whole frame is handed to PCA unchanged.
data = pd.read_csv('./data/d_data.csv')

# Build the 2-component estimator, then project the rows onto it.
# Note: `pca` holds the transformed array, not the estimator object.
reducer = PCA(n_components=2)
pca = reducer.fit_transform(data)
print(pca)

[[ -7.84170828 -12.59183041]
 [ -9.2638174   -1.06657762]
 [-20.96602518  -3.92729715]
 [-12.43201934 -25.18583558]
 [ 16.01355999  16.63692306]
 [ -5.13191019   2.38929446]
 [  9.33623593  -5.27897273]
 [  6.58085739 -16.319062  ]
 [ 12.23414033   0.30674688]
 [ -7.51111002   6.77052294]
 [ 36.217585    -2.34725768]
 [ 14.32048033  -9.2296796 ]
 [ -5.13301929  18.98084587]
 [ 17.49829341   5.33277962]
 [-13.78517421  20.25751047]
 [ -1.51513576  25.29789293]
 [-11.83756983   1.57183023]
 [-21.19439155  -3.83927071]
 [  4.41072868 -17.75856299]]


# Latihan 4 | Data Transformation

## Latihan 4.1 | Aggregate

### Latihan 4.1.1 | Basic One-Variable Grouped Aggregation

In [3]:
# Load the gapminder data
import pandas as pd

gapminder_path = './data/gapminder.tsv'
df = pd.read_csv(gapminder_path, sep='\t')

# split the rows by year, then average life expectancy inside each group
by_year = df.groupby('year')
avg_life_exp_by_year = by_year.lifeExp.mean()
print(avg_life_exp_by_year)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


In [4]:
# bracket notation picks the lifeExp column out of the grouped frame before averaging
grouped_by_year = df.groupby('year')
avg_life_exp_by_year = grouped_by_year['lifeExp'].mean()

In [5]:
# collect the distinct values of the year column
years = df['year'].unique()
print(years)

[1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007]


In [6]:
# keep only the rows recorded for 1952 (all columns)
is_1952 = df['year'] == 1952
y1952 = df.loc[is_1952, :]
print(y1952.head())

        country continent  year  lifeExp       pop    gdpPercap
0   Afghanistan      Asia  1952   28.801   8425333   779.445314
12      Albania    Europe  1952   55.230   1282697  1601.056136
24      Algeria    Africa  1952   43.077   9279525  2449.008185
36       Angola    Africa  1952   30.015   4232095  3520.610273
48    Argentina  Americas  1952   62.485  17876956  5911.315053


In [7]:
# mean life expectancy over the 1952 subset only
y1952_mean = y1952['lifeExp'].mean()
print(y1952_mean)

49.05761971830987


### Latihan 4.1.2 | Built-in Aggregation Methods

In [8]:
# summary statistics of life expectancy, computed separately per continent
by_continent = df.groupby('continent')
continent_describe = by_continent['lifeExp'].describe()
print(continent_describe)

           count       mean        std     min       25%      50%       75%  \
continent                                                                     
Africa     624.0  48.865330   9.150210  23.599  42.37250  47.7920  54.41150   
Americas   300.0  64.658737   9.345088  37.579  58.41000  67.0480  71.69950   
Asia       396.0  60.064903  11.864532  28.801  51.42625  61.7915  69.50525   
Europe     360.0  71.903686   5.433178  43.585  69.57000  72.2410  75.45050   
Oceania     24.0  74.326208   3.795611  69.120  71.20500  73.6650  77.55250   

              max  
continent          
Africa     76.442  
Americas   80.653  
Asia       82.603  
Europe     81.757  
Oceania    81.235  


### Latihan 4.1.3 | Aggregation Functions

In [19]:
# dict-style aggregation: run `describe` on lifeExp within each continent;
# as_index=False keeps `continent` as a regular column instead of the index
(
    df
    .groupby('continent', as_index=False)
    .agg({"lifeExp": "describe"})
)

Unnamed: 0_level_0,continent,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
0,Africa,624.0,48.86533,9.15021,23.599,42.3725,47.792,54.4115,76.442
1,Americas,300.0,64.658737,9.345088,37.579,58.41,67.048,71.6995,80.653
2,Asia,396.0,60.064903,11.864532,28.801,51.42625,61.7915,69.50525,82.603
3,Europe,360.0,71.903686,5.433178,43.585,69.57,72.241,75.4505,81.757
4,Oceania,24.0,74.326208,3.795611,69.12,71.205,73.665,77.5525,81.235


## Latihan 4.2 | Normalization

In [9]:
# Load the gapminder data
import pandas as pd

# tab-separated gapminder table, reloaded for this section
gapminder_path = './data/gapminder.tsv'
df = pd.read_csv(gapminder_path, sep='\t')

def my_zscore(x, ddof=None):
    '''Calculates the z-score of provided data.

    'x' is a vector or series of values. It must provide ``mean()`` and
    ``std()`` methods and support element-wise arithmetic.

    'ddof' is the optional delta degrees of freedom forwarded to
    ``x.std()``. When left as None, ``x.std()`` is called with no
    arguments, so the container's own default applies (pandas uses
    ddof=1, NumPy arrays use ddof=0). Pass ddof=0 with a pandas Series to
    reproduce the population z-score that scipy.stats.zscore computes by
    default.

    Returns the element-wise result of ``(x - mean) / std``.
    '''
    # Only forward ddof when the caller asked for it, so existing calls
    # (including groupby(...).transform(my_zscore)) behave exactly as before.
    spread = x.std() if ddof is None else x.std(ddof=ddof)
    return (x - x.mean()) / spread

In [10]:
# z-score life expectancy within each year; transform returns one value per input row
life_exp_by_year = df.groupby('year')['lifeExp']
transform_z = life_exp_by_year.transform(my_zscore)

In [11]:
# (rows, columns) of the full data set
n_rows_cols = df.shape
print(n_rows_cols)

(1704, 6)


In [12]:
# length of the transformed series, for comparison with the row count of df
z_shape = transform_z.shape
print(z_shape)

(1704,)


In [13]:
# import the zscore function from scipy.stats
from scipy.stats import zscore

# z-score within each year group, using scipy's implementation
sp_z_grouped = df.groupby('year')['lifeExp'].transform(zscore)

# z-score over the whole column, ignoring the year grouping
sp_z_nongroup = zscore(df['lifeExp'])

In [14]:
# first five grouped z-scores produced by my_zscore
print(transform_z.iloc[:5])

0   -1.656854
1   -1.731249
2   -1.786543
3   -1.848157
4   -1.894173
Name: lifeExp, dtype: float64


In [15]:
# first five grouped z-scores produced by scipy's zscore
print(sp_z_grouped.iloc[:5])

0   -1.662719
1   -1.737377
2   -1.792867
3   -1.854699
4   -1.900878
Name: lifeExp, dtype: float64


In [16]:
# first five z-scores computed over the whole column (no grouping)
first_five = sp_z_nongroup[:5]
print(first_five)

[-2.37533395 -2.25677417 -2.1278375  -1.97117751 -1.81103275]
