## 문자열 처리

#### 문자열 포매팅

In [1]:
 var = 'flesh wound' 
 s = "It's just a {}!"

In [2]:
s.format(var)

"It's just a flesh wound!"

In [3]:
 s.format('scratch')

"It's just a scratch!"

In [6]:
s = """
It's just a {0},
It's just a {1}
""" 
print(s.format('scratch','flesh wound'))


It's just a scratch,
It's just a flesh wound



In [7]:
s = 'Hayden Planetarium Coordinates: {lat}, {lon}' 
print(s.format(lat='40.7815° N', lon='73.9733° W'))

Hayden Planetarium Coordinates: 40.7815° N, 73.9733° W


#### 숫자 데이터 포매팅

In [8]:
print('Some digits of pi: {}'.format(3.14159265359))

Some digits of pi: 3.14159265359


In [9]:
print("In 2005, Lu Chao of China recited {:,} digits of pi".format(67890))

In 2005, Lu Chao of China recited 67,890 digits of pi


In [10]:
print("I remember {0:.4} or {0:.4%} of what Lu Chao recited".format(7/67890))

I remember 0.0001031 or 0.0103% of what Lu Chao recited


In [11]:
print("My ID number is {0}".format(42))
print("My ID number is {0:5}".format(42))
print("My ID number is {0:05}".format(42))

My ID number is 42
My ID number is    42
My ID number is 00042


#### %연산자로 포매팅

In [12]:
s = 'I only know %d digits of pi' % 7 
print(s)

I only know 7 digits of pi


In [13]:
print('Some digits of %(cont)s: %(value).2f' % {'cont': 'e', 'value': 2.718})

Some digits of e: 2.72


####  f-string 포매팅

In [14]:
var = 'flesh wound' 
s = f"It's just a {var}!" 
print(s)

It's just a flesh wound!


In [15]:
lat='40.7815°N' 
lon='73.9733°W' 
s = f'Hayden Planetarium Coordinates: {lat}, {lon}' 
print(s)

Hayden Planetarium Coordinates: 40.7815°N, 73.9733°W


## apply 메서드 활용

In [16]:
def my_sq(x):
    return x ** 2
 
def my_exp(x, n):
    return x ** n
 
print(my_sq(4))

16


In [17]:
print(my_exp(2,4))

16


In [18]:
import pandas as pd
 
df = pd.DataFrame({'a': [10, 20, 30], 'b': [20, 30, 40]}) 
 
print(df)

    a   b
0  10  20
1  20  30
2  30  40


In [19]:
print(df['a'] ** 2)

0    100
1    400
2    900
Name: a, dtype: int64


In [20]:
sq = df['a'].apply(my_sq) 
print(sq)

0    100
1    400
2    900
Name: a, dtype: int64


In [21]:
ex = df['a'].apply(my_exp, n=2) 
print(ex)

0    100
1    400
2    900
Name: a, dtype: int64


In [22]:
ex = df['a'].apply(my_exp, n=3) 
print(ex)

0     1000
1     8000
2    27000
Name: a, dtype: int64


In [23]:
df = pd.DataFrame({'a': [10, 20, 30], 'b': [20, 30, 40]}) 
print(df)

    a   b
0  10  20
1  20  30
2  30  40


In [24]:
def print_me(x): 
    print(x)

In [25]:
print(df.apply(print_me)) #열방향 적용

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64
a    None
b    None
dtype: object


In [26]:
print(df.apply(print_me,axis = 1)) #행방향 적용

a    10
b    20
Name: 0, dtype: int64
a    20
b    30
Name: 1, dtype: int64
a    30
b    40
Name: 2, dtype: int64
0    None
1    None
2    None
dtype: object


In [28]:
# apply에 적용할 평균을 구하는 함수 생성

def avg_3_apply(col): #열 방향으로 적용
    sum = 0
    for item in col:
         sum += item
    return sum /df.shape[0]
 
def avg_2_apply(row): #행 방향으로 적용
    sum = 0
    for item in row:
        sum += item
    return sum / df.shape[1]

In [29]:
df.apply(avg_3_apply) #열 방향으로 적용

a    20.0
b    30.0
dtype: float64

In [30]:
df.apply(avg_2_apply, axis=1)#행 방향으로 적용

0    15.0
1    25.0
2    35.0
dtype: float64

In [31]:
# 데이터프레임의 누락값을 처리한 다음 apply 사용하기

import seaborn as sns
 
titanic = sns.load_dataset("titanic")
 
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [32]:
import numpy as np
 
def count_missing(vec):
    null_vec = pd.isnull(vec)
    null_count = np.sum(null_vec)
    return null_count

In [33]:
# 누락값 비율을 계산하는 함수

def prop_missing(vec):
    num = count_missing(vec)
    dem = vec.size

In [34]:
# prop_missing 함수를 이용하여 누락값이 아닌 데이터 비율 구하는 함수

def prop_complete(vec):
    return 1 - prop_missing(vec)

In [35]:
cmis_col = titanic.apply(count_missing)
print(cmis_col)

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [36]:
pmis_col = titanic.apply(prop_missing)
print(pmis_col)

survived       None
pclass         None
sex            None
age            None
sibsp          None
parch          None
fare           None
embarked       None
class          None
who            None
adult_male     None
deck           None
embark_town    None
alive          None
alone          None
dtype: object


In [41]:
pcom = titanic.apply(prop_complete)
print(pcom)

TypeError: unsupported operand type(s) for -: 'int' and 'NoneType'

In [38]:
cmis_row = titanic.apply(count_missing, axis=1)
pmis_row = titanic.apply(prop_missing, axis=1)
pcom_row = titanic.apply(prop_complete, axis=1)

TypeError: unsupported operand type(s) for -: 'int' and 'NoneType'

In [39]:
cmis_row.head()

0    1
1    0
2    1
3    0
4    1
dtype: int64

In [40]:
pmis_row.head()

0    None
1    None
2    None
3    None
4    None
dtype: object

In [42]:
pcom_row.head()

NameError: name 'pcom_row' is not defined

In [43]:
titanic['num_missing'] = titanic.apply(count_missing, axis=1)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,num_missing
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,1
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,1


In [44]:
titanic.loc[titanic.num_missing > 1. :].sample(10)

AssertionError: Start slice bound is non-scalar

## 그룹연산

In [45]:
import pandas as pd 
df = pd.read_csv('data2/gapminder.tsv', sep='\t')
avg_life_exp_by_year = df.groupby('year').lifeExp.mean() 
avg_life_exp_by_year

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [47]:
 def my_mean(values):
    n = len(values) 
    sum = 0 
    for value in values:
        sum += value 
    return sum / n 


In [48]:
agg_my_mean =df.groupby('year').lifeExp.agg(my_mean)
agg_my_mean

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [49]:
def my_mean_diff(values, diff_value): #열, 전체평균수명
    n = len(values) 
    sum = 0 
    for value in values:
        sum += value 
    mean = sum / n 
    return mean - diff_value

In [50]:
global_mean = df.lifeExp.mean()
global_mean

59.474439366197174

In [51]:
agg_mean_diff = df.groupby('year').lifeExp.agg(my_mean_diff, diff_value = global_mean)
agg_mean_diff # 연도별 평균수명과 전체 평균수명의 차이값

year
1952   -10.416820
1957    -7.967038
1962    -5.865190
1967    -3.796150
1972    -1.827053
1977     0.095718
1982     2.058758
1987     3.738173
1992     4.685899
1997     5.540237
2002     6.220483
2007     7.532983
Name: lifeExp, dtype: float64

In [52]:
import numpy as np
gdf = df.groupby('year').lifeExp.agg([np.count_nonzero, np.mean, np.std])
gdf

Unnamed: 0_level_0,count_nonzero,mean,std
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,142.0,49.05762,12.225956
1957,142.0,51.507401,12.231286
1962,142.0,53.609249,12.097245
1967,142.0,55.67829,11.718858
1972,142.0,57.647386,11.381953
1977,142.0,59.570157,11.227229
1982,142.0,61.533197,10.770618
1987,142.0,63.212613,10.556285
1992,142.0,64.160338,11.22738
1997,142.0,65.014676,11.559439


In [53]:
gdf_dict = df.groupby('year').agg({'lifeExp' : 'mean', 'pop':'median', 'gdpPercap':'median'})
gdf_dict

Unnamed: 0_level_0,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,49.05762,3943953.0,1968.528344
1957,51.507401,4282942.0,2173.220291
1962,53.609249,4686039.5,2335.439533
1967,55.67829,5170175.5,2678.33474
1972,57.647386,5877996.5,3339.129407
1977,59.570157,6404036.5,3798.609244
1982,61.533197,7007320.0,4216.228428
1987,63.212613,7774861.5,4280.300366
1992,64.160338,8688686.5,4386.085502
1997,65.014676,9735063.5,4781.825478


## 데이터 변환

In [54]:
def my_zscore(x): 
    return (x - x.mean()) / x.std()

In [55]:
transform_z = df.groupby('year').lifeExp.transform(my_zscore)
transform_z.head() # year 별 lifeExp 의 표준점수

0   -1.656854
1   -1.731249
2   -1.786543
3   -1.848157
4   -1.894173
Name: lifeExp, dtype: float64

In [56]:
df.shape #원본데이터

(1704, 6)

In [57]:
transform_z.shape #표준점수로 변환된 데이터

(1704,)

In [59]:
import seaborn as sns
import numpy as np

np.random.seed(42)
tips_10 = sns.load_dataset('tips').sample(10)
tips_10 = tips_10.rename({'sex':'gender'}, axis='columns')
tips_10

Unnamed: 0,total_bill,tip,gender,smoker,day,time,size
24,19.82,3.18,Male,No,Sat,Dinner,2
6,8.77,2.0,Male,No,Sun,Dinner,2
153,24.55,2.0,Male,No,Sun,Dinner,4
211,25.89,5.16,Male,Yes,Sat,Dinner,4
198,13.0,2.0,Female,Yes,Thur,Lunch,2
176,17.89,2.0,Male,Yes,Sun,Dinner,2
192,28.44,2.56,Male,Yes,Thur,Lunch,2
124,12.48,2.52,Female,No,Thur,Lunch,2
9,14.78,3.23,Male,No,Sun,Dinner,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


In [60]:
tips_10.loc[np.random.permutation(tips_10.index)[:4], 'total_bill'] = np.NaN
tips_10

Unnamed: 0,total_bill,tip,gender,smoker,day,time,size
24,19.82,3.18,Male,No,Sat,Dinner,2
6,8.77,2.0,Male,No,Sun,Dinner,2
153,,2.0,Male,No,Sun,Dinner,4
211,,5.16,Male,Yes,Sat,Dinner,4
198,,2.0,Female,Yes,Thur,Lunch,2
176,,2.0,Male,Yes,Sun,Dinner,2
192,28.44,2.56,Male,Yes,Thur,Lunch,2
124,12.48,2.52,Female,No,Thur,Lunch,2
9,14.78,3.23,Male,No,Sun,Dinner,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


In [61]:
def fill_na_mean(x): #x는 성별이 구분된 total_bill열 값이다.
    avg = x.mean() #성별이구분된 total_bill열의 평균값을 구한다.
    return x.fillna(avg) #x가 NaN값면 avg 채워준다.

In [62]:
total_bill_group_mean = tips_10.groupby('gender').total_bill.transform(fill_na_mean)
tips_10['fill_total_bill'] = total_bill_group_mean
tips_10

Unnamed: 0,total_bill,tip,gender,smoker,day,time,size,fill_total_bill
24,19.82,3.18,Male,No,Sat,Dinner,2,19.82
6,8.77,2.0,Male,No,Sun,Dinner,2,8.77
153,,2.0,Male,No,Sun,Dinner,4,17.9525
211,,5.16,Male,Yes,Sat,Dinner,4,17.9525
198,,2.0,Female,Yes,Thur,Lunch,2,13.93
176,,2.0,Male,Yes,Sun,Dinner,2,17.9525
192,28.44,2.56,Male,Yes,Thur,Lunch,2,28.44
124,12.48,2.52,Female,No,Thur,Lunch,2,12.48
9,14.78,3.23,Male,No,Sun,Dinner,2,14.78
101,15.38,3.0,Female,Yes,Fri,Dinner,2,15.38


## 데이터 필터링

In [63]:
tips = sns.load_dataset('tips')
 
print(tips.shape)

(244, 7)


In [64]:
tips['size'].value_counts() #인원수당 주문수

2    156
3     38
4     37
5      5
1      4
6      4
Name: size, dtype: int64

In [67]:
# \ 은 하나의 문장이 두줄로 나타낼때 사용하는 연결기호이다.
#size별로 구분하여 인원수당 count >= 30이상인 값추출 (인원수 1,5,6 명은 걸러진다.)
tips_filtered = tips.groupby('size').\
filter(lambda x: x['size'].count() >= 30)

tips_filtered.shape

(231, 7)

In [68]:
tips_filtered['size'].value_counts() #필터링된 결과

2    156
3     38
4     37
Name: size, dtype: int64

## 그룹 오브젝트

In [69]:
tips_10 = sns.load_dataset('tips').sample(10)
tips_10 = tips_10.rename({'sex':'gender'},axis='columns')
tips_10

Unnamed: 0,total_bill,tip,gender,smoker,day,time,size
243,18.78,3.0,Female,No,Thur,Dinner,2
58,11.24,1.76,Male,Yes,Sat,Dinner,2
227,20.45,3.0,Male,No,Sat,Dinner,4
137,14.15,2.0,Female,No,Thur,Lunch,2
173,31.85,3.18,Male,Yes,Sun,Dinner,2
77,27.2,4.0,Male,No,Thur,Lunch,4
192,28.44,2.56,Male,Yes,Thur,Lunch,2
213,13.27,2.5,Female,Yes,Sat,Dinner,2
10,10.27,1.71,Male,No,Sun,Dinner,2
231,15.69,3.0,Male,Yes,Sat,Dinner,3


In [71]:
grouped = tips_10.groupby('gender')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017F93899708>

In [72]:
grouped.groups

{'Male': [58, 227, 173, 77, 192, 10, 231], 'Female': [243, 137, 213]}

In [73]:
avgs = grouped.mean() 
print(avgs)

        total_bill       tip      size
gender                                
Male     20.734286  2.744286  2.714286
Female   15.400000  2.500000  2.000000


In [74]:
female = grouped.get_group('Female') 
print(female)

     total_bill  tip  gender smoker   day    time  size
243       18.78  3.0  Female     No  Thur  Dinner     2
137       14.15  2.0  Female     No  Thur   Lunch     2
213       13.27  2.5  Female    Yes   Sat  Dinner     2


In [75]:
for gender_group in grouped:
    print(gender_group)

('Male',      total_bill   tip gender smoker   day    time  size
58        11.24  1.76   Male    Yes   Sat  Dinner     2
227       20.45  3.00   Male     No   Sat  Dinner     4
173       31.85  3.18   Male    Yes   Sun  Dinner     2
77        27.20  4.00   Male     No  Thur   Lunch     4
192       28.44  2.56   Male    Yes  Thur   Lunch     2
10        10.27  1.71   Male     No   Sun  Dinner     2
231       15.69  3.00   Male    Yes   Sat  Dinner     3)
('Female',      total_bill  tip  gender smoker   day    time  size
243       18.78  3.0  Female     No  Thur  Dinner     2
137       14.15  2.0  Female     No  Thur   Lunch     2
213       13.27  2.5  Female    Yes   Sat  Dinner     2)


In [76]:
bill_gender_time = tips_10.groupby(['gender', 'time'])
group_avg = bill_gender_time.mean() 
 
print(group_avg)

               total_bill   tip  size
gender time                          
Male   Lunch       27.820  3.28   3.0
       Dinner      17.900  2.53   2.6
Female Lunch       14.150  2.00   2.0
       Dinner      16.025  2.75   2.0


In [77]:
print(type(group_avg))

<class 'pandas.core.frame.DataFrame'>


In [78]:
print(group_avg.columns)

Index(['total_bill', 'tip', 'size'], dtype='object')


In [79]:
print(group_avg.index)

MultiIndex([(  'Male',  'Lunch'),
            (  'Male', 'Dinner'),
            ('Female',  'Lunch'),
            ('Female', 'Dinner')],
           names=['gender', 'time'])


In [81]:
group_method = tips_10.groupby(['gender','time']).mean().reset_index()
group_method

Unnamed: 0,gender,time,total_bill,tip,size
0,Male,Lunch,27.82,3.28,3.0
1,Male,Dinner,17.9,2.53,2.6
2,Female,Lunch,14.15,2.0,2.0
3,Female,Dinner,16.025,2.75,2.0


In [82]:
group_param = tips_10.groupby(['gender','time'], as_index=False).mean()
group_param

Unnamed: 0,gender,time,total_bill,tip,size
0,Male,Lunch,27.82,3.28,3.0
1,Male,Dinner,17.9,2.53,2.6
2,Female,Lunch,14.15,2.0,2.0
3,Female,Dinner,16.025,2.75,2.0
