## 파일 입출력
- read_csv() : https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
- 그 외 파일 형식 불러오기 자료 : https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html

In [1]:
import pandas as pd

In [10]:
# Load the three competition files in order: training data, test data,
# and the sample submission.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [11]:
# Preview the first rows of the training set (head() returns 5 rows by default).
train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [9]:
# Print the dimensions of the train dataset as (rows, columns).
train.shape # number of rows: 165034, number of columns: 14

(165034, 14)

In [13]:
# Keep only the 'id' and 'Age' columns of the training data.
result = train[['id', 'Age']]

# Export to a CSV file; with index=False the row index is not written as a column.
result.to_csv('result.csv', index=False)

## 데이터 불러오기 

In [42]:
# When a dataset is provided as a link, pass the URL to read_csv() to load it directly.
titanic = pd.read_csv("https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv")
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## textual data(문자열 데이터) 다루기
- link : https://pandas.pydata.org/docs/getting_started/intro_tutorials/10_text_data.html

In [29]:
# Convert the values in the Name column to lowercase.
titanic['Name'].str.lower()

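0                                braund, mr. owen harris
1      cumings, mrs. john bradley (florence briggs th...
2                                 heikkinen, miss. laina
3           futrelle, mrs. jacques heath (lily may peel)
4                               allen, mr. william henry
                             ...                        
886                                montvila, rev. juozas
887                         graham, miss. margaret edith
888             johnston, miss. catherine helen "carrie"
889                                behr, mr. karl howell
890                                  dooley, mr. patrick
Name: Name, Length: 891, dtype: object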

In [30]:
# Convert the values in the Sex column to uppercase.
titanic['Sex'].str.upper()

0        MALE
1      FEMALE
2      FEMALE
3      FEMALE
4        MALE
        ...  
886      MALE
887    FEMALE
888    FEMALE
889      MALE
890      MALE
Name: Sex, Length: 891, dtype: object

In [18]:
# Split each Name on the comma; every element becomes a list of the pieces.
titanic["Name"].str.split(",")

0                             [Braund,  Mr. Owen Harris]
1      [Cumings,  Mrs. John Bradley (Florence Briggs ...
2                              [Heikkinen,  Miss. Laina]
3        [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]
4                            [Allen,  Mr. William Henry]
                             ...                        
886                             [Montvila,  Rev. Juozas]
887                      [Graham,  Miss. Margaret Edith]
888          [Johnston,  Miss. Catherine Helen "Carrie"]
889                             [Behr,  Mr. Karl Howell]
890                               [Dooley,  Mr. Patrick]
Name: Name, Length: 891, dtype: object

In [20]:
# Create a new Surname column holding the family name, i.e. the part of
# Name that precedes the first comma.
titanic['Surname'] = titanic['Name'].str.split(',').str[0]
titanic['Surname']

0         Braund
1        Cumings
2      Heikkinen
3       Futrelle
4          Allen
         ...    
886     Montvila
887       Graham
888     Johnston
889         Behr
890       Dooley
Name: Surname, Length: 891, dtype: object

In [31]:
# Check, as a boolean per row, whether the Name column contains 'Countess'.
titanic['Name'].str.contains('Countess')

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Name, Length: 891, dtype: bool

In [33]:
# Extract the rows whose Name contains 'Countess' by using the boolean
# mask as a row selector.
titanic.loc[titanic['Name'].str.contains('Countess')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
759,760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dye...",female,33.0,0,0,110152,86.5,B77,S


In [23]:
# Get the length of each value in the Name column
# using the Series.str.len() method.
titanic['Name'].str.len()

0      23
1      51
2      22
3      44
4      24
       ..
886    21
887    28
888    40
889    21
890    19
Name: Name, Length: 891, dtype: int64

In [35]:
# Confirm that selecting a single column yields a pandas Series.
type(titanic['Name'])

pandas.core.series.Series

In [37]:
# Get the index label of the longest value in the Name column.
titanic['Name'].str.len().idxmax()

307

In [38]:
# Print the longest name in the Name column: locate the row label of the
# maximum string length, then read that single cell.
titanic.at[titanic['Name'].str.len().idxmax(), 'Name']

'Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)'

In [41]:
# Show the abbreviated Sex column (male -> M, female -> F).
# The notebook only creates 'Sex_short' in the Series.replace() cell further
# down, so on a fresh top-to-bottom run this lookup raised KeyError.
# Build the column here when it does not exist yet.
if 'Sex_short' not in titanic.columns:
    titanic['Sex_short'] = titanic['Sex'].replace({'male': 'M', 'female': 'F'})
titanic['Sex_short']

0      M
1      F
2      F
3      F
4      M
      ..
886    M
887    F
888    F
889    M
890    M
Name: Sex_short, Length: 891, dtype: object

In [39]:
# Replace the values in the Sex column: male -> M, female -> F
# using Series.replace() with a mapping of old value -> new value.
titanic['Sex_short'] = titanic['Sex'].replace({'male' : 'M', 'female' : 'F'})
titanic['Sex_short']

0      M
1      F
2      F
3      F
4      M
      ..
886    M
887    F
888    F
889    M
890    M
Name: Sex_short, Length: 891, dtype: object

In [45]:
# Same abbreviation done with Series.str.replace(), which substitutes
# substrings. 'female' must be handled before 'male': 'male' is a substring
# of 'female', so the reverse order would turn 'female' into 'feM'.
titanic['Sex_short'] = (
    titanic['Sex']
    .str.replace('female', 'F')
    .str.replace('male', 'M')
)
titanic['Sex_short']

0      M
1      F
2      F
3      F
4      M
      ..
886    M
887    F
888    F
889    M
890    M
Name: Sex_short, Length: 891, dtype: object