## 2. Pandas

In [1]:
%pip install pandas



In [2]:
import pandas as pd

In [14]:
# Series
data = ['a','b','c','d','e']

se = pd.Series(data)
type(se)
se.index
se.values
se[0:3]

se.name = "alphabet"
se.index.name = 'index'
print(se)

index
0    a
1    b
2    c
3    d
4    e
Name: alphabet, dtype: object


In [34]:
# DF (DataFrame = Excel)
data = {
    "country": ["kor","usa","china","japan"],
    "rank": [1,2,3,4],
    "grade": ["A","B","C","D"]
}

df = pd.DataFrame(data) # csv to json, json to csv, xlsx

df

Unnamed: 0,country,rank,grade
0,kor,1,A
1,usa,2,B
2,china,3,C
3,japan,4,D


In [30]:
# 데이터 셀렉션 > 데이터를 불러오는 방법
# (1) df.칼럼, df["칼럼"]

df.grade

df[['country', 'rank', 'grade']]

Unnamed: 0,country,rank,grade
0,kor,1,A
1,usa,2,B
2,china,3,C
3,japan,4,D


In [37]:
# (2) df.loc[인덱스값, 컬럼명] // df.iloc[] > location기반 항상 숫자값을 필요로 함
df.loc[:, ['grade', 'country']]

# df[df["rank"] > 2]['grade','country']]
df.loc[df['rank'] > 2, ['grade','country']]

Unnamed: 0,grade,country
2,C,china
3,D,japan


In [40]:
# 삭제하는 방법 (drop)
# (1) 단순하게 행 데이터를 삭제하는 방법
df.drop([3], inplace=True)  # inplace > 지우는 것을 허락

KeyError: '[3] not found in axis'

In [43]:
# (2) 컬럼을 삭제
df = df.drop('rank', axis=1)
df

KeyError: "['rank'] not found in axis"

In [45]:
df['rank'] = [1,2,3]
df

Unnamed: 0,country,grade,rank
0,kor,A,1
1,usa,B,2
2,china,C,3


In [49]:
# 기술 통계
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   country  3 non-null      object
 1   grade    3 non-null      object
 2   rank     3 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes


Unnamed: 0,rank
count,3.0
mean,2.0
std,1.0
min,1.0
25%,1.5
50%,2.0
75%,2.5
max,3.0


In [55]:
# 데이터 정렬 > sort_index or sort_values()
df['new_rank'] = [10,100,30]

df.sort_values(by='new_rank', ascending=False)  # 내림차순

Unnamed: 0,country,grade,rank,new_rank
1,usa,B,2,100
2,china,C,3,30
0,kor,A,1,10


In [64]:
# null 데이터 처리(NaN) 처리
# isnull() > 데이터의 null 여부
# fillna() > null 데이터를 채우기
# dropna() > null 데이터를 지우기
import numpy as np
df['grade'] = np.nan

# df.isnull().sum()
df.fillna(value='pass', inplace=True)
df.loc[1, 'grade'] = np.nan

In [66]:
df.dropna(how='any')

Unnamed: 0,country,grade,rank,new_rank
0,kor,pass,1,10
2,china,pass,3,30


## 3. 영화 데이터 분석 [데이터 링크](https://visioneer.notion.site/1-3b35a57015d6417d8cd6476917436242)

In [67]:
import pandas as pd

json_data = {
  "columns": ["Movie", "Release Year", "Audience", "Rating"],
  "index": [0, 1, 2, 3, 4, 5, 6, 7],
  "data": [
    ["Avengers", 2012, 1500, 8.8],
    ["Interstellar", 2014, 1100, 9.1],
    ["Frozen", 2013, 1020, 8.5],
    ["About Time", 2013, 950, 8.7],
    ["The Dark Knight", 2008, 1300, 9.0],
    ["Inception", 2010, 1200, 8.8],
    ["La La Land", 2016, 800, 8.6],
    ["Toy Story", 2010, 980, 8.5]
  ]
}

# JSON 데이터를 DataFrame으로 변환
df = pd.DataFrame(json_data['data'], columns=json_data['columns'])
df

Unnamed: 0,Movie,Release Year,Audience,Rating
0,Avengers,2012,1500,8.8
1,Interstellar,2014,1100,9.1
2,Frozen,2013,1020,8.5
3,About Time,2013,950,8.7
4,The Dark Knight,2008,1300,9.0
5,Inception,2010,1200,8.8
6,La La Land,2016,800,8.6
7,Toy Story,2010,980,8.5


In [78]:
# 1) 전체 데이터 중 movie 컬럼만 호출
df.Movie

# 2) 전체 데이터 중 movie, rating을 출력
df[['Movie','Rating']]

# 3) 2013년 이후에 개봉한 영화 중 movie, rating 출력
df[df['Release Year'] > 2013][["Movie","Rating"]]
df.loc[df['Release Year'] > 2013, ["Movie","Rating"]]

Unnamed: 0,Movie,Rating
1,Interstellar,9.1
6,La La Land,8.6


In [85]:
# 4) 주어진 계산식을 참고하여 recommend Column을 추가하세요
# recommend = (Audience) * rating) // 100

df['Recoomend'] = (df['Audience'] * df['Rating']) // 100
df

# 5) 전체 데이터를 recommend 기준으로 큰 값 호출
df.sort_values(by='Recoomend', ascending=False).head(5)

Unnamed: 0,Movie,Release Year,Audience,Rating,Recoomend
0,Avengers,2012,1500,8.8,132.0
4,The Dark Knight,2008,1300,9.0,117.0
5,Inception,2010,1200,8.8,105.0
1,Interstellar,2014,1100,9.1,100.0
2,Frozen,2013,1020,8.5,86.0
