# Youtube 頻道排行資料整理
資料網址：https://raw.githubusercontent.com/Code-Gym/python-dataset/master/youtube-channels-data-from-socialblade.csv

In [1]:
import pandas as pd

df= pd.read_csv('https://raw.githubusercontent.com/Code-Gym/python-dataset/master/youtube-channels-data-from-socialblade.csv')
df

Unnamed: 0,Rank,Grade,Channel name,Video Uploads,Subscribers,Video views
0,1st,A++,Zee TV,82757,18752951,20869786591
1,2nd,A++,T-Series,12661,61196302,47548839843
2,3rd,A++,Cocomelon - Nursery Rhymes,373,19238251,9793305082
3,4th,A++,SET India,27323,31180559,22675948293
4,5th,A++,WWE,36756,32852346,26273668433
...,...,...,...,...,...,...
4995,"4,996th",B+,Uras Benlioğlu,706,2072942,441202795
4996,"4,997th",B+,HI-TECH MUSIC LTD,797,1055091,377331722
4997,"4,998th",B+,Mastersaint,110,3265735,311758426
4998,"4,999th",B+,Bruce McIntosh,3475,32990,14563764


### 確認欄位的資料型態

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Rank           5000 non-null   object
 1   Grade          5000 non-null   object
 2   Channel name   5000 non-null   object
 3   Video Uploads  5000 non-null   object
 4   Subscribers    5000 non-null   object
 5   Video views    5000 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 234.5+ KB


### 將欄位Rank轉為數字型態
* 移除掉字串後面兩個字元
* 移除掉千分位分隔符號
* 將欄位Rank轉為int型態

In [3]:
# 因為是變series物件，所以要是接.str的方法

In [4]:
df['Rank']= df['Rank'].str[:-2].str.replace(',','').astype('int')

In [5]:
df

Unnamed: 0,Rank,Grade,Channel name,Video Uploads,Subscribers,Video views
0,1,A++,Zee TV,82757,18752951,20869786591
1,2,A++,T-Series,12661,61196302,47548839843
2,3,A++,Cocomelon - Nursery Rhymes,373,19238251,9793305082
3,4,A++,SET India,27323,31180559,22675948293
4,5,A++,WWE,36756,32852346,26273668433
...,...,...,...,...,...,...
4995,4996,B+,Uras Benlioğlu,706,2072942,441202795
4996,4997,B+,HI-TECH MUSIC LTD,797,1055091,377331722
4997,4998,B+,Mastersaint,110,3265735,311758426
4998,4999,B+,Bruce McIntosh,3475,32990,14563764


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Rank           5000 non-null   int32 
 1   Grade          5000 non-null   object
 2   Channel name   5000 non-null   object
 3   Video Uploads  5000 non-null   object
 4   Subscribers    5000 non-null   object
 5   Video views    5000 non-null   int64 
dtypes: int32(1), int64(1), object(4)
memory usage: 215.0+ KB


### 將欄位Subscribers轉為int型態
若是欄位資訊包含非數字的字串，轉int型態時會發生錯誤

#### 找出包含連字號(Hyphen)的索引值

In [7]:
x=df[df['Subscribers'].str.contains('--')].index # 找出這些特定列的index位置
print(x)

Int64Index([  17,  108,  115,  142,  143,  152,  156,  175,  180,  189,
            ...
            4892, 4893, 4895, 4912, 4936, 4941, 4948, 4956, 4961, 4990],
           dtype='int64', length=387)


#### 使用函式drop刪除有連字號的資料
官網文件：https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html

In [8]:
df.drop(x,axis=0,inplace=True)

#### 將欄位Subscribers轉為int型態

In [9]:
df['Subscribers']=df['Subscribers'].astype('int') # 因為不會直接轉換所以要回傳給自己

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4613 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Rank           4613 non-null   int32 
 1   Grade          4613 non-null   object
 2   Channel name   4613 non-null   object
 3   Video Uploads  4613 non-null   object
 4   Subscribers    4613 non-null   int32 
 5   Video views    4613 non-null   int64 
dtypes: int32(2), int64(1), object(3)
memory usage: 216.2+ KB


### 尋找Grade欄位有幾種等級
函式unique ( ) <br>
A++, A+, A, A-, B+

In [11]:
df['Grade'].nunique() #找出個數

6

In [13]:
df['Grade'].unique() #找出有哪些不同的值

array(['A++ ', 'A+ ', 'A ', 'A- ', 'B+ ', '\xa0 '], dtype=object)

### 將欄位Grade等級，轉換為數字
* 建立對應等級和數字的字典變數
* 使用函式map轉換資料

In [14]:
grade_map={'A++ ':5, 'A+ ':4 , 'A ':3 , 'A- ':2 , 'B+ ':1} # 先建立map的字典

In [15]:
df['Grade']=df['Grade'].map(grade_map) # 再用.map()來取代進目標欄位

In [16]:
df

Unnamed: 0,Rank,Grade,Channel name,Video Uploads,Subscribers,Video views
0,1,5.0,Zee TV,82757,18752951,20869786591
1,2,5.0,T-Series,12661,61196302,47548839843
2,3,5.0,Cocomelon - Nursery Rhymes,373,19238251,9793305082
3,4,5.0,SET India,27323,31180559,22675948293
4,5,5.0,WWE,36756,32852346,26273668433
...,...,...,...,...,...,...
4995,4996,1.0,Uras Benlioğlu,706,2072942,441202795
4996,4997,1.0,HI-TECH MUSIC LTD,797,1055091,377331722
4997,4998,1.0,Mastersaint,110,3265735,311758426
4998,4999,1.0,Bruce McIntosh,3475,32990,14563764
