In [27]:
import pandas as pd
from google.cloud import bigquery
import os
from langdetect import detect, DetectorFactory
from google.cloud import storage
from dotenv import load_dotenv
from pathlib import Path
from io import StringIO

In [28]:
DetectorFactory.seed = 0
pd.set_option('display.max_columns', None)

In [29]:
parent_env_path = Path.cwd().parent / ".env"
load_dotenv(dotenv_path=parent_env_path)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
project_id = os.getenv("PROJECT_ID")

In [30]:
client = storage.Client(project=project_id)
bucket = client.bucket("ytbdata")


Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. 



In [31]:
#list hết các file trong bucket ytbdata/video_detailed_info
blobs = bucket.list_blobs(prefix=r"1_crawl/video_detailed_info/")

list_file = []
for blob in blobs:
    list_file.append(blob.name)
list_file

['1_crawl/video_detailed_info/video_info_1000.json',
 '1_crawl/video_detailed_info/video_info_2001.json',
 '1_crawl/video_detailed_info/video_info_7322.json']

In [32]:
#đọc dữ liệu từ các file và concatenate lại
df = pd.DataFrame()
for i in range(len(list_file)):
    blob = bucket.blob(list_file[i])
    data = blob.download_as_text()
    df1 = pd.read_json(StringIO(data))
    df = pd.concat([df, df1], ignore_index=True)

In [33]:
df

Unnamed: 0,kind,etag,id,snippet,contentDetails,statistics
0,youtube#video,XdTw0tvl3tzaOg9uxWTuOvR556Q,0MQEf_7qk4s,"{'publishedAt': '2024-12-03T17:01:38Z', 'chann...","{'duration': 'PT8S', 'dimension': '2d', 'defin...","{'viewCount': '253586', 'likeCount': '14833', ..."
1,youtube#video,3mvpUDM9S69VWLeHiMtbQQaPKoU,RZdYlS6zvhY,"{'publishedAt': '2024-12-26T04:53:04Z', 'chann...","{'duration': 'PT12S', 'dimension': '2d', 'defi...","{'viewCount': '35260', 'likeCount': '247', 'fa..."
2,youtube#video,Wu3tX7S65HWUyw28UBs71xJmOQM,GRlarIhf1Rw,"{'publishedAt': '2024-08-17T12:24:50Z', 'chann...","{'duration': 'PT18S', 'dimension': '2d', 'defi...","{'viewCount': '703645', 'likeCount': '31338', ..."
3,youtube#video,y6IlQx3b9ATzJ0voyIEPMAq64XA,MOCPCwGm3Dw,"{'publishedAt': '2025-03-20T13:00:25Z', 'chann...","{'duration': 'PT34S', 'dimension': '2d', 'defi...","{'viewCount': '19015', 'likeCount': '710', 'fa..."
4,youtube#video,ArKKKY639fShjmCAi5nbgaGUY5A,ObMnw8woAog,"{'publishedAt': '2023-05-18T16:08:40Z', 'chann...","{'duration': 'PT36S', 'dimension': '2d', 'defi...","{'viewCount': '1985707', 'likeCount': '122736'..."
...,...,...,...,...,...,...
7315,youtube#video,1dYrmgMdxwUZZjOy5Y7l5x_QAAs,PTJ8yizhoIY,"{'publishedAt': '2025-04-04T01:37:46Z', 'chann...","{'duration': 'PT13M3S', 'dimension': '2d', 'de...","{'viewCount': '8903', 'likeCount': '304', 'fav..."
7316,youtube#video,sN8mENCPzbSWIkKQp3pfvgNtHu8,npz4RqhMORA,"{'publishedAt': '2024-10-28T05:00:35Z', 'chann...","{'duration': 'PT3M22S', 'dimension': '2d', 'de...","{'viewCount': '774', 'likeCount': '5', 'favori..."
7317,youtube#video,tou2zgEBFsZ3wdcNJ0ecax2QWYw,CkNjj5nJbr8,"{'publishedAt': '2024-10-16T12:00:01Z', 'chann...","{'duration': 'PT17S', 'dimension': '2d', 'defi...","{'viewCount': '1292540', 'likeCount': '42376',..."
7318,youtube#video,O-lPFW7l_Z1wH5RPauWvulKMsLA,hV1JcavDT98,"{'publishedAt': '2024-01-11T13:49:31Z', 'chann...","{'duration': 'PT3M7S', 'dimension': '2d', 'def...","{'viewCount': '32', 'likeCount': '0', 'favorit..."


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7320 entries, 0 to 7319
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   kind            7320 non-null   object
 1   etag            7320 non-null   object
 2   id              7320 non-null   object
 3   snippet         7320 non-null   object
 4   contentDetails  7320 non-null   object
 5   statistics      7320 non-null   object
dtypes: object(6)
memory usage: 343.3+ KB


In [35]:
df.head()

Unnamed: 0,kind,etag,id,snippet,contentDetails,statistics
0,youtube#video,XdTw0tvl3tzaOg9uxWTuOvR556Q,0MQEf_7qk4s,"{'publishedAt': '2024-12-03T17:01:38Z', 'chann...","{'duration': 'PT8S', 'dimension': '2d', 'defin...","{'viewCount': '253586', 'likeCount': '14833', ..."
1,youtube#video,3mvpUDM9S69VWLeHiMtbQQaPKoU,RZdYlS6zvhY,"{'publishedAt': '2024-12-26T04:53:04Z', 'chann...","{'duration': 'PT12S', 'dimension': '2d', 'defi...","{'viewCount': '35260', 'likeCount': '247', 'fa..."
2,youtube#video,Wu3tX7S65HWUyw28UBs71xJmOQM,GRlarIhf1Rw,"{'publishedAt': '2024-08-17T12:24:50Z', 'chann...","{'duration': 'PT18S', 'dimension': '2d', 'defi...","{'viewCount': '703645', 'likeCount': '31338', ..."
3,youtube#video,y6IlQx3b9ATzJ0voyIEPMAq64XA,MOCPCwGm3Dw,"{'publishedAt': '2025-03-20T13:00:25Z', 'chann...","{'duration': 'PT34S', 'dimension': '2d', 'defi...","{'viewCount': '19015', 'likeCount': '710', 'fa..."
4,youtube#video,ArKKKY639fShjmCAi5nbgaGUY5A,ObMnw8woAog,"{'publishedAt': '2023-05-18T16:08:40Z', 'chann...","{'duration': 'PT36S', 'dimension': '2d', 'defi...","{'viewCount': '1985707', 'likeCount': '122736'..."


In [36]:
df['kind'].value_counts()

kind
youtube#video    7320
Name: count, dtype: int64

In [37]:
def split_json_column(df, column):
    # Convert the JSON string to a dictionary
    df[column] = df[column].apply(lambda x: eval(x) if isinstance(x, str) else x)
    
    # Normalize the JSON column into separate columns
    json_df = pd.json_normalize(df[column])
    
    # Concatenate the new columns with the original DataFrame
    df = pd.concat([df.drop(columns=[column]), json_df], axis=1)
    
    return df

In [38]:
df = split_json_column(df, 'snippet')
df = split_json_column(df, 'contentDetails')
df = split_json_column(df, 'statistics')

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7320 entries, 0 to 7319
Data columns (total 43 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   kind                        7320 non-null   object 
 1   etag                        7320 non-null   object 
 2   id                          7320 non-null   object 
 3   publishedAt                 7320 non-null   object 
 4   channelId                   7320 non-null   object 
 5   title                       7320 non-null   object 
 6   description                 7320 non-null   object 
 7   channelTitle                7320 non-null   object 
 8   categoryId                  7320 non-null   object 
 9   liveBroadcastContent        7320 non-null   object 
 10  thumbnails.default.url      7320 non-null   object 
 11  thumbnails.default.width    7320 non-null   int64  
 12  thumbnails.default.height   7320 non-null   int64  
 13  thumbnails.medium.url       7320 

In [40]:
df.head(5)

Unnamed: 0,kind,etag,id,publishedAt,channelId,title,description,channelTitle,categoryId,liveBroadcastContent,thumbnails.default.url,thumbnails.default.width,thumbnails.default.height,thumbnails.medium.url,thumbnails.medium.width,thumbnails.medium.height,thumbnails.high.url,thumbnails.high.width,thumbnails.high.height,thumbnails.standard.url,thumbnails.standard.width,thumbnails.standard.height,thumbnails.maxres.url,thumbnails.maxres.width,thumbnails.maxres.height,localized.title,localized.description,tags,defaultLanguage,defaultAudioLanguage,duration,dimension,definition,caption,licensedContent,projection,regionRestriction.blocked,regionRestriction.allowed,contentRating.ytRating,viewCount,likeCount,favoriteCount,commentCount
0,youtube#video,XdTw0tvl3tzaOg9uxWTuOvR556Q,0MQEf_7qk4s,2024-12-03T17:01:38Z,UC0xHrJNbxHmFnUP-49WYM-Q,120 mind blowing AI tools #productivity #aitoo...,,SetupsAI,22,none,https://i.ytimg.com/vi/0MQEf_7qk4s/default.jpg,120,90,https://i.ytimg.com/vi/0MQEf_7qk4s/mqdefault.jpg,320,180,https://i.ytimg.com/vi/0MQEf_7qk4s/hqdefault.jpg,480,360,https://i.ytimg.com/vi/0MQEf_7qk4s/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/0MQEf_7qk4s/maxresdefau...,1280.0,720.0,120 mind blowing AI tools #productivity #aitoo...,,,,,PT8S,2d,hd,False,True,rectangular,,,,253586,14833,0,81
1,youtube#video,3mvpUDM9S69VWLeHiMtbQQaPKoU,RZdYlS6zvhY,2024-12-26T04:53:04Z,UC8w4I8t2OpqoOpzzNT1c2dg,PAID vs FREE AI Tools - Best Free AI Tools,PAID vs FREE AI Tools - Best Free AI Tools\nAr...,WebbyFan,26,none,https://i.ytimg.com/vi/RZdYlS6zvhY/default.jpg,120,90,https://i.ytimg.com/vi/RZdYlS6zvhY/mqdefault.jpg,320,180,https://i.ytimg.com/vi/RZdYlS6zvhY/hqdefault.jpg,480,360,https://i.ytimg.com/vi/RZdYlS6zvhY/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/RZdYlS6zvhY/maxresdefau...,1280.0,720.0,PAID vs FREE AI Tools - Best Free AI Tools,PAID vs FREE AI Tools - Best Free AI Tools\nAr...,"[ai, ai tools, best ai tools, new ai tools, be...",en-IN,en,PT12S,2d,hd,False,True,rectangular,,,,35260,247,0,6
2,youtube#video,Wu3tX7S65HWUyw28UBs71xJmOQM,GRlarIhf1Rw,2024-08-17T12:24:50Z,UC0xHrJNbxHmFnUP-49WYM-Q,Top 10 designer AI tools #ai #productivity #de...,,SetupsAI,22,none,https://i.ytimg.com/vi/GRlarIhf1Rw/default.jpg,120,90,https://i.ytimg.com/vi/GRlarIhf1Rw/mqdefault.jpg,320,180,https://i.ytimg.com/vi/GRlarIhf1Rw/hqdefault.jpg,480,360,https://i.ytimg.com/vi/GRlarIhf1Rw/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/GRlarIhf1Rw/maxresdefau...,1280.0,720.0,Top 10 designer AI tools #ai #productivity #de...,,,,,PT18S,2d,hd,False,True,rectangular,,,,703645,31338,0,85
3,youtube#video,y6IlQx3b9ATzJ0voyIEPMAq64XA,MOCPCwGm3Dw,2025-03-20T13:00:25Z,UC7geKfz2-IH0rsgRBtHTm0g,3 Best AI tools for market research,Top 3 AI Market Research Tools: Automate Your ...,Learn With Shopify,27,none,https://i.ytimg.com/vi/MOCPCwGm3Dw/default.jpg,120,90,https://i.ytimg.com/vi/MOCPCwGm3Dw/mqdefault.jpg,320,180,https://i.ytimg.com/vi/MOCPCwGm3Dw/hqdefault.jpg,480,360,https://i.ytimg.com/vi/MOCPCwGm3Dw/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/MOCPCwGm3Dw/maxresdefau...,1280.0,720.0,3 Best AI tools for market research,Top 3 AI Market Research Tools: Automate Your ...,"[AI tools, market research tools, artificial i...",en-US,en-US,PT34S,2d,hd,False,False,rectangular,,,,19015,710,0,5
4,youtube#video,ArKKKY639fShjmCAi5nbgaGUY5A,ObMnw8woAog,2023-05-18T16:08:40Z,UCPkctgt1mTeJWTGj4tq4dPQ,Best 12 AI Tools in 2023,,LKLogic,27,none,https://i.ytimg.com/vi/ObMnw8woAog/default.jpg,120,90,https://i.ytimg.com/vi/ObMnw8woAog/mqdefault.jpg,320,180,https://i.ytimg.com/vi/ObMnw8woAog/hqdefault.jpg,480,360,https://i.ytimg.com/vi/ObMnw8woAog/sddefault.jpg,640.0,480.0,https://i.ytimg.com/vi/ObMnw8woAog/maxresdefau...,1280.0,720.0,Best 12 AI Tools in 2023,,,,,PT36S,2d,hd,False,True,rectangular,,,,1985707,122736,0,440


## Kiểm tra trùng lặp

In [41]:
#Kiểm tra duplicate
df['id'].duplicated().sum()



0

## Ý nghĩa các dòng, các cột

| Column Name                      | Description |
|----------------------------------|-------------|
| `kind`                           | Resource type; e.g., "youtube#video". |
| `etag`                           | Unique identifier for resource versioning, helps in caching. |
| `id`                             | Video ID (or other resource ID if not only videos). |
| `publishedAt`                    | Date and time when the video was published (ISO 8601 format). |
| `channelId`                      | ID of the channel that uploaded the video. |
| `title`                          | Title of the video. |
| `description`                    | Full description of the video. |
| `channelTitle`                   | Name of the channel. |
| `categoryId`                     | ID representing the video’s category (e.g., Music, Education). |
| `liveBroadcastContent`          | Whether the video is `live`, `none`, or `upcoming`. |
| `thumbnails.default.url`        | URL of the default (smallest) thumbnail. |
| `thumbnails.default.width`      | Width of the default thumbnail. |
| `thumbnails.default.height`     | Height of the default thumbnail. |
| `thumbnails.medium.url`         | URL of the medium-quality thumbnail. |
| `thumbnails.medium.width`       | Width of the medium thumbnail. |
| `thumbnails.medium.height`      | Height of the medium thumbnail. |
| `thumbnails.high.url`           | URL of the high-quality thumbnail. |
| `thumbnails.high.width`         | Width of the high thumbnail. |
| `thumbnails.high.height`        | Height of the high thumbnail. |
| `thumbnails.standard.url`       | URL of the standard-quality thumbnail (may be null). |
| `thumbnails.standard.width`     | Width of the standard thumbnail. |
| `thumbnails.standard.height`    | Height of the standard thumbnail. |
| `thumbnails.maxres.url`         | URL of the max resolution thumbnail. |
| `thumbnails.maxres.width`       | Width of the max resolution thumbnail. |
| `thumbnails.maxres.height`      | Height of the max resolution thumbnail. |
| `localized.title`               | Title localized based on the viewer's language setting. |
| `localized.description`         | Description localized based on the viewer's language setting. |
| `tags`                           | List of tags (keywords) associated with the video. |
| `defaultLanguage`               | Language used in the video metadata (ISO 639-1). |
| `defaultAudioLanguage`          | Language of the audio track in the video. |
| `duration`                      | Length of the video in ISO 8601 duration format (e.g., PT5M33S = 5m 33s). |
| `dimension`                     | Video dimension, e.g., `2d` or `3d`. |
| `definition`                    | Quality definition: `hd` (High) or `sd` (Standard). |
| `caption`                       | Whether the video has captions (`true`/`false`). |
| `licensedContent`              | Whether the video contains licensed third-party content. |
| `projection`                    | Projection type (e.g., `rectangular` or `360`). |
| `regionRestriction.blocked`     | List of country codes where the video is **blocked**. |
| `regionRestriction.allowed`     | List of country codes where the video is **allowed**. |
| `contentRating.ytRating`        | YouTube’s content rating (e.g., `ytAgeRestricted`). |
| `viewCount`                     | Number of times the video has been viewed. |
| `likeCount`                     | Number of likes the video has received. |
| `favoriteCount`                 | Number of times the video has been favorited (always 0 for YouTube). |
| `commentCount`                  | Number of comments on the video. |


## Loại bỏ cột dư thừa

In [42]:
drop_columns = [
    'etag', 'kind',
    'thumbnails.medium.url', 'thumbnails.medium.width', 'thumbnails.medium.height',
    'thumbnails.high.url', 'thumbnails.high.width', 'thumbnails.high.height',
    'thumbnails.standard.url', 'thumbnails.standard.width', 'thumbnails.standard.height', 'thumbnails.maxres.url',
    'thumbnails.maxres.width', 'thumbnails.maxres.height', 'thumbnails.default.width', 'thumbnails.default.height',
    'localized.title', 'localized.description','thumbnails.default.url' ,
    'projection', 'liveBroadcastContent',
    'favoriteCount', 
]
df.drop(columns=drop_columns, inplace=True, errors='ignore')

## Kiểm tra kiểu dữ liệu và fix

In [43]:
df.head(5)

Unnamed: 0,id,publishedAt,channelId,title,description,channelTitle,categoryId,tags,defaultLanguage,defaultAudioLanguage,duration,dimension,definition,caption,licensedContent,regionRestriction.blocked,regionRestriction.allowed,contentRating.ytRating,viewCount,likeCount,commentCount
0,0MQEf_7qk4s,2024-12-03T17:01:38Z,UC0xHrJNbxHmFnUP-49WYM-Q,120 mind blowing AI tools #productivity #aitoo...,,SetupsAI,22,,,,PT8S,2d,hd,False,True,,,,253586,14833,81
1,RZdYlS6zvhY,2024-12-26T04:53:04Z,UC8w4I8t2OpqoOpzzNT1c2dg,PAID vs FREE AI Tools - Best Free AI Tools,PAID vs FREE AI Tools - Best Free AI Tools\nAr...,WebbyFan,26,"[ai, ai tools, best ai tools, new ai tools, be...",en-IN,en,PT12S,2d,hd,False,True,,,,35260,247,6
2,GRlarIhf1Rw,2024-08-17T12:24:50Z,UC0xHrJNbxHmFnUP-49WYM-Q,Top 10 designer AI tools #ai #productivity #de...,,SetupsAI,22,,,,PT18S,2d,hd,False,True,,,,703645,31338,85
3,MOCPCwGm3Dw,2025-03-20T13:00:25Z,UC7geKfz2-IH0rsgRBtHTm0g,3 Best AI tools for market research,Top 3 AI Market Research Tools: Automate Your ...,Learn With Shopify,27,"[AI tools, market research tools, artificial i...",en-US,en-US,PT34S,2d,hd,False,False,,,,19015,710,5
4,ObMnw8woAog,2023-05-18T16:08:40Z,UCPkctgt1mTeJWTGj4tq4dPQ,Best 12 AI Tools in 2023,,LKLogic,27,,,,PT36S,2d,hd,False,True,,,,1985707,122736,440


In [44]:
df['caption'].value_counts()

caption
false    6349
true      971
Name: count, dtype: int64

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7320 entries, 0 to 7319
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   id                         7320 non-null   object
 1   publishedAt                7320 non-null   object
 2   channelId                  7320 non-null   object
 3   title                      7320 non-null   object
 4   description                7320 non-null   object
 5   channelTitle               7320 non-null   object
 6   categoryId                 7320 non-null   object
 7   tags                       4855 non-null   object
 8   defaultLanguage            2547 non-null   object
 9   defaultAudioLanguage       5465 non-null   object
 10  duration                   7314 non-null   object
 11  dimension                  7320 non-null   object
 12  definition                 7320 non-null   object
 13  caption                    7320 non-null   object
 14  licensed

After reviewing the data, I noticed several issues that need to be addressed:

**1. Incorrect Data Types:**

* `publishedAt` should be converted from `object` to `datetime`.
* `duration` should be converted into a time format for easier processing and analysis.
* `caption` should be a boolean (`True`/`False`) instead of a string.
* `viewCount`, `likeCount`, and `commentCount` should all be stored as integers, not strings.

**2. Missing Values:**

* Some columns contain null values and will need to be handled accordingly. <br><br>



As a first step, I will correct the data types to ensure the dataset is consistent and ready for further analysis:

* Convert `publishedAt` to `datetime`.
* Convert `duration` into a proper time format. (1h = 60 minutes, 30s = 0.5 minutes,...)
* Transform `caption` into a boolean value.
* Cast `viewCount`, `likeCount`, and `commentCount` as integers.

In [46]:
#change publishedAt to datetime
df['publishedAt'] = pd.to_datetime(df['publishedAt'], errors='coerce')

#change caption to boolean
df['caption'] = df['caption'].apply(lambda x: True if x == 'true' else False)

#cast viewCount, likeCount, commentCount to int
df['viewCount'] = pd.to_numeric(df['viewCount'], errors='coerce')
df['likeCount'] = pd.to_numeric(df['likeCount'], errors='coerce')
df['commentCount'] = pd.to_numeric(df['commentCount'], errors='coerce')

In [47]:
#change duration to minutes
import isodate as i
df['duration_minutes'] = df['duration'].apply(lambda x: round(i.parse_duration(x).total_seconds() / 60 if pd.notnull(x) else 0, 3))
df.drop(columns=['duration'], inplace=True)

In [48]:
df.head(10)

Unnamed: 0,id,publishedAt,channelId,title,description,channelTitle,categoryId,tags,defaultLanguage,defaultAudioLanguage,dimension,definition,caption,licensedContent,regionRestriction.blocked,regionRestriction.allowed,contentRating.ytRating,viewCount,likeCount,commentCount,duration_minutes
0,0MQEf_7qk4s,2024-12-03 17:01:38+00:00,UC0xHrJNbxHmFnUP-49WYM-Q,120 mind blowing AI tools #productivity #aitoo...,,SetupsAI,22,,,,2d,hd,False,True,,,,253586.0,14833.0,81.0,0.133
1,RZdYlS6zvhY,2024-12-26 04:53:04+00:00,UC8w4I8t2OpqoOpzzNT1c2dg,PAID vs FREE AI Tools - Best Free AI Tools,PAID vs FREE AI Tools - Best Free AI Tools\nAr...,WebbyFan,26,"[ai, ai tools, best ai tools, new ai tools, be...",en-IN,en,2d,hd,False,True,,,,35260.0,247.0,6.0,0.2
2,GRlarIhf1Rw,2024-08-17 12:24:50+00:00,UC0xHrJNbxHmFnUP-49WYM-Q,Top 10 designer AI tools #ai #productivity #de...,,SetupsAI,22,,,,2d,hd,False,True,,,,703645.0,31338.0,85.0,0.3
3,MOCPCwGm3Dw,2025-03-20 13:00:25+00:00,UC7geKfz2-IH0rsgRBtHTm0g,3 Best AI tools for market research,Top 3 AI Market Research Tools: Automate Your ...,Learn With Shopify,27,"[AI tools, market research tools, artificial i...",en-US,en-US,2d,hd,False,False,,,,19015.0,710.0,5.0,0.567
4,ObMnw8woAog,2023-05-18 16:08:40+00:00,UCPkctgt1mTeJWTGj4tq4dPQ,Best 12 AI Tools in 2023,,LKLogic,27,,,,2d,hd,False,True,,,,1985707.0,122736.0,440.0,0.6
5,fvxyj34edJU,2024-12-04 23:51:24+00:00,UCyVbhVGzfL9tW9Xc1FkYsbg,Best PAID AI Tools Free Alternatives 2025 | #A...,Title:\nBest PAID AI Tools Free Alternatives 2...,PC Skills Pro,27,"[Free AI for increasing work efficiency 2025, ...",,,2d,hd,False,False,,,,334280.0,7493.0,61.0,0.117
6,3-UzoSScXyE,2025-03-06 04:40:48+00:00,UCHNWKXRh_O3oX5ew47lW44Q,Best Free AI Tools for Research Papers & Essay...,📝 Struggling with writing research papers or e...,Alamin,27,,,,2d,hd,False,False,,,,60165.0,1455.0,33.0,0.6
7,TZe5UqlUg0c,2025-03-05 08:00:32+00:00,UCfJT_eYDTmDE-ovKaxVE1ig,7 Best AI Tools You NEED to Try in 2025 (Free ...,Looking for the best AI tools to boost product...,Kevin Stratvert,28,"[kevin stratvert, tools, best ai tools, free a...",en,en,2d,hd,True,True,,,,213090.0,4678.0,106.0,7.817
8,NQjYMvkDkrE,2025-05-12 12:52:12+00:00,UCXGT7mz2PeIOsMyMpwe5nhQ,how to restore damaged photos using the ChatGP...,Want to get back your photo from the damaged p...,Tech tips and tricks,28,"[shorts, aishorts, aitoolsdemo, aitools2025, a...",en,en,2d,hd,False,False,,,,613.0,6.0,0.0,0.433
9,eFy8tepEByw,2025-03-14 11:30:45+00:00,UCY6N8zZhs2V7gNTUxPuKWoQ,BEST AI TOOLS TO USE IN 2025!🔥,📸 Instagram: https://bit.ly/ishansharma7390ig\...,Ishan Sharma,24,"[ai tools, ai tools to save time, ai tools for...",,en-IN,2d,hd,False,True,,,,4802595.0,205401.0,203.0,1.133


With the data types corrected, we will now adressing missing values. We Will start with the categorical columns (bool, object,...)

In [49]:
categorical_columns = []
for col in df.select_dtypes(include=['object','bool']).columns:
    categorical_columns.append(col)
categorical_columns

['id',
 'channelId',
 'title',
 'description',
 'channelTitle',
 'categoryId',
 'tags',
 'defaultLanguage',
 'defaultAudioLanguage',
 'dimension',
 'definition',
 'caption',
 'licensedContent',
 'regionRestriction.blocked',
 'regionRestriction.allowed',
 'contentRating.ytRating']

In [50]:
#Tính và vẽ chart plotly % null của các cột, sắp xêp theo tỷ lệ null giảm dần
import plotly.express as px

null_counts = df[categorical_columns].isnull().mean().sort_values(ascending=False)
fig = px.bar(null_counts*100, 
             title='Tỷ lệ Null của các cột trong DataFrame', 
             labels={'index': 'Cột', 'value': 'Tỷ lệ Null'},
             text=round(null_counts * 100, 2))
fig.update_layout(xaxis_title='Cột', yaxis_title='Tỷ lệ Null (%)')
fig.show()

In [51]:
#Drop các cột có tỷ lệ null > 75%
threshold = 0.75
columns_to_drop = null_counts[null_counts > threshold].index.tolist()
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

In [52]:
def detect_language_safe(title):
    try:
        return detect(str(title))
    except:
        return "unknown"

df['defaultLanguage'] = df['defaultLanguage'].fillna(df['description'].apply(detect_language_safe))

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7320 entries, 0 to 7319
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   id                    7320 non-null   object             
 1   publishedAt           7320 non-null   datetime64[ns, UTC]
 2   channelId             7320 non-null   object             
 3   title                 7320 non-null   object             
 4   description           7320 non-null   object             
 5   channelTitle          7320 non-null   object             
 6   categoryId            7320 non-null   object             
 7   tags                  4855 non-null   object             
 8   defaultLanguage       7320 non-null   object             
 9   defaultAudioLanguage  5465 non-null   object             
 10  dimension             7320 non-null   object             
 11  definition            7320 non-null   object             
 12  captio

In [54]:
df.fillna({'defaultAudioLanguage': 'None',  'tags': 'None'}, inplace=True)

In [55]:
#drop rows where duration is null
df.dropna(subset=['duration_minutes'], inplace=True)

In [56]:
numberic_columns = df.select_dtypes(exclude=['bool','object']).columns.tolist()

#Calculate NA percent and draw plotly bar chart
na_percent = df[numberic_columns].isnull().mean().sort_values(ascending=False)
fig = px.bar(na_percent * 100,
                title='Tỷ lệ Null của các cột số trong DataFrame', 
                labels={'index': 'Cột', 'value': 'Tỷ lệ Null'},
                text=round(na_percent * 100, 2))
fig.update_layout(xaxis_title='Cột', yaxis_title='Tỷ lệ Null (%)')
fig.show()


In [57]:
df[df['duration_minutes'] == 0]

Unnamed: 0,id,publishedAt,channelId,title,description,channelTitle,categoryId,tags,defaultLanguage,defaultAudioLanguage,dimension,definition,caption,licensedContent,viewCount,likeCount,commentCount,duration_minutes
1973,zce9XNAdaNY,2025-05-11 21:05:29+00:00,UChELZ_JMGNYuxObfrXoER6A,"Foundation Principles of Generative AI, Part 5","In this episode, you will explore a foundation...",Christopher Penn,27,"[Generative AI, AI Principles, How AI Works, C...",en,en,2d,hd,False,False,0.0,0.0,0.0,0.0
1979,PR0tQ_NstH4,2025-05-11 21:05:13+00:00,UChELZ_JMGNYuxObfrXoER6A,"Foundation Principles of Generative AI, Part 6","In this episode, Christopher Penn reveals a fu...",Christopher Penn,27,"[generative AI, AI prompting, prompting, deleg...",en,en,2d,hd,False,False,0.0,0.0,1.0,0.0
2005,Zzu3qDY-Tog,2025-05-11 21:06:16+00:00,UChELZ_JMGNYuxObfrXoER6A,"Foundation Principles of Generative AI, Part 4","In this episode, Christopher Penn discusses th...",Christopher Penn,27,"[generative AI, AI prompts, prompt length, pro...",en,en,2d,hd,False,False,0.0,0.0,1.0,0.0
2007,lTMeZVzKvsc,2025-05-11 21:04:17+00:00,UChELZ_JMGNYuxObfrXoER6A,"Foundation Principles of Generative AI, Part 9","In this episode, discover the tenth foundation...",Christopher Penn,27,"[AI, Generative AI, AI Models, Artificial Inte...",en,en,2d,hd,False,False,0.0,0.0,0.0,0.0
2049,1LHPrXRT07k,2025-05-01 09:36:12+00:00,UCEcRPnqQeCLyAxpr_R8bygw,كل طرق تعلم الذكاء الاصطناعي: من ML لـ Generat...,هل تعلم إنو الذكاء الاصطناعي بلّش رسميًا سنة 1...,Lara Wehbe,28,"[machine learning, deep learning, reinforcemen...",ar,ar,2d,hd,False,False,0.0,1.0,0.0,0.0
3447,1oDrJba2PSs,2025-01-09 11:48:26+00:00,UCSJ4gkVC6NrvII8umztf0Ow,Study With Me 📚 Pomodoro,"🎼 | Listen on Spotify, Apple music and more\n→...",Lofi Girl,10,"[pomodoro, pomodoro technique, study with me, ...",en,,2d,sd,False,True,2729017.0,23848.0,0.0,0.0
3449,Na0w3Mz46GA,2024-06-02 16:19:22+00:00,UCSJ4gkVC6NrvII8umztf0Ow,asian lofi radio ⛩️ beats to relax/study to,"🎼 | Listen on Spotify, Apple music and more\n→...",Lofi Girl,10,"[chilledcow, chilled cow, lofi, lofi hiphop, l...",en,,2d,sd,False,True,11561564.0,97048.0,0.0,0.0
4701,qd-_03GEjCE,2025-05-12 11:24:35+00:00,UCY-edAci2pVBEsFHSyl_lRg,AI Tools for Everyday Work | Boost Productivit...,@educationnestofficial \nAI Tools for Everyday...,Education Nest,27,,en,,2d,sd,False,True,0.0,0.0,0.0,0.0
5694,8EyMUl-zvVE,2025-04-14 02:51:14+00:00,UCbfENehrv6SRfQLRNZrypcg,Claude AI Explained | The Anthropic Chatbot Ri...,"Welcome to our channel! In this video, we take...",Professor Rahul Jain,27,"[Claude AI, Anthropic Claude, AI chatbot, Chat...",en,,2d,hd,False,False,0.0,0.0,0.0,0.0


In [58]:
#drop rows where duration_minutes is 0
df = df[df['duration_minutes'] > 0]

In [59]:
df['likeCount'] = df['likeCount'].fillna(0).astype(int)
df['commentCount'] = df['commentCount'].fillna(0).astype(int)
df['viewCount'] = df['viewCount'].fillna(0).astype(int)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7311 entries, 0 to 7319
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   id                    7311 non-null   object             
 1   publishedAt           7311 non-null   datetime64[ns, UTC]
 2   channelId             7311 non-null   object             
 3   title                 7311 non-null   object             
 4   description           7311 non-null   object             
 5   channelTitle          7311 non-null   object             
 6   categoryId            7311 non-null   object             
 7   tags                  7311 non-null   object             
 8   defaultLanguage       7311 non-null   object             
 9   defaultAudioLanguage  7311 non-null   object             
 10  dimension             7311 non-null   object             
 11  definition            7311 non-null   object             
 12  caption    

## Kiểm tra phân bố
### Kiểm tra phân vị, histogram với dữ liệu số


In [61]:
# Hiển thị toàn bộ số thập phân (ví dụ: 657015.900 instead of 6.570159e+05)
pd.set_option('display.float_format', '{:.6f}'.format)
df.describe()


Unnamed: 0,viewCount,likeCount,commentCount,duration_minutes
count,7311.0,7311.0,7311.0,7311.0
mean,655870.033511,16938.565996,385.260703,17.148914
std,3665125.179642,97576.375987,1802.174724,82.733213
min,0.0,0.0,0.0,0.033
25%,3411.0,49.0,2.0,0.7
50%,29085.0,630.0,26.0,2.667
75%,183132.0,4196.5,149.0,12.9085
max,104343035.0,2795506.0,45002.0,3948.783


In [62]:
df[df['duration_minutes'] < 1]

Unnamed: 0,id,publishedAt,channelId,title,description,channelTitle,categoryId,tags,defaultLanguage,defaultAudioLanguage,dimension,definition,caption,licensedContent,viewCount,likeCount,commentCount,duration_minutes
0,0MQEf_7qk4s,2024-12-03 17:01:38+00:00,UC0xHrJNbxHmFnUP-49WYM-Q,120 mind blowing AI tools #productivity #aitoo...,,SetupsAI,22,,unknown,,2d,hd,False,True,253586,14833,81,0.133000
1,RZdYlS6zvhY,2024-12-26 04:53:04+00:00,UC8w4I8t2OpqoOpzzNT1c2dg,PAID vs FREE AI Tools - Best Free AI Tools,PAID vs FREE AI Tools - Best Free AI Tools\nAr...,WebbyFan,26,"[ai, ai tools, best ai tools, new ai tools, be...",en-IN,en,2d,hd,False,True,35260,247,6,0.200000
2,GRlarIhf1Rw,2024-08-17 12:24:50+00:00,UC0xHrJNbxHmFnUP-49WYM-Q,Top 10 designer AI tools #ai #productivity #de...,,SetupsAI,22,,unknown,,2d,hd,False,True,703645,31338,85,0.300000
3,MOCPCwGm3Dw,2025-03-20 13:00:25+00:00,UC7geKfz2-IH0rsgRBtHTm0g,3 Best AI tools for market research,Top 3 AI Market Research Tools: Automate Your ...,Learn With Shopify,27,"[AI tools, market research tools, artificial i...",en-US,en-US,2d,hd,False,False,19015,710,5,0.567000
4,ObMnw8woAog,2023-05-18 16:08:40+00:00,UCPkctgt1mTeJWTGj4tq4dPQ,Best 12 AI Tools in 2023,,LKLogic,27,,unknown,,2d,hd,False,True,1985707,122736,440,0.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7312,IXhmwcMIDCE,2023-05-20 04:48:53+00:00,UCg2Pq_61OYOUjbq8fTA1t3A,Trend mới chỉ dành cho ai có bồ 🤣 #shorts #ti...,"✔ xin chào, Việt Phương Thoa đâyyyyyy \n\n⨳ T...",Việt Phương Thoa,22,"[việt phương thoa, viet phuong thoa, tik tok, ...",vi,vi,2d,hd,False,True,2771323,50432,916,0.200000
7313,DxEQ80o_oew,2024-10-20 05:55:58+00:00,UCCrDVNw6IGhs7Av77W4O7Cg,AI LÀ NGƯỜI GÂY ẤN TƯỢNG NHẤT VỚI BẠN #xuhuong...,WHO IMPRESSES YOU #xuhuong #dance #nhảytrend #...,TikTok Việt Nam Official,24,"[tiktok, tiktok gái xinh, tiktok việt nam, trà...",vi,,2d,hd,False,False,12287866,278649,601,0.217000
7314,61UfcPIlzZs,2023-10-25 13:54:30+00:00,UCnN0pW_aaM3uaRTv5Y2iDpw,Engineering branch which have better scope or ...,,PG Clips,22,,unknown,,2d,hd,False,False,146184,4287,60,0.967000
7317,CkNjj5nJbr8,2024-10-16 12:00:01+00:00,UCCrDVNw6IGhs7Av77W4O7Cg,AI LÀ NGƯỜI GÂY ẤN TƯỢNG MẠNH VỚI BẠN? #xuhuon...,WHICH CHARACTER IMPRESSES YOU? #xuhuong #dance...,TikTok Việt Nam Official,24,"[tiktok, tiktok gái xinh, tiktok việt nam, trà...",vi,,2d,hd,False,False,1292540,42376,424,0.283000


In [63]:
len(df[df['duration_minutes'] < 1])

2927

In [64]:
categorical_columns = df.select_dtypes(include=['object', 'bool']).columns.tolist()
string_columns = ['title', 'description', 'tags','id', 'channelId']
# Loại bỏ các cột không phải là categorical
categorical_columns = [col for col in categorical_columns if col not in string_columns]

#Vẽ histogram cho các cột categorical
for col in categorical_columns:
        fig = px.histogram(df, x=col, title=f'Histogram of {col}')
        fig.show()

Since 'dimensions', 'definition' does not contains a lot of information, and we dont focus on analyze the image quality of video, we can drop it

In [66]:
#drop 'dimensions', 'definition'
df.drop(columns=['dimension', 'definition'], inplace=True)

In [67]:
# Bước 1: Tạo dictionary ánh xạ từ ID sang tên thể loại
category_mapping = {
    '1': "Film & Animation",
    '2': "Autos & Vehicles",
    '10': "Music",
    '15': "Pets & Animals",
    '17': "Sports",
    '19': "Travel & Events",
    '20': "Gaming",
    '22': "People & Blogs",
    '23': "Comedy",
    '24': "Entertainment",
    '25': "News & Politics",
    '26': "Howto & Style",
    '27': "Education",
    '28': "Science & Technology",
    '29': "Nonprofits & Activism"
}

# Bước 2: Thêm cột mới 'categoryName' dựa trên 'categoryId'
df['categoryName'] = df['categoryId'].map(category_mapping)

In [68]:
#add new column 'crawl_date' with '2025-05-14', date time
df['crawl_date'] = pd.to_datetime('2025-05-14')

In [69]:
df.head(5)

Unnamed: 0,id,publishedAt,channelId,title,description,channelTitle,categoryId,tags,defaultLanguage,defaultAudioLanguage,caption,licensedContent,viewCount,likeCount,commentCount,duration_minutes,categoryName,crawl_date
0,0MQEf_7qk4s,2024-12-03 17:01:38+00:00,UC0xHrJNbxHmFnUP-49WYM-Q,120 mind blowing AI tools #productivity #aitoo...,,SetupsAI,22,,unknown,,False,True,253586,14833,81,0.133,People & Blogs,2025-05-14
1,RZdYlS6zvhY,2024-12-26 04:53:04+00:00,UC8w4I8t2OpqoOpzzNT1c2dg,PAID vs FREE AI Tools - Best Free AI Tools,PAID vs FREE AI Tools - Best Free AI Tools\nAr...,WebbyFan,26,"[ai, ai tools, best ai tools, new ai tools, be...",en-IN,en,False,True,35260,247,6,0.2,Howto & Style,2025-05-14
2,GRlarIhf1Rw,2024-08-17 12:24:50+00:00,UC0xHrJNbxHmFnUP-49WYM-Q,Top 10 designer AI tools #ai #productivity #de...,,SetupsAI,22,,unknown,,False,True,703645,31338,85,0.3,People & Blogs,2025-05-14
3,MOCPCwGm3Dw,2025-03-20 13:00:25+00:00,UC7geKfz2-IH0rsgRBtHTm0g,3 Best AI tools for market research,Top 3 AI Market Research Tools: Automate Your ...,Learn With Shopify,27,"[AI tools, market research tools, artificial i...",en-US,en-US,False,False,19015,710,5,0.567,Education,2025-05-14
4,ObMnw8woAog,2023-05-18 16:08:40+00:00,UCPkctgt1mTeJWTGj4tq4dPQ,Best 12 AI Tools in 2023,,LKLogic,27,,unknown,,False,True,1985707,122736,440,0.6,Education,2025-05-14


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7311 entries, 0 to 7319
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   id                    7311 non-null   object             
 1   publishedAt           7311 non-null   datetime64[ns, UTC]
 2   channelId             7311 non-null   object             
 3   title                 7311 non-null   object             
 4   description           7311 non-null   object             
 5   channelTitle          7311 non-null   object             
 6   categoryId            7311 non-null   object             
 7   tags                  7311 non-null   object             
 8   defaultLanguage       7311 non-null   object             
 9   defaultAudioLanguage  7311 non-null   object             
 10  caption               7311 non-null   bool               
 11  licensedContent       7311 non-null   bool               
 12  viewCount  

In [71]:
for col in df.columns:
    if df[col].apply(lambda x: isinstance(x, list)).any():
        df[col] = df[col].apply(lambda x: '; '.join(x) if isinstance(x, list) else x)

In [76]:
#create dataset '2_cleaned_data' and add ad as table 'video_basic_info'
today = pd.Timestamp.now().strftime('%Y%m%d')
dataset_id = "2_cleaned_data"
table_id = f"video_basic_info"

# Create dataset
full_dataset_id = f"{project_id}.{dataset_id}"
client = bigquery.Client(project=project_id)
dataset = bigquery.Dataset(full_dataset_id)
dataset.location = "asia-southeast1"

try:
    client.create_dataset(dataset, timeout=30)
    print(f"✅ Dataset '{full_dataset_id}' created.")
except Exception as e:
    print(f"⚠️ Failed to create dataset: {e}")


Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. 



⚠️ Failed to create dataset: 409 POST https://bigquery.googleapis.com/bigquery/v2/projects/ytbdataanalyst/datasets?prettyPrint=false: Already Exists: Dataset ytbdataanalyst:2_cleaned_data


In [77]:
# Load config
table_ref = f"{project_id}.{dataset_id}.{table_id}"

job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,  # Use WRITE_TRUNCATE to overwrite
    autodetect=True  # Let BigQuery detect schema from DataFrame
)

# Upload the DataFrame
job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
job.result()  # Wait for the job to complete

print(f"✅ Data uploaded to BigQuery table: {table_id}")


Loading pandas DataFrame into BigQuery will require pandas-gbq package version 0.26.1 or greater in the future. Tried to import pandas-gbq and got: No module named 'pandas_gbq'



✅ Data uploaded to BigQuery table: video_basic_info
